├── README
├── bin
│   ├── orthomclLoadBlast.sql
│   ├── orthomclMclToGroups
│   ├── orthomclAdjustFasta
│   ├── orthomclInstallSchema.sql
│   ├── orthomclLoadBlast
│   ├── orthomclFilterFasta
│   ├── orthomclDumpPairsFiles
│   ├── orthomclInstallSchema
│   ├── orthomclBlastParser
│   └── orthomclPairs
├── doc
│   └── OrthoMCLEngine
│       └── Main
│           ├── oracleConfigurationGuide.txt
│           ├── orthomcl.config.template
│           ├── mysql.cnf
│           ├── mysqlConfigurationGuide.txt
│           ├── mysqlInstallGuide.txt
│           └── UserGuide.txt
├── config
│   ├── .build.info
│   └── gus.config
├── SoftwareLicense.txt
└── lib
    └── perl
        └── OrthoMCLEngine
            └── Main
                └── Base.pm
/README:
--------------------------------------------------------------------------------
1 | This is the OrthoMCL software. See http://www.orthomcl.org for the official version.
2 |
--------------------------------------------------------------------------------
/bin/orthomclLoadBlast.sql:
--------------------------------------------------------------------------------
-- Bulk-load a tab-delimited similar-sequences file into the
-- orthomcl.SimilarSequences table.
--
-- Edit the INFILE path for your installation before running this script.
-- NOTE(review): the mysql client is not a shell, so the leading '~' is
-- presumably not expanded; an absolute path is the safe choice -- confirm.
--
-- The path is a single-quoted string literal so that the statement also
-- works when the server runs with the ANSI_QUOTES sql_mode.
USE orthomcl;

LOAD DATA
LOCAL INFILE '~/orthoPackage/data/orthomcl_all_3.lst'
REPLACE INTO TABLE orthomcl.SimilarSequences
FIELDS TERMINATED BY '\t';
7 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/oracleConfigurationGuide.txt:
--------------------------------------------------------------------------------
1 | Whatever tablespace is used for the orthomcl data may need to be very large.
2 |
3 | A good estimate is 5x the size of the file produced by the orthomclBlastParser program.
4 |
5 | If the DBA desires indexes to be in a separate tablespace, use the oracleIndexTblSpc property in the orthomcl.config file
6 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/orthomcl.config.template:
--------------------------------------------------------------------------------
1 | # this config assumes a mysql database named 'orthomcl'. adjust according
2 | # to your situation.
3 | dbVendor=mysql
4 | dbConnectString=dbi:mysql:orthomcl:3307
5 | dbLogin=
6 | dbPassword=
7 | similarSequencesTable=SimilarSequences
8 | orthologTable=Ortholog
9 | inParalogTable=InParalog
10 | coOrthologTable=CoOrtholog
11 | interTaxonMatchView=InterTaxonMatch
12 | percentMatchCutoff=50
13 | evalueExponentCutoff=-5
14 | oracleIndexTblSpc=NONE
--------------------------------------------------------------------------------
/config/.build.info:
--------------------------------------------------------------------------------
1 | #Build Information
2 | #Wed Jan 27 11:41:13 EST 2010
3 | OrthoMCLEngine.svn.status=
4 | \!Last.build=OrthoMCLEngine @ 2010/01/27 11\:41\:13
5 | \!Last.build.component=OrthoMCLEngine
6 | OrthoMCLEngine.svn.info=URL\: https\://www.cbil.upenn.edu/svn/apidb/OrthoMCLEngine/trunk\nRevision\: 33071\nLast Changed Rev\: 32454\nLast Changed Date\: 2009-12-04 13\:29\:53 -0500 (Fri, 04 Dec 2009)
7 | \!Last.build.initialTarget=install
8 | \!Last.build.timestamp=2010/01/27 11\:41\:13
9 | OrthoMCLEngine.Main.buildtime=2010/01/27 11\:41\:13
10 |
--------------------------------------------------------------------------------
/SoftwareLicense.txt:
--------------------------------------------------------------------------------
1 | The OrthoMCL Software is Copyright 2010 by the EuPathDB Bioinformatics Resource Center.
2 |
3 | This program is free software: you can redistribute it and/or modify
4 | it under the terms of the GNU General Public License as published by
5 | the Free Software Foundation, either version 3 of the License, or
6 | (at your option) any later version.
7 |
8 | This program is distributed in the hope that it will be useful,
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | GNU General Public License for more details.
12 |
13 | You should have received a copy of the GNU General Public License
14 | along with this program. If not, see http://www.gnu.org/licenses/.
15 |
--------------------------------------------------------------------------------
/bin/orthomclMclToGroups:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
# Main program: read an mcl output file (label mode) on standard input and
# write an orthomcl groups file on standard output.  Every input line is one
# group; it is echoed with a generated group ID ("<prefix><number>: ") in
# front of it, the number increasing by one per line.
my ($prefix, $startId) = @ARGV;

# Both arguments are required.  The match is anchored so that only a whole
# number is accepted as the starting ID (a value such as "abc1" merely
# contains digits and would otherwise slip through).
usage() unless ($prefix && defined($startId) && $startId =~ /^\d+$/);

# The two command-line arguments are still in @ARGV, so the data must be
# read from STDIN explicitly rather than through the magic <> handle.
while (<STDIN>) {
    s/\t/ /g;    # group members are separated by spaces in a groups file
    print "$prefix$startId: $_";
    $startId++;
}
13 |
# Print the command-line help for orthomclMclToGroups on standard output and
# exit with status 1.  Takes no arguments and does not return.
sub usage {
    print "
orthomclMclToGroups prefix starting_id_num

create an orthomcl groups file from an mcl output file. just generate a group ID for each group, and prepend it to that group's line.

where:
  prefix           a prefix to use when generating group ids. For example OG2_
  starting_id_num  a number to start the id generating with. For example 1000

std input: mcl output file (label mode)
std output: orthomcl groups file

an orthomcl group file has one line per group and looks like this:

OG2_1009: osa|ENS1222992 pfa|PF11_0844


";
    exit(1);
}
35 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/mysql.cnf:
--------------------------------------------------------------------------------
1 | [client]
2 |
3 | socket=/tmp/mysql_2.sock
4 |
5 | port=3307
6 |
7 | ####################################
8 |
9 | [mysqld]
10 |
11 | #REQUIRED!!
12 | #Change the basedir directory to reflect your mysql home directory
13 | basedir=your_mysql_dir
14 |
15 | #REQUIRED!!
16 | #Change the data directory to reflect your mysql data directory
17 | datadir=your_mysql_dir/data
18 |
19 | port=3307
20 |
21 | socket=/tmp/mysql_2.sock
22 |
23 | key_buffer_size=64M
24 |
25 | #[OPTIMIZATION]
26 | #Set this value to 50% of available RAM if your environment permits.
27 | myisam_sort_buffer_size=4G
28 |
29 | #[OPTIMIZATION]
30 | #This value should be at least 50% of free hard drive space. Use caution if setting it to 100% of free space however. Your hard disk may fill up!
31 | myisam_max_sort_file_size=200G
32 |
33 | #[OPTIMIZATION]
34 | #Our default of 2G is probably fine for this value. Change this value only if you are using a machine with little resources available.
35 | read_buffer_size=2G
36 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/mysqlConfigurationGuide.txt:
--------------------------------------------------------------------------------
1 | There are three key configuration properties that must be set so MySQL can handle the size of data you are using.
2 |
3 | They are:
4 | myisam_max_sort_file_size
5 | myisam_sort_buffer_size
6 | read_buffer_size
7 |
8 | * If you are using an existing MySQL database, login to it and run these commands to see the current values of these properties:
9 |
10 | mysql> show variables LIKE 'myisam_max_sort_file_size';
11 |
12 | mysql> show variables LIKE 'myisam_sort_buffer_size';
13 |
14 | mysql> show variables LIKE 'read_buffer_size';
15 |
16 |
17 | NOTE: These values will display in bytes, i.e. a value of
18 | 2147479552 for read_buffer_size is equivalent to 2 Gigabytes.
19 |
20 | If these values are less than what you will need, as described below, contact your mysql administrator (provide this file for reference).
21 |
22 |
23 | * If you are installing your own mysql, edit these properties in the mysql.cnf file.
24 |
25 |
26 | * Suggested settings
27 |
28 | a) set myisam_sort_buffer_size= to 50% of available ram
29 | - to find out how much ram you have:
30 | dmesg | grep Memory
31 | - it gives you a report in kilobytes. divide by 1000000 to get a number of Gs.
32 |
33 | b) set myisam_max_sort_file_size= to 5 x the size of the file made by orthomclBlastParser.
34 | - (revisit this after you have run orthomclBlastParser)
35 |
36 | c) set read_buffer_size= to 2G (the default in the supplied mysql.cnf); lower it only if your machine has limited memory.
37 |
--------------------------------------------------------------------------------
/lib/perl/OrthoMCLEngine/Main/Base.pm:
--------------------------------------------------------------------------------
1 | package OrthoMCLEngine::Main::Base;
2 |
3 | use strict;
4 | use DBI;
5 |
# Constructor.  Creates the object and immediately loads its configuration.
#
#   $configFile - path of the key=value configuration file
#   $loghandle  - optional filehandle, handed on to parseConfigFile
#
# Returns the new object; dies (inside parseConfigFile) when the
# configuration cannot be read.
sub new {
    my ($class, $configFile, $loghandle) = @_;

    my $self = bless({}, $class);
    $self->parseConfigFile($configFile, $loghandle);
    return $self;
}
14 |
# Read a key=value configuration file into $self->{config} and remember the
# file name in $self->{configFile}.
#
#   $configFile - path of the configuration file
#   $loghandle  - optional filehandle; when given, each property read is
#                 echoed to it with a timestamp (the dbPassword value is
#                 masked)
#
# Lines starting with '#' and blank lines are skipped.  Every other line must
# have the form key=value; the value may be empty, as in "dbLogin=".
# Trailing whitespace is removed from each line.  Dies when the file cannot
# be opened or a line is malformed.
sub parseConfigFile {
    my ($self, $configFile, $loghandle) = @_;

    open(my $fh, '<', $configFile)
        || die "Can't open config file '$configFile'\n";

    $self->{configFile} = $configFile;
    while (my $line = <$fh>) {
        chomp $line;
        $line =~ s/\s+$//;
        next if $line eq '';        # a blank line is not an error
        next if $line =~ /^\#/;
        $line =~ /^(\w+)\=(.*)/ || die "illegal line in config file '$line'\n";
        my ($key, $val) = ($1, $2);
        $self->{config}->{$key} = $val;
        if ($loghandle) {
            # never write the real password to the log
            $val = '********' if $key eq 'dbPassword';
            print $loghandle localtime() . " configuration: $key=$val\n";
        }
    }
    close($fh);
}
35 |
# Return the value of configuration property $prop.
#
# Dies when the property is not present in the configuration.  The test is
# for definedness rather than truth, so that legitimate values such as "0"
# (for example percentMatchCutoff=0) are returned instead of being reported
# as missing.
sub getConfig {
    my ($self, $prop) = @_;
    die "can't find property $prop in config file"
        unless defined $self->{config}->{$prop};
    return $self->{config}->{$prop};
}
41 |
42 |
# Return the database handle, connecting on first use.  The handle is cached
# in $self->{dbh}, so later calls return the same connection.
#
# The dbVendor property selects the DBD driver to load ('oracle' or 'mysql');
# any other value is fatal.  The connection is made with the dbConnectString,
# dbLogin and dbPassword properties.  Dies with the DBI error text when the
# connection cannot be established.
sub getDbh {
    my ($self) = @_;

    return $self->{dbh} if $self->{dbh};

    my $dbVendor = $self->getConfig("dbVendor");
    if ($dbVendor eq 'oracle') {
        require DBD::Oracle;
    } elsif ($dbVendor eq 'mysql') {
        require DBD::mysql;
    } else {
        die "config file '$self->{configFile}' has invalid value '$dbVendor' for dbVendor property\n";
    }

    # No handle exists when connect fails, so the reason is taken from the
    # package variable $DBI::errstr.
    $self->{dbh} = DBI->connect($self->getConfig("dbConnectString"),
                                $self->getConfig("dbLogin"),
                                $self->getConfig("dbPassword"))
        or die $DBI::errstr;
    return $self->{dbh};
}
62 | 1;
63 |
--------------------------------------------------------------------------------
/bin/orthomclAdjustFasta:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use strict;
4 |
# Main program: copy a fasta file to <taxon_code>.fasta, rewriting every
# definition line to ">taxon_code|protein_id".  The protein ID is taken from
# field number id_field of the original definition line; sequence lines are
# copied unchanged.
usage() unless scalar(@ARGV) == 3;

my ($taxoncode, $inputfile, $idField) = @ARGV;

# id_field is a 1-based field number.  Anything else (0, a negative number,
# text) would silently select the wrong field, so it is rejected up front.
usage() unless $idField =~ /^[1-9]\d*$/;

open(my $in, '<', $inputfile) || die "Can't open input file '$inputfile'\n";
open(my $out, '>', "$taxoncode.fasta") || die "Can't open output file '$taxoncode.fasta'\n";

while (my $line = <$in>) {
    if ($line =~ /^\>/) {
        # Normalise the definition line: drop the '>' and any spaces after
        # it, squeeze whitespace, and remove blanks around '|' so that the
        # fields split cleanly on either separator.
        $line =~ s/^\>\s*//;
        $line =~ s/\s+/ /g;
        $line =~ s/\s*\|\s*/\|/g;
        my @fields = split(/[\s\|]/, $line);
        my $id = $fields[$idField - 1];
        die "Definition line $. of '$inputfile' has no field $idField\n"
            unless defined($id) && length($id);
        print $out ">$taxoncode|$id\n";
    } else {
        print $out $line;
    }
}

close($in);
# buffered write errors only show up when the output file is closed
close($out) || die "Error writing output file '$taxoncode.fasta': $!\n";
25 |
26 |
27 |
# Show the help text for orthomclAdjustFasta on standard output, then
# terminate the program with exit status 1.
sub usage {
    my $helpText = <<'END_USAGE';

Create an OrthoMCL compliant .fasta file, by adjusting definition lines.

Usage:
orthomclAdjustFasta taxon_code fasta_file id_field

where:
taxon_code: a three or four letter unique abbreviation for the taxon
fasta_file: the input fasta file
id_field: a number indicating what field in the definition line contains
the protein ID. Fields are separated by either ' ' or '|'. Any
spaces immediately following the '>' are ignored. The first
field is 1. For example, in the following definition line, the
ID (AP_000668.1) is in field 4: >gi|89106888|ref|AP_000668.1|

Input file requirements:
(1) .fasta format
(2) a unique id is provided for each sequence, and is in the field specified
by id_field

Output file format:
(1) .fasta format
(2) definition line is of the form:
>taxoncode|unique_protein_id

The output file is named taxoncode.fasta

Note: if your input files do not meet the requirements, you can do some simple perl or awk processing of them to create the required input files to this program, or the required output files. This program is provided as a convenience, but OrthoMCL users are expected to have the scripting skills to provide OrthoMCL compliant .fasta files.

EXAMPLE: orthomclSoftware/bin/orthomclAdjustFasta hsa Homo_sapiens.NCBI36.53.pep.all.fa 1

END_USAGE
    print $helpText;
    exit(1);
}
63 |
--------------------------------------------------------------------------------
/bin/orthomclInstallSchema.sql:
--------------------------------------------------------------------------------
-- OrthoMCL schema for a MySQL database named 'orthomcl'.
--
-- MySQL treats "--" as a comment only when it is followed by whitespace, so
-- every comment and separator line in this script starts with "-- ".
--
-- NOTE(review): the ID column widths and the EVALUE_MANT type follow the
-- orthomclInstallSchema program in bin/ (VARCHAR(60)/VARCHAR(40), FLOAT);
-- confirm that the two installers are meant to produce the same schema.

USE orthomcl;

-- Remove schema if it already exists

DROP TABLE IF EXISTS orthomcl.SimilarSequences;
DROP TABLE IF EXISTS orthomcl.InParalog;
DROP TABLE IF EXISTS orthomcl.Ortholog;
DROP TABLE IF EXISTS orthomcl.CoOrtholog;
DROP TABLE IF EXISTS orthomcl.BestInterTaxonScore;
DROP TABLE IF EXISTS orthomcl.BestQueryTaxonScore;
DROP TABLE IF EXISTS orthomcl.InterTaxonMatch;

-- Create schema

CREATE TABLE orthomcl.SimilarSequences (
  QUERY_ID          VARCHAR(60),
  SUBJECT_ID        VARCHAR(60),
  QUERY_TAXON_ID    VARCHAR(40),
  SUBJECT_TAXON_ID  VARCHAR(40),
  EVALUE_MANT       FLOAT,
  EVALUE_EXP        BIGINT(20),
  PERCENT_IDENTITY  FLOAT,
  PERCENT_MATCH     FLOAT
);

CREATE INDEX ss_qtaxexp_ix ON orthomcl.SimilarSequences(query_id, subject_taxon_id, evalue_exp, evalue_mant, query_taxon_id, subject_id);
CREATE INDEX ss_seqs_ix ON orthomcl.SimilarSequences(query_id, subject_id, evalue_exp, evalue_mant);

-- ---------------------------------------------------------

CREATE TABLE orthomcl.InParalog (
  SEQUENCE_ID_A       VARCHAR(60),
  SEQUENCE_ID_B       VARCHAR(60),
  TAXON_ID            VARCHAR(40),
  UNNORMALIZED_SCORE  DOUBLE,
  NORMALIZED_SCORE    DOUBLE
);

-- ---------------------------------------------------------

CREATE TABLE orthomcl.Ortholog (
  SEQUENCE_ID_A       VARCHAR(60),
  SEQUENCE_ID_B       VARCHAR(60),
  TAXON_ID_A          VARCHAR(40),
  TAXON_ID_B          VARCHAR(40),
  UNNORMALIZED_SCORE  DOUBLE,
  NORMALIZED_SCORE    DOUBLE
);

-- An index name cannot be qualified with the schema in MySQL, and the table
-- name must match the case used in CREATE TABLE (table names are case
-- sensitive on most Unix installations).
CREATE INDEX ortholog_seq_a_ix ON orthomcl.Ortholog(sequence_id_a);
CREATE INDEX ortholog_seq_b_ix ON orthomcl.Ortholog(sequence_id_b);

-- ---------------------------------------------------------

CREATE TABLE orthomcl.CoOrtholog (
  SEQUENCE_ID_A       VARCHAR(60),
  SEQUENCE_ID_B       VARCHAR(60),
  TAXON_ID_A          VARCHAR(40),
  TAXON_ID_B          VARCHAR(40),
  UNNORMALIZED_SCORE  DOUBLE,
  NORMALIZED_SCORE    DOUBLE
);

-- ---------------------------------------------------------

-- Matches between sequences of different taxa only.
CREATE OR REPLACE VIEW orthomcl.InterTaxonMatch
AS SELECT ss.query_id, ss.subject_id, ss.subject_taxon_id,
          ss.evalue_mant, ss.evalue_exp
   FROM orthomcl.SimilarSequences ss
   WHERE ss.subject_taxon_id != ss.query_taxon_id;

-- exit;
82 |
--------------------------------------------------------------------------------
/config/gus.config:
--------------------------------------------------------------------------------
1 | ##
2 | ## GUS Configuration
3 | ##
4 | ##
5 | ## @version $Revision: 1.1.2.2 $ $Date: 2005/05/06 16:32:34 $
6 | ##
7 |
8 | ### GUS End User Configuration
9 | ###
10 |
11 |
12 | ####### gus.properties
13 |
14 | # RDBMS Connection Strings
15 | ## dbiDsn is for Perl ( i.e. dbi:Oracle:NAME_OF_DATABASE )
16 | ## jdbcDsn is for JDBC/Java ( i.e. jdbc:oracle:thin:@HOSTNAME:PORT:NAME_OF_DATABASE )
17 | dbVendor=Oracle
18 |
19 | #dbiDsn=dbi:Oracle:plas550n
20 | #databaseLogin=sfischer
21 | #databasePassword=heathersofia
22 | #group=plasmoDB
23 | #project=plasmoDB:5.4
24 | #databaseLogin=apidb
25 | #databasePassword=po34weep
26 |
27 | dbiDsn=dbi:Oracle:trypdev
28 | jdbcDsn=jdbc:oracle:oci:@trypdev
29 | databaseLogin=sfischer
30 | databasePassword=heathersofia
31 | group=OrthoMCL
32 | project=OrthoMCL:2.0
33 |
34 |
35 | # Username, group, and project info from the relevant Core tables
36 |
37 | userName=dba
38 |
39 | tablespace=GUS
40 |
41 | ####### install.prop
42 |
43 | # Path to Perl Executable
44 | perl=/usr/bin/perl
45 |
46 | ####### GUS-PluginMgr.prop
47 |
48 | # Path to MD5 Executable
49 | md5sum=/usr/bin/md5sum
50 |
51 | ################################################################################
52 | ### Warning: Do not change items below here unless you know what you're doing
53 | ################################################################################
54 |
55 | gusSchemas=Core,App,RAD,DoTS,SRes,TESS,Prot,Study,PlasmoDB
56 | coreSchemaName=CORE
57 |
58 | sequenceStart=1
59 |
60 | ### Comma delimited list of housekeeping columns. In order as they should appear in the tables
61 | housekeepingColumns=MODIFICATION_DATE,USER_READ,USER_WRITE,GROUP_READ,GROUP_WRITE,OTHER_READ,OTHER_WRITE,ROW_USER_ID,ROW_GROUP_ID,ROW_PROJECT_ID,ROW_ALG_INVOCATION_ID
62 | housekeepingColumnsVer=MODIFICATION_DATE,USER_READ,USER_WRITE,GROUP_READ,GROUP_WRITE,OTHER_READ,OTHER_WRITE,ROW_USER_ID,ROW_GROUP_ID,ROW_PROJECT_ID,ROW_ALG_INVOCATION_ID,VERSION_ALG_INVOCATION_ID,VERSION_DATE,VERSION_TRANSACTION_ID
63 |
64 | ### type,length,precision,nullable
65 |
66 | hkspec.MODIFICATION_DATE=Date,0,0,false
67 | hkspec.USER_READ=Character,1,0,false
68 | hkspec.USER_WRITE=Character,1,0,false
69 | hkspec.GROUP_READ=Character,1,0,false
70 | hkspec.GROUP_WRITE=Character,1,0,false
71 | hkspec.OTHER_READ=Character,1,0,false
72 | hkspec.OTHER_WRITE=Character,1,0,false
73 | hkspec.ROW_USER_ID=Number,12,0,false
74 | hkspec.ROW_GROUP_ID=Number,4,0,false
75 | hkspec.ROW_PROJECT_ID=Number,4,0,false
76 | hkspec.ROW_ALG_INVOCATION_ID=Number,12,0,false
77 |
78 | hkspec.VERSION_ALG_INVOCATION_ID=Number,12,0,true
79 | hkspec.VERSION_DATE=Date,0,0,true
80 | hkspec.VERSION_TRANSACTION_ID=Number,12,0,true
81 |
82 | hibernate.mapdir=/Users/msaffitz/cvswork/gusdba/build/hbm
83 | hibernate.basePkg=org.gus.model
84 |
--------------------------------------------------------------------------------
/bin/orthomclLoadBlast:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use DBI;
4 | use FindBin;
5 | use lib "$FindBin::Bin/../lib/perl";
6 | use OrthoMCLEngine::Main::Base;
7 | use strict;
8 |
# Main program: read the configuration, open the database connection and
# hand the similar-sequences file to the loader for the configured vendor.
usage() if @ARGV < 2;
my ($configFile, $blastFile) = @ARGV[0, 1];

my $base = OrthoMCLEngine::Main::Base->new($configFile);
my $dbh = $base->getDbh();

my $dbVendor = $base->getConfig("dbVendor");

# one loader routine per supported database vendor
my %loaderFor = (
    mysql  => \&loadBlastMySQL,
    oracle => \&loadBlastOracle,
);

my $loader = $loaderFor{$dbVendor}
    or die "Config file '$configFile' contains invalid value '$dbVendor' for dbVendor\n";
$loader->($base, $blastFile);
26 |
# Load the tab-delimited similar-sequences file into the table named by the
# similarSequencesTable property, using MySQL's LOAD DATA LOCAL INFILE.
#
#   $base      - OrthoMCLEngine::Main::Base object (configuration + handle)
#   $blastFile - path of the file to load
#
# Dies with the database error text when the statement cannot be prepared or
# executed.
sub loadBlastMySQL {
    my ($base, $blastFile) = @_;
    require DBD::mysql;
    my $dbh = $base->getDbh();
    my $sst = $base->getConfig("similarSequencesTable");

    # Let the driver turn the path into a properly escaped string literal, so
    # that a quote or backslash in the file name cannot break the statement.
    my $quotedFile = $dbh->quote($blastFile);

    my $sql = "
LOAD DATA
LOCAL INFILE $quotedFile
REPLACE INTO TABLE $sst
FIELDS TERMINATED BY '\\t'
";
    my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
    $stmt->execute() or die $stmt->errstr;
}
41 |
42 |
# Load the tab-delimited similar-sequences file into the table named by the
# similarSequencesTable property by running Oracle's sqlldr program.
#
#   $base      - OrthoMCLEngine::Main::Base object (configuration)
#   $blastFile - path of the file to load
#
# Two temporary files are written to the current directory and removed again
# afterwards: orthomclPar.tmp (login information) and orthomclCtl.tmp (the
# sqlldr control file).  Dies when a temporary file cannot be written or when
# sqlldr cannot be run or reports a non-zero exit status.
sub loadBlastOracle {
    my ($base, $blastFile) = @_;

    my $dbLogin = $base->getConfig("dbLogin");
    my $dbPassword = $base->getConfig("dbPassword");
    my $dbString = $base->getConfig("dbConnectString");
    # third component of the connect string, e.g. dbi:Oracle:orthomcl
    my $dbInstance = (split(/:/, $dbString))[2];

    my $parFile = "orthomclPar.tmp";
    my $ctlFile = "orthomclCtl.tmp";

    # The parameter file holds the password in clear text, so it is created
    # readable and writable by its owner only.
    my $oldUmask = umask(0077);
    my $opened = open(my $par, '>', $parFile);
    my $openError = $!;
    umask($oldUmask);
    $opened || die "Can't open '$parFile' for writing: $openError\n";
    print $par "userid=$dbLogin/$dbPassword\@$dbInstance\n";
    close($par) || die "Error writing '$parFile': $!\n";

    my $sst = $base->getConfig("similarSequencesTable");

    my $sqlHeader = "
LOAD DATA
INFILE '$blastFile'
INTO TABLE $sst
FIELDS TERMINATED BY \"\\t\" OPTIONALLY ENCLOSED BY '\"'
TRAILING NULLCOLS
( query_id,
subject_id,
query_taxon_id,
subject_taxon_id,
evalue_mant,
evalue_exp,
percent_identity,
percent_match
)
";

    if (!open(my $ctl, '>', $ctlFile)) {
        my $ctlError = $!;
        unlink($parFile);    # never leave the password file behind
        die "Can't open '$ctlFile' for writing: $ctlError\n";
    } else {
        print $ctl $sqlHeader;
        close($ctl);
    }

    my $output = `sqlldr parfile=$parFile control=$ctlFile`;
    my ($status, $runError) = ($?, $!);    # saved before unlink can reset them
    unlink($ctlFile, $parFile);

    die "Can't run sqlldr: $runError\n" if $status == -1;
    if ($status != 0) {
        $output = '' unless defined $output;
        die "sqlldr failed with exit status " . ($status >> 8) . "\n$output";
    }
}
82 |
# Print the command-line help for orthomclLoadBlast on standard output and
# exit with status 1.  The exit matters: without it the caller would carry on
# with missing arguments after the help text has been shown.
sub usage {
    print "
Load Blast results into an Oracle or Mysql database.

usage: orthomclLoadBlast config_file similar_seqs_file

where:
  config_file : see below
  similar_seqs_file : output from orthomclBlastParser

EXAMPLE: orthomclSoftware/bin/orthomclLoadBlast my_orthomcl_dir/orthomcl.config my_orthomcl_dir/similarSequences.txt

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

Sample Config File:

dbVendor=oracle (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
";
    exit(1);
}
106 |
--------------------------------------------------------------------------------
/bin/orthomclFilterFasta:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use strict;
4 |
# Main program: read every fasta file in the input directory and write each
# sequence either to goodProteins.fasta or to poorProteins.fasta (the
# decision is made by handleSeq).  Afterwards, report the input files in
# which more than 10% of the sequences were rejected.
usage() unless scalar(@ARGV) == 3;

my ($inputDir, $minLength, $maxStopPercent) = @ARGV;

opendir(DIR, $inputDir) || die "Can't open input directory '$inputDir'\n";
# readdir always returns "." and "..", so dot entries and anything that is
# not a plain file are dropped before the emptiness check below.
my @files = grep { !/^\./ && -f "$inputDir/$_" } readdir(DIR);
closedir(DIR);

die "Input directory $inputDir does not contain any files" unless scalar(@files);

my $rejectRates = [];
open(GOOD, ">goodProteins.fasta") || die "Can't open 'goodProteins.fasta' for writing: $!\n";
open(BAD, ">poorProteins.fasta") || die "Can't open 'poorProteins.fasta' for writing: $!\n";
foreach my $file (@files) {
    open(my $in, '<', "$inputDir/$file") || die "Can't open input file '$file'\n";
    print STDERR "processing file $file\n";
    my $seqCount = 0;
    my $rejectSeqCount = 0;
    my $currentSeq = "";     # definition line plus sequence lines of one record
    my $currentLen = 0;      # number of characters in the sequence lines
    my $currentLetterCnt = 0; # number of letters among those characters

    # process lines of one file
    while (my $line = <$in>) {
        chomp $line;
        if ($line =~ /^\>/) {
            # a new definition line: the previous record is complete
            if ($currentSeq) {
                $seqCount++;
                $rejectSeqCount += handleSeq($currentSeq, $currentLen, $currentLetterCnt);
                $currentSeq = "";
                $currentLen = 0;
                $currentLetterCnt = 0;
            }
        } else {
            $currentLen += length($line);
            # tr with an empty replacement only counts; it does not modify
            # the line.  handleSeq derives the stop percentage from the
            # characters that are NOT letters (length minus this count).
            $currentLetterCnt += ($line =~ tr/A-Za-z//);
        }
        $currentSeq .= "$line\n";
    }
    # the last record of the file, if the file held any record at all
    if ($currentSeq) {
        $seqCount++;
        $rejectSeqCount += handleSeq($currentSeq, $currentLen, $currentLetterCnt);
    }

    # add file stats to reject count if it qualifies
    if ($rejectSeqCount) {
        my $pct = $rejectSeqCount/$seqCount * 100;
        if ($pct > 10) {
            push(@$rejectRates, [$file, $pct]);
        }
    }
    close($in);
}
# buffered write errors only show up when the output files are closed
close(GOOD) || die "Error writing 'goodProteins.fasta': $!\n";
close(BAD) || die "Error writing 'poorProteins.fasta': $!\n";

if (scalar(@$rejectRates)) {
    print "\nProteomes with > 10% poor proteins:\n";
    my @sortedRR = sort {$b->[1] <=> $a->[1]} @$rejectRates;
    foreach my $reject (@sortedRR) {
        my $intPct = int($reject->[1]);
        print " $reject->[0]\t$intPct%\n";
    }
}
69 |
# Write one fasta record to the GOOD or the BAD output file.
#
#   $seq     - the complete record (definition line and sequence lines)
#   $len     - number of characters in the sequence lines
#   $stopCnt - despite its name, used here as the number of ordinary residue
#              characters: the stop percentage is computed from the
#              remainder, ($len - $stopCnt) / $len
#
# A record is bad when it has no sequence at all, is shorter than the
# file-level $minLength, or has a stop percentage above the file-level
# $maxStopPercent.  Returns 1 for a bad record and 0 for a good one.
sub handleSeq {
    my ($seq, $len, $stopCnt) = @_;
    my $isBad = 0;

    # An empty record (a definition line with no sequence) must not reach the
    # division below; it is treated as 100% bad.
    my $stopPercent = $len ? (($len - $stopCnt)/$len) * 100 : 100;

    if (!$len || $len < $minLength || $stopPercent > $maxStopPercent) {
        print BAD $seq;
        $isBad = 1;
    } else {
        print GOOD $seq;
    }
    return $isBad;
}
83 |
# Print the command-line help for orthomclFilterFasta on standard output and
# exit with status 1.  The file names in the text are the ones the program
# actually writes (goodProteins.fasta and poorProteins.fasta).
sub usage {
    print "
Create goodProteins.fasta containing all good proteins and poorProteins.fasta containing all rejects. Input is a directory containing a set of compliant input .fasta files (as produced by orthomclAdjustFasta).

Usage:
  orthomclFilterFasta input_dir min_length max_percent_stops

where:
  input_dir: a directory containing a set of .fasta files
  min_length: minimum allowed length of proteins. (suggested: 10)
  max_percent_stops: maximum percent stop codons. (suggested 20)

EXAMPLE: orthomclSoftware/bin/orthomclFilterFasta my_orthomcl_dir/compliantFasta 10 20

";
    exit(1);
}
101 |
--------------------------------------------------------------------------------
/bin/orthomclDumpPairsFiles:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use DBI;
4 | use FindBin;
5 | use lib "$FindBin::Bin/../lib/perl";
6 | use OrthoMCLEngine::Main::Base;
7 | use strict;
8 |
# Main program: dump the pair tables to text files.  The three relationship
# files go into a new directory "pairs"; the combined mcl input file
# "mclInput" is written to the current directory.
my $configFile = $ARGV[0];

usage() unless $configFile;

my $base = OrthoMCLEngine::Main::Base->new($configFile);
my $dbh = $base->getDbh();

my $orthologTable = $base->getConfig("orthologTable");
my $inParalogTable = $base->getConfig("inParalogTable");
my $coOrthologTable = $base->getConfig("coOrthologTable");

my $dir = "pairs";

# refuse to mix new output with the files of an earlier run
die "dir '$dir' already exists" if -e $dir;

mkdir($dir) || die "Can't create dir '$dir': $!\n";

printOrthologsFile($dbh, $orthologTable, "$dir/orthologs.txt");

printInparalogsFile($dbh, $inParalogTable, "$dir/inparalogs.txt");

# the co-ortholog table has the same columns as the ortholog table
printOrthologsFile($dbh, $coOrthologTable, "$dir/coorthologs.txt");

printMclAbcFile($dbh, $orthologTable, $inParalogTable, $coOrthologTable,
                "mclInput");
34 |
35 |
36 | ################# subroutines #########################
37 |
# Write the rows of the in-paralog table to $fileName, one pair per line:
#   sequence_id_a <tab> sequence_id_b <tab> normalized_score
# The score is rounded to three decimals.  Rows are ordered by taxon and then
# by the two sequence IDs.
#
#   $dbh            - open database handle
#   $inparalogTable - name of the table to read
#   $fileName       - file to create
#
# Dies when the query fails or the file cannot be written.
sub printInparalogsFile {
    my ($dbh, $inparalogTable, $fileName) = @_;

    my $sql = "
select taxon_id, sequence_id_a, sequence_id_b, normalized_score
from $inparalogTable
order by taxon_id, sequence_id_a, sequence_id_b asc
";

    my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
    $stmt->execute() or die $stmt->errstr;
    open(my $out, '>', $fileName) || die "Can't open '$fileName' for writing";
    # taxon_id is selected for the ordering only; it is not written out
    while (my ($taxonId, $sourceIdA, $sourceIdB, $score) = $stmt->fetchrow_array()) {
        $score = int($score * 1000 + .5)/1000;
        print $out "$sourceIdA\t$sourceIdB\t$score\n";
    }
    # buffered write errors only show up when the file is closed
    close($out) || die "Error writing '$fileName': $!\n";
}
56 |
# Write the rows of an ortholog-style table (two taxon columns) to $fileName,
# one pair per line:
#   sequence_id_a <tab> sequence_id_b <tab> normalized_score
# The score is rounded to three decimals.  Rows are ordered by the two taxa
# and then by the two sequence IDs.
#
#   $dbh      - open database handle
#   $table    - name of the table to read
#   $fileName - file to create
#
# Dies when the query fails or the file cannot be written.
sub printOrthologsFile {
    my ($dbh, $table, $fileName) = @_;

    my $sql = "
select taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b, normalized_score
from $table
order by taxon_id_a, taxon_id_b, sequence_id_a, sequence_id_b asc
";

    my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
    $stmt->execute() or die $stmt->errstr;
    open(my $out, '>', $fileName) || die "Can't open '$fileName' for writing";
    # the taxon columns are selected for the ordering only
    while (my ($taxonIdA, $taxonIdB, $sourceIdA, $sourceIdB, $score) = $stmt->fetchrow_array()) {
        $score = int($score * 1000 + .5)/1000;
        print $out "$sourceIdA\t$sourceIdB\t$score\n";
    }
    # buffered write errors only show up when the file is closed
    close($out) || die "Error writing '$fileName': $!\n";
}
75 |
# Write the union of the in-paralog, ortholog and co-ortholog pairs to
# $fileName, one pair per line:
#   sequence_id_a <tab> sequence_id_b <tab> normalized_score
# The score is rounded to three decimals.
#
#   $dbh             - open database handle
#   $orthologTable   - name of the ortholog table
#   $inParalogTable  - name of the in-paralog table
#   $coOrthologTable - name of the co-ortholog table
#   $fileName        - file to create
#
# Dies when the query fails or the file cannot be written.
sub printMclAbcFile {
    my ($dbh, $orthologTable, $inParalogTable, $coOrthologTable, $fileName) = @_;

    my $sql = "
select sequence_id_a, sequence_id_b, normalized_score
from $inParalogTable
union
select sequence_id_a, sequence_id_b, normalized_score
from $orthologTable
union
select sequence_id_a, sequence_id_b, normalized_score
from $coOrthologTable
";

    my $stmt = $dbh->prepare($sql) or die $dbh->errstr;
    $stmt->execute() or die $stmt->errstr;
    open(my $out, '>', $fileName) || die "Can't open '$fileName' for writing";
    while (my ($queryId, $subjectId, $score) = $stmt->fetchrow_array()) {
        $score = int($score * 1000 + .5)/1000;
        print $out "$queryId\t$subjectId\t$score\n";
    }
    # buffered write errors only show up when the file is closed
    close($out) || die "Error writing '$fileName': $!\n";
}
99 |
# Print the command-line help for orthomclDumpPairsFiles on standard output
# and exit with status 1.  The output file names in the text are the ones the
# program actually writes.
sub usage {
    print "
Dump files from the database produced by the orthomclPairs program.

usage: orthomclDumpPairsFiles config_file

where:
  config_file : see below (you can use the same file given to orthomclPairs)

Database Input:
  - InParalog, Ortholog, CoOrtholog tables - populated by orthomclPairs

Output files:
  mclInput - file required by the mcl program
  pairs/ - dir holding relationship files
    orthologs.txt - ortholog relationships
    inparalogs.txt - inparalog relationships
    coorthologs.txt - coortholog relationships

The pairs/ files contain the pairs found by the orthomclPairs tables, and their
average normalized scores. This is the same information as in the
mclInput file, but segregated by relationship type. These are
candidate relationships (edges) that will subsequently be grouped (clustered)
by the mcl program to form the OrthoMCL ortholog groups. These files contain
more sensitive and less selective relationships than the final ortholog groups.

Standard Error:
  - logging info

EXAMPLE: orthomclSoftware/bin/orthomclDumpPairsFiles my_orthomcl_dir/orthomcl.config

Sample Config File:

dbVendor=oracle (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
";
    exit(1);
}
143 |
144 |
--------------------------------------------------------------------------------
/bin/orthomclInstallSchema:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use DBI;
4 | use FindBin;
5 | use lib "$FindBin::Bin/../lib/perl";
6 | use OrthoMCLEngine::Main::Base;
7 | use strict;
8 |
9 |
# Main program: read the configuration, connect to the database and create
# the OrthoMCL tables, indexes and view.  When a second argument is given,
# every SQL statement is also appended to that log file.
usage() unless (@ARGV >= 1);
my $configFile = $ARGV[0];
my $sqlLog = $ARGV[1];

my $base = OrthoMCLEngine::Main::Base->new($configFile);
my $dbh = $base->getDbh();

if ($sqlLog) {
    open(LOGFILE, ">", $sqlLog) || die "Can't open sql log file '$sqlLog' for writing: $!\n";
}

# The vendor must be read from the configuration before it is tested;
# otherwise the integer type below is 'INT' even for Oracle.
my $dbVendor = $base->getConfig("dbVendor");
my $intType = ($dbVendor eq 'oracle') ? 'NUMBER' : 'INT';
my $oracleNoLogging = ($dbVendor eq 'oracle') ? " NOLOGGING" : "";
# NOTE(review): this value is read (so the property must be present in the
# config file) but no statement built in this program refers to it.
my $oracleIndexTblSpc = $base->getConfig("oracleIndexTblSpc");
$oracleIndexTblSpc =~ s/\s//g;

createSimilarSequencesTable();
createInParalogTable();
createOrthologTable();
createCoOrthologTable();
createInterTaxonMatchView();
32 |
33 | ##############################################################
34 |
# Create the similar-sequences table (named by the similarSequencesTable
# property) together with its two unique indexes.  Each statement is run
# through runSql.
sub createSimilarSequencesTable {
    my $table = $base->getConfig("similarSequencesTable");

    my $createTable = "
CREATE TABLE $table (
QUERY_ID VARCHAR(60),
SUBJECT_ID VARCHAR(60),
QUERY_TAXON_ID VARCHAR(40),
SUBJECT_TAXON_ID VARCHAR(40),
EVALUE_MANT FLOAT,
EVALUE_EXP $intType,
PERCENT_IDENTITY FLOAT,
PERCENT_MATCH FLOAT
) $oracleNoLogging
";

    my $createTaxonIndex = "
CREATE UNIQUE INDEX ss_qtaxexp_ix
ON $table(query_id, subject_taxon_id,
evalue_exp, evalue_mant,
query_taxon_id, subject_id) $oracleNoLogging
";

    my $createSeqsIndex = "
CREATE UNIQUE INDEX ss_seqs_ix
ON $table(query_id, subject_id,
evalue_exp, evalue_mant, percent_match) $oracleNoLogging
";

    runSql($createTable);
    runSql($createTaxonIndex);
    runSql($createSeqsIndex);
}
67 |
68 |
# Create the in-paralog table, named by the inParalogTable property.
sub createInParalogTable {
    my $table = $base->getConfig("inParalogTable");
    runSql("
CREATE TABLE $table (
SEQUENCE_ID_A VARCHAR(60),
SEQUENCE_ID_B VARCHAR(60),
TAXON_ID VARCHAR(40),
UNNORMALIZED_SCORE FLOAT,
NORMALIZED_SCORE FLOAT
)
");
}
82 |
83 |
# Create the ortholog table (named by the orthologTable property) and one
# index on each of its two sequence ID columns.
sub createOrthologTable {
    my $table = $base->getConfig("orthologTable");

    my $createTable = "
CREATE TABLE $table (
SEQUENCE_ID_A VARCHAR(60),
SEQUENCE_ID_B VARCHAR(60),
TAXON_ID_A VARCHAR(40),
TAXON_ID_B VARCHAR(40),
UNNORMALIZED_SCORE FLOAT,
NORMALIZED_SCORE FLOAT
)
";

    my $createIndexA = "
CREATE INDEX ortholog_seq_a_ix
ON $table(sequence_id_a)
";

    my $createIndexB = "
CREATE INDEX ortholog_seq_b_ix
ON $table(sequence_id_b)
";

    runSql($createTable);
    runSql($createIndexA);
    runSql($createIndexB);
}
110 |
111 |
# Create the co-ortholog table, named by the coOrthologTable property.
sub createCoOrthologTable {
    my $table = $base->getConfig("coOrthologTable");
    runSql("
CREATE TABLE $table (
SEQUENCE_ID_A VARCHAR(60),
SEQUENCE_ID_B VARCHAR(60),
TAXON_ID_A VARCHAR(40),
TAXON_ID_B VARCHAR(40),
UNNORMALIZED_SCORE FLOAT,
NORMALIZED_SCORE FLOAT
)
");
}
126 |
# Create (or replace) the view named by the interTaxonMatchView property.
# It selects the rows of the similar-sequences table whose query and subject
# belong to different taxa.
sub createInterTaxonMatchView {
    my $table = $base->getConfig("similarSequencesTable");
    my $view = $base->getConfig("interTaxonMatchView");
    runSql("
CREATE OR REPLACE VIEW $view
AS SELECT ss.query_id, ss.subject_id, ss.subject_taxon_id,
ss.evalue_mant, ss.evalue_exp
FROM $table ss
WHERE ss.subject_taxon_id != ss.query_taxon_id
");
}
139 |
# Execute one SQL statement against the open database handle ($dbh).
#
# Parameters:
#   $sql - the SQL text to run.
# Returns:
#   the value returned by the statement handle's execute().
# Dies:
#   with the driver's error text and the offending SQL if either prepare or
#   execute fails.
#
# When $sqlLog is set, the statement is first written to the log via logSql().
sub runSql {
    my ($sql) = @_;

    logSql($sql) if $sqlLog;

    # Report errors through the handle that produced them. The previous
    # "die DBI::errstr" was a bareword rather than the $DBI::errstr variable
    # or a handle method, so the real driver message could be lost.
    my $stmt = $dbh->prepare($sql)
        or die "Failed preparing SQL: " . $dbh->errstr . "\n$sql\n";

    my $result = $stmt->execute()
        or die "Failed executing SQL: " . $stmt->errstr . "\n$sql\n";

    return $result;
}
148 |
149 |
# Append one SQL statement to the LOGFILE handle, preceded by a newline.
sub logSql {
    my ($statement) = @_;
    print LOGFILE "\n" . $statement;
}
154 |
# Print the command-line help for orthomclInstallSchema and exit with status 1.
#
# The sample config names the blast table property "similarSequencesTable",
# which is the key this program reads; the help used to show the stale name
# "blastResultsTable".
sub usage {
    print <<'END_USAGE';

Create OrthoMCL schema in an Oracle or Mysql database.

usage: orthomclInstallSchema config_file sql_log_file

where:
config_file : see below
sql_log_file : optional log of sql executed

EXAMPLE: orthomclSoftware/bin/orthomclInstallSchema my_orthomcl_dir/orthomcl.config my_orthomcl_dir/install_schema.log

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

Sample Config File:

dbVendor=oracle (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch

END_USAGE
    exit(1);
}
184 |
185 |
--------------------------------------------------------------------------------
/bin/orthomclBlastParser:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use strict;
4 |
# Main program: read BLAST tabular (m8) output and print one summary row per
# query/subject pair.
#
# Arguments (from @ARGV):
#   blast_file       - BLAST output in m8 format
#   fasta_files_dir  - directory of compliant 'taxon.fasta' files
my $blastFile = shift(@ARGV);
my $fastaFilesDir = shift(@ARGV);

usage() unless $blastFile;
usage() unless $fastaFilesDir;

opendir(my $fastaDirHandle, $fastaFilesDir) || die "Can't open fasta directory '$fastaFilesDir'\n";
my @fastaFiles = readdir($fastaDirHandle);
closedir($fastaDirHandle);

# gene id -> { taxon, length }, gathered from every fasta file in the directory
my $genes = getGenesFromFasta($fastaFilesDir, @fastaFiles);

open(my $blastHandle, '<', $blastFile) || die "can't open BLAST file '$blastFile'\n";

# m8 columns, in order:
#   query_name, hitname,
#   pcid, len,
#   mismatches, ngaps,
#   start('query'), end('query'),
#   start('hit'), end('hit'),
#   evalue, bits

my $prevQueryId = '';
my $prevSubjectId = '';
my $subject; # hash to hold info accumulated for the current query/subject pair

while (my $line = <$blastHandle>) {
  chomp $line;
  next if $line =~ /^\s*$/;   # tolerate blank lines in the BLAST file

  my ($queryId, $subjectId, $percentIdentity, $length, $mismatches, $ngaps, $queryStart, $queryEnd, $subjectStart, $subjectEnd, $evalue, $bits) = split(' ', $line);

  # A new pair starts when either id changes. Comparing the subject alone
  # merged the last hit of one query with the first hit of the next query
  # whenever both were against the same subject.
  if ($queryId ne $prevQueryId || $subjectId ne $prevSubjectId) {

    # print previous pair
    printPreviousSubject($subject) if $subject;

    # initialize new one from first HSP
    $prevQueryId = $queryId;
    $prevSubjectId = $subjectId;

    $subject = {};
    $subject->{queryId} = $queryId;
    $subject->{subjectId} = $subjectId;
    $subject->{queryShorter} = getTaxonAndLength($subject, $genes);

    ($subject->{evalueMant}, $subject->{evalueExp})
      = formatEvalue($evalue); # from first hsp
  }

  # accumulate info from this HSP; spans are taken on the shorter sequence
  my $hspspan = [$subjectStart, $subjectEnd];
  $hspspan = [$queryStart, $queryEnd] if $subject->{queryShorter};
  push(@{$subject->{hspspans}}, $hspspan);
  $subject->{totalIdentities} += $percentIdentity * $length;
  $subject->{totalLength} += $length;
}
close($blastHandle);

# flush the final pair; skipped when the BLAST file had no rows, which would
# otherwise divide by zero inside printPreviousSubject
printPreviousSubject($subject) if $subject;
62 |
63 | ########################################################################################
64 |
# Scan a set of 'taxon.fasta' files and record the taxon and sequence length
# of every gene they define.
#
# Parameters:
#   $fastaFilesDir - directory holding the fasta files
#   @fastaFiles    - file names within that directory (entries starting with
#                    '.' are ignored, so raw readdir() output is acceptable)
# Returns:
#   hash ref: gene id => { taxon => <file's taxon prefix>, length => <residue count> }
# Dies:
#   if a file name is not in 'taxon.fasta' form, or a file cannot be opened.
sub getGenesFromFasta {
    my ($fastaFilesDir, @fastaFiles) = @_;

    my $genes = {};
    foreach my $fastaFile (@fastaFiles) {
        next if $fastaFile =~ /^\./;
        print STDERR "acquiring genes from $fastaFile\n";

        # The dot is escaped so that only a literal '.fasta' suffix is accepted.
        $fastaFile =~ /(\w+)\.fasta/ || die "'$fastaFile' is not in 'taxon.fasta' format\n";
        my $taxon = $1;

        my $fastaPath = "$fastaFilesDir/$fastaFile";
        open(my $fastaHandle, '<', $fastaPath) || die "can't open fasta file '$fastaPath'";

        my $gene;     # id from the most recent definition line
        my $length;   # residues counted so far for $gene
        while (my $line = <$fastaHandle>) {
            chomp $line;
            next if $line =~ /^\s*$/;
            if ($line =~ /\>(\S+)/) {
                # a new definition line closes out the previous gene
                if (defined $gene) {
                    $genes->{$gene}->{length} = $length;
                    $genes->{$gene}->{taxon} = $taxon;
                }
                $gene = $1;
                $length = 0;
            } else {
                $length += length($line);
            }
        }
        # record the last gene in the file
        if (defined $gene) {
            $genes->{$gene}->{length} = $length;
            $genes->{$gene}->{taxon} = $taxon;
        }
        close($fastaHandle);
    }
    return $genes;
}
96 |
# Fill in taxon and length for both members of a query/subject pair.
#
# Parameters:
#   $subject - hash ref with queryId and subjectId already set; gains
#              queryTaxon, subjectTaxon, queryLength and subjectLength
#   $genes   - hash ref: gene id => { taxon, length }
# Returns:
#   true when the query sequence is strictly shorter than the subject.
# Dies:
#   when no taxon is known for the subject (checked first) or the query.
sub getTaxonAndLength {
    my ($subject, $genes) = @_;

    my $queryId   = $subject->{queryId};
    my $subjectId = $subject->{subjectId};

    $subject->{queryTaxon}    = $genes->{$queryId}->{taxon};
    $subject->{subjectTaxon}  = $genes->{$subjectId}->{taxon};
    $subject->{queryLength}   = $genes->{$queryId}->{length};
    $subject->{subjectLength} = $genes->{$subjectId}->{length};

    if (!$subject->{subjectTaxon}) {
        die "couldn't find taxon for gene '$subjectId'";
    }
    if (!$subject->{queryTaxon}) {
        die "couldn't find taxon for gene '$queryId'";
    }

    return $subject->{queryLength} < $subject->{subjectLength};
}
107 |
# Print the tab-delimited summary row for one finished query/subject pair.
#
# Columns: query id, subject id, query taxon, subject taxon, evalue mantissa,
# evalue exponent, percent identity, percent match. Both percentages are
# rounded to one decimal place.
sub printPreviousSubject {
    my ($subject) = @_;

    my $matchedLength = computeNonOverlappingMatchLength($subject);

    # length-weighted identity across all HSPs
    my $percentIdent =
        int($subject->{totalIdentities} / $subject->{totalLength} * 10 + .5) / 10;

    # percent match is relative to whichever sequence is shorter
    my $shorterLength;
    if ($subject->{queryShorter}) {
        $shorterLength = $subject->{queryLength};
    } else {
        $shorterLength = $subject->{subjectLength};
    }
    my $percentMatch = int($matchedLength / $shorterLength * 1000 + .5) / 10;

    my @fields = (
        "$subject->{queryId}",
        "$subject->{subjectId}",
        "$subject->{queryTaxon}",
        "$subject->{subjectTaxon}",
        "$subject->{evalueMant}",
        "$subject->{evalueExp}",
        "$percentIdent",
        "$percentMatch",
    );
    print join("\t", @fields) . "\n";
}
119 |
# Split a BLAST e-value into a (mantissa, exponent) pair so that
# mantissa * 10**exponent equals the e-value.
#
# Parameters:
#   $evalue - e-value text as printed by BLAST, e.g. '2e-50', 'e-100',
#             '0.005', '0.0' or '3'
# Returns:
#   ($mantissa, $exponent)
#
# Exponent notation is parsed textually, so values too small for a double
# keep their exponent. Plain decimals are normalised to scientific notation;
# they used to be truncated with int(), which turned e.g. 0.005 into 0e0
# (indistinguishable from a perfect hit).
sub formatEvalue {
    my ($evalue) = @_;
    $evalue = '' unless defined $evalue;

    # exponent notation: '2e-50', '1.5E-7', 'e-100', '1e+02'
    if ($evalue =~ /^([\d.]*)e([+-]?\d+)$/i) {
        my ($mantissa, $exponent) = ($1, $2);
        $mantissa = '1' if $mantissa eq '';   # BLAST may print 'e-100' for 1e-100
        return ($mantissa, $exponent + 0);
    }

    # plain decimal or integer; anything non-numeric is treated as zero
    my $number = ($evalue =~ /^\s*[+-]?(?:\d+\.?\d*|\.\d+)\s*$/) ? $evalue + 0 : 0;
    return (0, 0) if $number == 0;

    my ($mantissa, $exponent) = split(/e/, sprintf('%.2e', $number));
    $mantissa =~ s/\.?0+$//;                  # '5.00' -> '5', '2.50' -> '2.5'
    return ($mantissa, $exponent + 0);
}
133 |
# Count how many positions are covered by at least one HSP, i.e. the length
# of the union of all HSP spans recorded for a query/subject pair.
#
# Parameters:
#   $subject - hash ref whose {hspspans} is an array ref of [start, end]
#              pairs; a pair may be reversed (start > end)
# Returns:
#   the total covered length, or 0 when there are no spans.
#
# Spans are oriented (start <= end) BEFORE sorting. Sorting on the raw first
# coordinate put reversed spans in the wrong place and under-counted the union.
# The input spans are not modified.
sub computeNonOverlappingMatchLength {
    my ($subject) = @_;

    my @spans =
        sort { $a->[0] <=> $b->[0] }
        map  { $_->[0] <= $_->[1] ? [ $_->[0], $_->[1] ] : [ $_->[1], $_->[0] ] }
        @{ $subject->{hspspans} || [] };
    return 0 unless @spans;

    my ($start, $end) = @{ shift @spans };
    my $len = 0;
    foreach my $span (@spans) {
        my ($hspStart, $hspEnd) = @$span;

        next if $hspEnd <= $end;          # fully inside the current run
        if ($hspStart <= $end) {          # overlaps: extend the current run
            $end = $hspEnd;
        } else {                          # gap: close the run and start a new one
            $len += $end - $start + 1;
            ($start, $end) = ($hspStart, $hspEnd);
        }
    }
    $len += $end - $start + 1;            # close the final run
    return $len;
}
157 |
158 | # flip orientation if nec.
# Return the two coordinates of an HSP span in ascending order.
#
# Parameters:
#   $h - array ref [coordinate, coordinate], in either orientation
# Returns:
#   (smaller, larger)
sub getStartEnd {
    my ($h) = @_;

    my $first  = $h->[0];
    my $second = $h->[1];

    # flip only when the span is reversed
    return $first > $second ? ($second, $first) : ($first, $second);
}
169 |
# Print the command-line help for orthomclBlastParser and exit with status 1.
sub usage {
    my $helpText = <<'END_USAGE';


orthomclBlastParser blast_file fasta_files_dir

where:
blast_file: BLAST output in m8 format.
fasta_files_dir: a directory of compliant fasta files as produced by
orthomclAdjustFasta


m8 format has these columns:
query_name, hitname, pcid, len, mismatches, ngaps, start('query'),
end('query'), start('hit'), end('hit'), evalue, bits

output:
tab delimited text file, with one row per query-subject match. the columns are:
query_id, subject_id, query_taxon, subject_taxon,
evalue_mant, evalue_exp, percent_ident, percent_match

(percent_match is computed by counting the number of bases or amino acids in the shorter sequence that are matched in any hsp, and dividing by the length of that shorter sequence)

EXAMPLE: orthomclSoftware/bin/orthomclBlastParser my_blast_results my_orthomcl_dir/compliantFasta >> my_orthomcl_dir/similarSequences.txt


END_USAGE

    print $helpText;
    exit(1);
}
200 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/mysqlInstallGuide.txt:
--------------------------------------------------------------------------------
1 | THIS FILE IS UNDER CONSTRUCTION. please mail stevef@pcbi.upenn.edu with questions....
2 |
3 | This file is a guide to installing a mysql server by a regular user (not an administrator).
4 | It installs mysql in the user's space.
5 |
6 | It is intended to serve users who:
7 | - do not already have a mysql available, and cannot get an admin to install one
8 | - do have one already installed, but will not be able to use it or reconfigure it
9 |
10 |
11 | ----INSTALLATION OVERVIEW---------------------------
12 |
13 | I. General Requirements
14 | II. Installing from General Linux/Unix Binary Packages
15 | III. Creating a Database and User Account
16 | IV. Installing Required PERL modules for MySQL
17 | V. Optimizing MySQL
18 | VI. Troubleshooting Installation Issues
19 | VII. Removing Your MySQL Installation
20 |
21 | ----------------------------------------------------
22 | ----------------------------------------------------
23 | I. General Requirements
24 |
25 | - MySQL server 5.1 or greater
26 | - MySQL DBI and DBD driver modules for PERL (available at CPAN)
27 | - Unix/Linux or MacOS 10.0.1 or greater
28 |
29 |
30 | ----------------------------------------------------
31 | ----------------------------------------------------
32 | II. Installing MySQL From Linux/Unix Binary Packages (Root access not required)
33 |
34 | 1. Go to http://www.mysql.com/downloads/
35 | a) click on "Download" under MySQL Community Server
36 |
37 | b) for linux: click on "Linux (non RPM packages)"
38 |
39 | c) otherwise, find the .tar package for your OS
40 |
41 | d) choose the download for your platform
42 | - use the 'uname -a' command if you don't know your platform
43 | - 'cat /proc/cpuinfo | grep model' will list your processor type (ie.
44 | AMD or Intel, 32 or 64 bit) if you are running Linux.
45 |
46 | e) either login/register or click "No thanks, just take me to the downloads!"
47 |
48 | f) choose a mirror near you, and download
49 |
50 |
51 | 2. Create a directory for your installation, and move the file to it.
52 | a) This directory needs to be on a volume that has *adequate space* for all the data.
53 | - Please see the UserGuide.txt to estimate how much disk space you will need.
54 |
55 | b) (See step 5 below if you want to relocate the mysql data to a separate volume.)
56 |
57 |
58 | 3. Unzip the downloaded file into the directory:
59 |
60 | tar -xzvf mysql-standard-5.1.34-linux-i686-glibc23.tar.gz
61 | rm mysql-standard-5.1.34-linux-i686-glibc23.tar.gz (to save disk space)
62 |
63 | 4. Give the new directory a shortcut name
64 |
65 | ln -s mysql-standard-5.1.34-linux-i686-glibc23 mysql
66 |
67 |
68 | 5. Change to the mysql directory and configure mysql using the sample config file provided in orthomcl
69 | download:
70 |
71 | cd mysql
72 | cp my_orthomcl_dir/doc/OrthoMCLEngine/Main/mysql.cnf .
73 |
74 | a) set basedir= to the full path of your new mysql directory
75 |
76 | b) set datadir= to the full path of the /data subdir in your new mysql
77 | directory
78 | - this is the directory that will hold all the data.
79 | - use the df -h command to see how much space you have
80 | - you will need at least 5x the size of the file made by
81 | orthomclBlastParser
82 | - (revisit this after you have run orthomclBlastParser)
83 |
84 | c) now see the mysqlConfigurationGuide.txt for important optimization
85 | configuration items
86 |
87 | 6. Set up the default MySQL databases:
88 |
89 |
90 | ./scripts/mysql_install_db --defaults-file=mysql.cnf
91 |
92 | NOTE: The script will inform you that you need to set a root password.
93 | Don't worry about this for now; you will perform this task in another step.
94 |
95 |
96 | 7. You are now ready to start your MySQL server as a background process. To do so, from within your mysql directory, run:
97 |
98 | ./bin/mysqld_safe --defaults-file=mysql.cnf &
99 |
100 | NOTE: You *must* run this command from the mysql directory.
101 | You should see something similar to the following:
102 |
103 | [1] 67786% Starting mysqld daemon with databases from home/youraccountname/mysql/data
104 |
105 | 8. At this point your MySQL password is still blank. Use the following command to set a new
106 | root password:
107 |
108 | ./bin/mysqladmin --defaults-file=mysql.cnf -u root password "yourpasswordhere"
109 |
110 | NOTE: DO NOT FORGET THIS PASSWORD. write it down someplace that you won't forget.
111 |
112 | --------------------------------------------------
113 | --------------------------------------------------
114 | III. Create a New Database and User Account
115 |
116 | 1. Log in to your mysql server as root. If you are logging in to an existing MySQL server,
117 | use any existing account that can create a user and grant privileges:
118 |
119 | ./bin/mysql --defaults-file=mysql.cnf -u root -p
120 |
121 | Enter the root password you set in Step II.8 when prompted.
122 |
123 | 2. Once logged in as root, create the database and user (schema) that you will use for OrthoMCL
124 | (we use orthomcl as an example here), and grant the user account the necessary privileges:
125 |
126 | mysql> CREATE DATABASE orthomcl;
127 |
128 | mysql> GRANT SELECT,INSERT,UPDATE,DELETE,CREATE VIEW,CREATE, INDEX, DROP on orthomcl.* TO orthomcl@localhost;
129 |
130 | mysql> set password for orthomcl@localhost = password('yourpassword');
131 |
132 | NOTE: DO NOT FORGET THIS PASSWORD. write it down someplace that you won't forget
133 |
134 | NOTE: if you want to play with the data in the database, you can get into it like this:
135 | ./bin/mysql --defaults-file=mysql.cnf -u orthomcl
136 |
137 | -------------------------------------------------
138 | -------------------------------------------------
139 | IV. Installing the Required PERL Modules for MySQL
140 |
141 | 1. Check to see if the DBI and DBD::mysql PERL modules
142 | are installed:
143 |
144 | $ perl -MDBI -e 1
145 | $ perl -MDBD::mysql -e 1
146 |
147 | - If you receive no output, then the module *is* installed and you
148 | can continue to section V. However, if you receive an error message for
149 | either, continue to step 2 and install the missing module(s).
150 |
151 | - It is often the case that the DBI module is installed, but DBD::mysql
152 | is not.
153 |
154 |
155 | 2. If you have root access, the easiest way to install Perl modules on Unix/Linux
156 | is to perform a system-wide install using CPAN:
157 |
158 | $ perl -MCPAN -e shell
159 | cpan> o conf makepl_arg "mysql_config=/path_to_your_mysql_dir/bin/mysql_config"
160 | cpan> install Data::Dumper
161 | cpan> install DBI
162 | cpan> force install DBD::mysql
163 |
164 |
165 | 3. Installing modules as a standard user
166 |
167 | - Follow the steps below to install modules as a non-root user. We
168 | assume /myperl in your home directory is your custom perl directory.
169 |
170 | 1. In your home directory, create the PERL and CPAN directories, and
171 | a blank CPAN config module:
172 |
173 | $ mkdir myperl
174 | $ mkdir .cpan
175 | $ mkdir .cpan/CPAN
176 | $ echo "\$CPAN::Config = {}"> ~/.cpan/CPAN/MyConfig.pm
177 |
178 | 2. Configure your environment by adding the following to your
179 | .bash_profile file in your home directory:
180 |
181 | ######################################
182 | if [ -z "$PERL5LIB" ]
183 | then
184 | # If PERL5LIB wasn't previously defined, set it...
185 | PERL5LIB=~/myperl/lib
186 | else
187 | # ...otherwise, extend it.
188 | PERL5LIB=$PERL5LIB:~/myperl/lib
189 | fi
190 |
191 | MANPATH=$MANPATH:~/myperl/man
192 |
193 | export PERL5LIB MANPATH
194 | ######################################
195 |
196 | 3. Create the necessary directories and process your .bash_profile:
197 |
198 | $ mkdir -p ~/myperl/lib
199 | $ mkdir -p ~/myperl/man/man{1,3}
200 | $ source ~/.bash_profile
201 |
202 | 4. Confirm that your custom PERL5LIB paths have been set:
203 |
204 | $ perl -wle'print for grep /myperl/, @INC'
205 |
206 | - You should see pathing relative to your home directory. If not,
207 | repeat steps 1-3.
208 |
209 | 5. Invoke the CPAN shell and complete CPAN configuration:
210 |
211 | $ perl -MCPAN -we shell
212 |
213 | - CPAN will request that you set your config. Accepting the
214 | default (type install Data::Dumper
243 | cpan> install DBI
244 | cpan> force install DBD::mysql
245 |
246 | -------------------------------------------------
247 | -------------------------------------------------
248 | V. MySQL Server Optimization
249 |
250 | - please see the mysqlConfigurationGuide.txt document provided in the orthomcl download.
251 |
252 |
253 | -------------------------------------------------
254 | -------------------------------------------------
255 | VI. Troubleshooting Installation Issues
256 |
257 | The MySQL server logs all status and error messages in a file called
258 | yourhost.err, where yourhost is the name of your machine. The file is located
259 | in the mysql/data directory and contains useful information for debugging problems
260 | with your MySQL server.
261 |
262 | Below are some common installation issues and resolutions:
263 |
264 | (1)
265 | ISSUE: Your MySQL installation is conflicting with another install
266 |
267 | - You may be conflicting with an existing MySQL install if you see an
268 | error in your log similar to the following when running mysql_install_db,
269 | mysqladmin, or mysql:
270 |
271 | Installing MySQL system tables...
272 | 090515 13:32:49 [Warning] Can't create test file
273 | /var/lib/mysql/localhost.lower-test
274 | 090515 13:32:49 [Warning] Can't create test file
275 | /var/lib/mysql/localhost.lower-test
276 | ERROR: 1005 Can't create table 'db' (errno: 13)
277 | 090515 13:32:49 [ERROR] Aborting
278 |
279 |
280 | RESOLUTION:
281 |
282 | - A path is set incorrectly in your mysql.cnf if you see a reference to
283 | /var/lib in your error log. Check to ensure that you have correctly
284 | set the mysql_sock and port parameters in mysql.cnf.
285 |
286 | - - to see if 3307 is in use, type this command:
287 | netstat -a | grep tcp | grep 3307
288 | - if so, set port=3308 (or 3309, etc., if 3308 is already used)
289 |
290 | (2)
291 | ISSUE: Unspecified or Misconfigured mysql.cnf file
292 |
293 | - If only the first numeric line appears (you do not see a "Starting
294 | mysqld daemon..." message) when you execute ./bin/mysqld_safe, you
295 | probably entered at least one incorrect path in your mysql.cnf file, or
296 | you did not specify --defaults-file=mysql.cnf when starting MySQL.
297 |
298 |
299 | RESOLUTION:
300 |
301 | - Check to ensure that:
302 |
303 | - You specified the --defaults-file=mysql.cnf when running mysql or
304 | a mysqladmin command.
305 |
306 | - Your mysql.cnf parameters correctly reflect your MySQL
307 | installation path.
308 |
309 |
310 |
311 | -------------------------------------------------
312 | -------------------------------------------------
313 | VII. Removing Your MySQL Installation
314 |
315 | - If you wish to remove your MySQL installation, this can be performed in
316 | two simple steps:
317 |
318 | (1) Shutdown your MySQL server:
319 |
320 | cd mysql
321 | ./bin/mysqladmin --defaults-file=mysql.cnf -u root -p shutdown
322 |
323 |
324 | (2) Change to the parent directory of your MySQL installation, and remove
325 | the mysql directory and symbolic link:
326 |
327 |
328 | rm -r -f mysql-standard-5.1.34-linux-i686-glibc23
329 | rm mysql
330 |
331 |
332 | NOTE: Always use caution when using the rm -r -f command! This
333 | command deletes an entire directory structure with no request
334 | for confirmation, so be sure the correct directory is specified.
335 |
336 |
337 |
338 |
339 |
340 |
341 |
342 |
343 |
344 |
345 |
346 |
347 |
348 |
349 |
350 |
351 |
--------------------------------------------------------------------------------
/doc/OrthoMCLEngine/Main/UserGuide.txt:
--------------------------------------------------------------------------------
1 | OrthoMCL User's Guide
2 | Version 2.0
3 |
4 | UNDER CONSTRUCTION:
5 | - mysql install documentation
6 | - mcl documentation
7 |
8 | ===================================================
9 | =========== Introduction ==========================
10 | ===================================================
11 |
12 | For details on the orthomcl algorithm, please read the OrthoMCL Algorithm Document available at: http://docs.google.com/Doc?id=dd996jxg_1gsqsp6
13 |
14 | The input to OrthoMCL is a set of proteomes.
15 |
16 | The output is a set of files:
17 | pairs/
18 | potentialOrthologs.txt
19 | potentialCoorthologs.txt
20 | potentialInparalogs.txt
21 | groups.txt
22 |
23 | The files in the pairs/ directory contain pairwise relationships between proteins, and their scores. They are categorized into potential orthologs, co-orthologs and inparalogs as described in the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6). The groups.txt file contains the groups created by clustering the pairs with the MCL program.
24 |
25 | There are three overall stages:
26 | - all-v-all BLAST
27 | - the OrthoMCL Pairs program -- makes the pairs/ directory
28 | - the MCL program -- clusters the pairs to make the groups.txt file
29 |
30 | These stages are executed in a series of thirteen steps detailed below. Most simply involve running a provided program. They are broken into steps for ease of backtracking and recoverability. Most are *very simple* to run, so don't be discouraged.
31 |
32 |
33 | ===================================================
34 | ========= Benchmark Dataset ======================
35 | ===================================================
36 |
37 | In the documentation for Orthomcl we refer to a Benchmark Dataset. We tested this set extensively. It had:
38 | - 100 proteomes (across the tree of life)
39 | - 1M proteins
40 | - 500M significant similarities (BLAST hits)
41 |
42 | We base hardware requirements and time estimates on this benchmark dataset. The most significant predictor of resource/time requirements is the number of significant similarities. Even so, as this number changes, resource requirements will change non-linearly.
43 |
44 |
45 | ===================================================
46 | ========= Requirements ============================
47 | ===================================================
48 |
49 | (1) UNIX
50 | - The OrthoMCL Pairs program has only been tested on UNIX
51 | - The MCL program is UNIX compatible only
52 |
53 | (2) BLAST
54 | - we recommend NCBI BLAST for two reasons
55 | a) theoretically: XXXXXXX
56 | b) practically: NCBI BLAST supports a tab delimited output which we provide parsers for. See Step 7 below.
57 | - for large datasets (e.g. 1M proteins) all-v-all BLAST will likely require a compute cluster. Otherwise, it could run for weeks or months.
58 |
59 | (3) Relational Database
60 | - The OrthoMCL Pairs program runs in a relational database. Supported vendors are:
61 | - Oracle
62 | - MySql
63 | - If you don't already have one of these installed, install MySql, which can be done for free and without significant systems administration support. (Follow the instructions we provide below.)
64 |
65 | We realize that it is a little inconvenient to require a relational database. However, using a relational database as the core technology for orthomclPairs provides speed, robustness and scalability that would have been very hard to achieve without it.
66 |
67 | (4) Hardware
68 | - the hardware requirements vary dramatically with the size of your dataset.
69 | - for the Benchmark Dataset, we recommend:
70 | - memory: at least 4G
71 | - disk: 100G free space.
72 | - you can estimate your disk space needs more accurately when you have completed Step 8 below. You will need at least 5 times the size of the blast results file produced in that step. 90% of the disk space required is to hold that file, load it into the database and index it in the database.
73 |
74 | (5) Perl
75 | - standard perl
76 | - DBI libraries
77 |
78 | (6) MCL program
79 | - more details here...
80 |
81 | (7) Time
82 | - The Benchmark Dataset took:
83 | - 3 days to run all-v-all BLAST on a 500 cpu compute cluster.
84 | - 16 hours for the orthomclPairs processing to find pairs
85 | - 2 hours for MCL to find the groups
86 |
87 |
88 |
89 | ===================================================
90 | =========== Overview of steps =====================
91 | ===================================================
92 |
93 | This is an overview of the thirteen steps to run orthomcl. Details are in the next sections.
94 |
95 | All programs except mysql and mcl are provided as part of the OrthoMCL download. The provided programs all begin with 'orthomcl' and will print help if called with no arguments.
96 |
97 | (1) install or get access to a supported relational database. If using MySql, certain configurations are required, so it may involve working with your MySql administrator or installing your own MySql. See the mysqlInstallGuide.txt document provided with the orthomcl software.
98 |
99 | (2) download and install the mcl program according to provided instructions.
100 |
101 | (3) install and configure the OrthoMCL suite of programs.
102 |
103 | (4) run orthomclInstallSchema to install the required schema into the database.
104 |
105 | (5) run orthomclAdjustFasta (or your own simple script) to generate protein fasta files in the required format.
106 |
107 | (6) run orthomclFilterFasta to filter away poor quality proteins, and optionally remove alternative proteins. Creates a single large goodProteins.fasta file (and a poorProteins.fasta file)
108 |
109 | (7) run all-v-all NCBI BLAST on goodProteins.fasta (output format is tab delimited text). We do not provide documentation or support for this step.
110 |
111 | (8) run orthomclBlastParser on the NCBI BLAST tab output to create a file of similarities in the required format
112 |
113 | (9) run orthomclLoadBlast to load the output of orthomclBlastParser into the database.
114 |
115 | (10) run the orthomclPairs program to compute pairwise relationships.
116 |
117 | (11) run the orthomclDumpPairsFiles program to dump the pairs/ directory from the database
118 |
119 | (12) run the mcl program on the mcl_input.txt file created in Step 11.
120 |
121 | (13) run orthomclMclToGroups to convert mcl output to groups.txt
122 |
123 | We recommend you save the output of each step so that you can easily redo it if things go wrong.
124 |
125 |
126 | ===================================================
127 | ============ Steps in detail ======================
128 | ===================================================
129 |
130 | ========== Step 1: Install and configure the relational database ============
131 | If you are using Oracle, please see the included oracleConfigurationGuide.txt
132 |
133 | If you are using MySQL, please see the included mysqlConfigurationGuide.txt
134 |
135 | If you do not have either, please see the mysqlInstallGuide.txt to install your own mysql.
136 |
137 |
138 | ========== Step 2: install mcl ==========
139 | Get the latest software from http://www.micans.org/mcl/src/mcl-latest.tar.gz
140 |
141 | Follow the install instructions.
142 |
143 | MORE HERE SOON...
144 |
145 |
146 | ========== Step 3: install and configure OrthoMCL programs ========
147 | Input:
148 | - orthomclSoftware.tar
149 | Output:
150 | - directory of executable programs
151 | - home directory for your run of orthomcl
152 | - orthomcl.config file
153 |
154 | Use this command to unpack the software:
155 | tar -xf orthomclSoftware.tar
156 |
157 | The result will be this:
158 | orthomclSoftware/
159 | bin/
160 | ...
161 | doc/
162 | UserGuide.txt
163 | orthomcl.config.template
164 | lib/
165 |
166 | The bin/ directory has a set of programs. To run the programs you will need to either:
167 | a) include the orthomclSoftware/bin directory in your PATH
168 | b) call the programs using their full directory path
169 |
170 | Make a directory to hold the data and results for your run of orthomcl. In this document we will call that directory "my_orthomcl_dir".
171 |
172 | In the orthomclSoftware/doc/OrthoMCLEngine/Main directory is a file called orthomcl.config.template. Copy that file to my_orthomcl_dir/orthomcl.config, and edit the new file according to the following instructions.
173 |
174 | In the examples below, it is assumed that your MySql server has a database called 'orthomcl'. You can either create one (go into the server and run 'create database orthomcl') or use an existing database, and change the dbConnectString accordingly.
175 |
176 | dbVendor=
177 | - either 'oracle' or 'mysql'
178 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs
179 | dbConnectString=
180 | - the string required by Perl DBI to find the database.
181 | - examples are:
182 | dbi:Oracle:orthomcl (for an oracle database with service name 'orthomcl')
183 | dbi:mysql:orthomcl (for a centrally installed mysql server with a database called 'orthomcl')
184 | dbi:mysql:orthomcl:localhost:3307 (for a user installed mysql server on port 3307 with a database called 'orthomcl')
185 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles
186 | dbLogin=
187 | - your database login name
188 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles
189 | dbPassword=
190 | - your database password
191 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs, orthomclDumpPairsFiles
192 | similarSequencesTable=
193 | - the name to give the table that will be loaded with blast results by orthomclLoadBlast. This is configurable for your flexibility. It doesn't matter what you call it.
194 | - used by orthomclInstallSchema, orthomclLoadBlast, orthomclPairs
195 | orthologTable=
196 | - the name of the table that will hold potential ortholog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results.
197 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles
198 | inParalogTable=InParalog
199 | - the name of the table that will hold potential inparalog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results.
200 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles
201 | coOrthologTable=CoOrtholog
202 | - the name of the table that will hold potential coortholog pairs. This is configurable so that you can run orthomclPairs multiple times, and compare results.
203 | - used by orthomclInstallSchema, orthomclPairs, orthomclDumpPairsFiles
204 | interTaxonMatchView=InterTaxonMatch
205 | percentMatchCutoff=
206 | - blast similarities with percent match less than this value are ignored.
207 | - used by orthomclPairs
208 | evalueExponentCutoff=
209 | - blast similarities with evalue Exponents greater than this value are ignored.
210 | - used by orthomclPairs
211 | oracleIndexTblSpc=
212 | - optional table space to house all oracle indexes, if required by your oracle server. default is blank.
213 |
214 |
215 | ========== Step 4: orthomclInstallSchema ========
216 | Input:
217 | - database
218 | Output:
219 | - database with schema installed
220 |
221 | Run the orthomclInstallSchema program to install the schema. (Run the program with no arguments to get help. This is true of all following orthomcl programs.)
222 |
223 | Benchmark time: < 1 min
224 |
225 |
226 | ========== Step 5: orthomclAdjustFasta ==========
227 | Input:
228 | - fasta files as acquired from the genome resource.
229 | Output:
230 | - the my_orthomcl_dir/compliantFasta/ directory of orthomcl-compliant fasta files (see Step 6)
231 |
232 | Use orthomclAdjustFasta to produce a compliant file from any input file that conforms to the following pattern (for other files, provide your own script to produce compliant fasta files):
233 | - has one or more fields that are separated by white space or the '|' character (optionally surrounded by white space)
234 | - has the unique ID in the same field of every protein.
235 |
236 | First, for any organism that has multiple protein fasta files, combine them all into one single proteome fasta file
237 |
238 | Then, create an empty my_orthomcl_dir/compliantFasta/ directory, and change to that directory. Run orthomclAdjustFasta once for each input proteome fasta file. It will produce a compliant file in the new directory. Check each file to ensure that the proteins all have proper IDs.
239 |
240 | Benchmark time: < 1 min per genome
241 |
242 |
243 | ======== Step 6: orthomclFilterFasta ===========
244 | Input:
245 | - my_orthomcl_dir/compliantFasta/
246 | - optionally a gene->protein mapping file
247 | Output:
248 | - my_orthomcl_dir/goodProteins.fasta
249 | - my_orthomcl_dir/poorProteins.fasta
250 | - report of suspicious proteomes (> 10% poor proteins)
251 |
252 | This step produces a single goodProteins.fasta file to run BLAST on. It filters away poor-quality sequences (placing them in poorProteins.fasta). The filter is based on length and percent stop codons. You can adjust these values.
253 |
254 | The input requirements are:
255 | 1) a compliantFasta/ directory which contains all and only the proteome .fasta files, one file per proteome.
256 | 2) each .fasta file must have a name in the form 'xxxx.fasta' where xxxx is a three or four letter unique taxon code. For example: hsa.fasta or eco.fasta
257 | 3) each protein in those files must have a definition line in the following format:
258 | >xxxx|yyyyyyyy
259 | where xxxx is the three or four letter taxon code and yyyyyyyy is a sequence identifier unique within that taxon.
260 |
261 | Change dir to my_orthomcl_dir/ and run orthomclFilterFasta.
262 |
263 | Benchmark time: 5 min
264 |
265 |
266 | ========= Step 7: All-v-all BLAST =========
267 | Input:
268 | - goodProteins.fasta
269 | Output:
270 | - your_blast_results_in_tab_format
271 |
272 | You must run your own BLAST. For large datasets you should consider gaining access to a compute cluster.
273 |
274 | We expect you to:
275 | - use NCBI BLAST
276 | - run with the -m 8 option to provide tab delimited output required by Step 8
277 | - see the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6) for important details about other BLAST arguments
278 |
279 | If you are a power user you can deviate from this, so long as you can ultimately provide output in exactly the format provided by NCBI BLAST using the -m 8 option, and expected by Step 8.
280 |
281 | If you are a super-power user you can deviate from that, and also skip Step 8. But you must be able to provide the exact format file created by that step as expected by Step 9. The tricky part is computing percent match.
282 |
283 | Time estimate: highly dependent on your data and hardware
284 |
285 | ========= Step 8: orthomclBlastParser ========
286 | Input:
287 | - your_blast_results_in_tab_format
288 | - my_orthomcl_dir/compliantFasta/
289 | Output:
290 | - my_orthomcl_dir/similarSequences.txt
291 |
292 | This step parses NCBI BLAST -m 8 output into the format that can be loaded into the orthomcl database.
293 |
294 | Use the orthomclBlastParser program for this. In addition to formatting, it computes the percent match of each hit, which is tricky (see the perl code if you are a super-power user.)
295 |
296 | orthomclBlastParser my_blast_results compliantFasta >> similarSequences.txt
297 |
298 | IMPORTANT NOTE: the size of this file determines the disk space required by the relational database. You will need 5x the size of this file. Please see oracleConfigurationGuide.txt or mysqlConfigurationGuide.txt now that you know the size of this file.
299 |
300 | Benchmark time: 10 min
301 |
302 |
303 | ========= Step 9: orthomclLoadBlast ===========
304 | Input:
305 | - similarSequences.txt
306 | Output:
307 | - SimilarSequences table in the database
308 |
309 | This step loads the BLAST results into the orthomcl database.
310 |
311 | Use the orthomclLoadBlast program for this.
312 |
313 | Benchmark time: 4 hours
314 |
315 |
316 | ========= Step 10: orthomclPairs =========
317 | Input:
318 | - SimilarSequences table in the database
319 | Output:
320 | - PotentialOrthologs table
321 | - PotentialInParalogs table
322 | - PotentialCoOrthologs table
323 |
324 | This is a computationally major step that finds protein pairs. It executes the algorithm described in the OrthoMCL Algorithm Document (docs.google.com/Doc?id=dd996jxg_1gsqsp6), using a relational database. The program proceeds through a series of internal steps, each creating an intermediate database table or index. There are about 20 such tables created. Finally, it populates the output tables.
325 |
326 | The cleanup= option allows you to control the cleaning up of the intermediary tables. The 'yes' option drops the intermediary tables once they are no longer needed. The 'no' option keeps the intermediary tables in the database. In total, they are expected to be about 50 percent of the SimilarSequences table. They are useful mostly for power users or developers who would like to query them. They can be removed afterwards with the 'only' or 'all' options. The latter also removes the final tables, and should only be done after Step 11 below has dumped them to files.
327 |
328 | The startAfter= option allows you to pick up where the program left off, if it stops for any reason. Look in the log to find the last completed step, and use its tag as the value for startAfter=
329 |
330 | Because this program will run for many hours, we recommend you run it using the UNIX 'screen' program, so that it does not abort in the middle. (If it does, use startAfter=).
331 |
332 | Benchmark time: 16 hours
333 |
334 |
335 | ========== Step 11: orthomclDumpPairsFiles ========
336 | Input:
337 | - database with populated pairs tables
338 | Output:
339 | - pairs/ directory
340 | - mclInput file
341 |
342 | Run the orthomclDumpPairsFiles program.
343 |
344 | Benchmark time: 5 min
345 |
346 |
347 | ========== Step 12: mcl ========
348 | Input:
349 | - mclInput file
350 | Output:
351 | - mclOutput file
352 |
353 | mcl my_orthomcl_dir/mclInput --abc -I 1.5 -o my_orthomcl_dir/mclOutput
354 |
355 | Benchmark time: 3 hours
356 |
357 |
358 | ========== Step 13: orthomclMclToGroups ==========
359 | Input:
360 | - mclOutput file
361 | Output:
362 | - groups.txt
363 |
364 | Change to my_orthomcl_dir and run:
365 | orthomclMclToGroups my_prefix 1000 < mclOutput > groups.txt
366 |
367 | my_prefix is an arbitrary string to use as a prefix for your group IDs.
368 |
369 | Benchmark time: 1 min
370 |
371 |
372 |
373 |
374 |
375 |
--------------------------------------------------------------------------------
/bin/orthomclPairs:
--------------------------------------------------------------------------------
1 | #!/usr/bin/perl
2 |
3 | use DBI;
4 | use FindBin;
5 | use lib "$FindBin::Bin/../lib/perl";
6 | use OrthoMCLEngine::Main::Base;
7 | use strict;
8 |
9 |
10 | my $debug=0; # when true, runSql writes the text of each SQL statement to the log
11 | # Positional command-line arguments.
12 | my $configFile = $ARGV[0];
13 | my $logFile = $ARGV[1];
14 | my $clean = $ARGV[2]; # expected form: cleanup=(yes|no|only|all)
15 | my $restart = $ARGV[3]; # optional: startAfter=TAG, or a taxonFilter= argument given in this position
16 | my $taxonFilter = $ARGV[4]; # optional: taxonFilter=TAXON
17 | # @steps below is ordered: an entry's position gives its step number, and each entry is [tag, optional array ref of cleanup SQL statements].
18 | my $stepCount = 1; # NOTE(review): not referenced in the visible part of this script -- confirm before removing
19 | my @steps = ( # Common
20 | ['updateMinimumEvalueExponent'],
21 | ['bestQueryTaxonScore'],
22 | ['qtscore_ix'],
23 | # Ortholog
24 | ['bestHit', ['drop table BestQueryTaxonScore']],
25 | ['best_hit_ix'],
26 | ['ortholog', ['drop table BestHit']],
27 | ['orthologTaxon'],
28 | ['orthologAvg'],
29 | ['orthologAvgIndex'],
30 | ['orthologsNormalization', ['drop table OrthologAvgScore', 'drop table OrthologTaxon', 'drop table OrthologTemp']],
31 | # InParalog
32 | ['bestInterTaxonScore'],
33 | ['bis_uids_ix'],
34 | ['uniqueSimSeqsQueryId'],
35 | ['ust_qids_ix'],
36 | ['betterHit', ['drop table BestInterTaxonScore', 'drop table UniqueSimSeqsQueryId']],
37 | ['better_hit_ix'],
38 | ['inParalog', ['drop table BetterHit']],
39 | ['inParalogTaxonAvg'],
40 | ['orthologUniqueId'],
41 | ['orthologUniqueIdIndex'],
42 | ['inParalogOrthologTaxonAvg', ['drop table OrthologUniqueId']],
43 | ['inParalogAvg',['drop table InParalogTaxonAvg', 'drop table InParalogOrthologTaxonAvg']],
44 | ['inParalogAvgIndex'],
45 | ['inParalogsNormalization', ['drop table InParalogAvgScore', 'drop table InParalogTemp']],
46 | # CoOrtholog
47 | ['inParalog2Way'],
48 | ['in2a_ix'],
49 | ['in2b_ix'],
50 | ['ortholog2Way'],
51 | ['ortholog2WayIndex'],
52 | ['inParalogOrthologInParalog'],
53 | ['inParalogOrtholog'],
54 | ['coOrthologCandidate', ['drop table Ortholog2Way', 'drop table InParalog2Way', 'drop table InParalogOrthologInParalog', 'drop table InParalogOrtholog']],
55 | ['coOrthologNotOrtholog', ['drop table CoOrthologCandidate']],
56 | ['coOrthologNotOrthologIndex'],
57 | ['coOrtholog', ['drop table CoOrthologNotOrtholog']],
58 | ['coOrthologTaxon'],
59 | ['coOrthologAvg'],
60 | ['coOrthologAvgIndex'],
61 | ['coOrthologsNormalization', ['drop table CoOrthologAvgScore', 'drop table CoOrthologTaxon', 'drop table CoOrthologTemp']],
62 | ['cleanall', ['truncate table InParalog', 'truncate table Ortholog', 'truncate table CoOrtholog']],
63 | );
64 |
65 | my $stepsHash; # hash ref: step tag => 1-based step number (position in @steps)
66 | my $cleanHash; # hash ref: step tag => array ref of cleanup SQL, only for steps that list any
67 | for (my $i=0; $i<scalar(@steps); $i++) {
68 |   $stepsHash->{$steps[$i]->[0]} = $i+1;
69 |   $cleanHash->{$steps[$i]->[0]} = $steps[$i]->[1] if $steps[$i]->[1];
70 | }
71 |
72 | &usage() unless $configFile; # usage() is defined outside this view; presumably prints help and exits -- TODO confirm
73 | &usage() unless $logFile;
74 | &usage() unless $clean =~ /cleanup=(yes|no|only|all)/;
75 | # Keep only the captured cleanup mode (yes, no, only or all); $1 comes from the match on the line above.
76 | $clean = $1;
77 | # The fourth argument is either startAfter=TAG or, when no restart is wanted, the taxonFilter= argument itself.
78 | my $skipPast; # step number of the startAfter= tag; runSql skips every step whose number is <= this
79 | if ($restart) {
80 | if ($restart =~ /taxonFilter=/) {
81 | $taxonFilter = $restart;
82 | } else {
83 | usage() unless $restart =~ /startAfter=(.*)/;
84 | $skipPast = $stepsHash->{$1}; # undefined when the tag is not one of the @steps tags
85 | die "invalid restart arg $restart" unless $skipPast;
86 | }
87 | }
88 | # SQL fragments that exclude one taxon on both the query and subject side; they stay empty when no taxonFilter= was given.
89 | my $andTaxonFilter = "";
90 | my $whereTaxonFilter = "";
91 | my $taxonFilterTaxon;
92 | if ($taxonFilter) {
93 | die "illegal argument '$taxonFilter'\n" unless $taxonFilter =~ /taxonFilter=(.*)/;
94 | $taxonFilterTaxon = $1; # interpolated into SQL text as a quoted literal, without escaping
95 | my $subjFilter = "and s.subject_taxon_id != '$taxonFilterTaxon'";
96 | $andTaxonFilter = "and s.query_taxon_id != '$taxonFilterTaxon' $subjFilter";
97 | $whereTaxonFilter = "where s.query_taxon_id != '$taxonFilterTaxon' $subjFilter";
98 | }
99 |
100 |
101 | open (LOGFILE, '>>', $logFile) || die "Can't open log file '$logFile': $!\n";
102 | my $oldfh = select(LOGFILE); $| = 1; select($oldfh); # flush print buffer
103 | # The log is opened for append with three-argument open, so the file name is never parsed for mode characters; each run starts with a banner.
104 | print LOGFILE "\n\n============================================================================================\n";
105 | print LOGFILE localtime(). " orthomclPairs " . join(' ', @ARGV) . "\n";
106 | print LOGFILE "=============================================================================================\n\n";
107 | # The Base object receives the config file path and the log handle (as a glob); presumably it holds the database connection -- confirm in Base.pm.
108 | my $base = OrthoMCLEngine::Main::Base->new($configFile, *LOGFILE);
109 | my $dbh = $base->getDbh();
110 |
111 | my $sst = $base->getConfig("similarSequencesTable");
112 | # Vendor-specific SQL fragments: NOLOGGING only when dbVendor is oracle, STRAIGHT_JOIN only when it is not.
113 | my $oracleNoLogging = $base->getConfig("dbVendor") eq 'oracle'? " NOLOGGING" : "";
114 | my $straightJoin = $base->getConfig("dbVendor") eq 'oracle'? "" : "STRAIGHT_JOIN";
115 | # Run the pipeline stages in order; each stage issues its steps through runSql.
116 | commonTempTables();
117 |
118 | orthologs();
119 |
120 | inparalogs();
121 |
122 | coorthologs();
123 |
124 | clean('cleanall') if $clean eq 'all'; # clean() is defined outside this view
125 |
126 | print LOGFILE "\nDone\n";
127 |
128 |
129 |
130 | ################################################################################
131 | ############################### Common tables #################################
132 | ################################################################################
133 | sub commonTempTables { # issues the three "Common" steps: updateMinimumEvalueExponent, bestQueryTaxonScore, qtscore_ix
134 | print LOGFILE localtime() . " Constructing common temp tables\n"
135 | unless $clean eq 'only' || $clean eq 'all';
136 | # Takes no arguments; uses the file-level $base, $dbh, $sst, $clean, $skipPast and $oracleNoLogging.
137 | my $interTaxonMatch = $base->getConfig("interTaxonMatchView");
138 |
139 | # a little bit of a hack here. mysql can't tolerate finding the
140 | # minEvalueExp in the sql that updates the table
141 | # so, we do it as a preprocess.
142 | # must explicitly avoid the preprocess if just cleaning or if skipping
143 | my $sql = "
144 | select min(evalue_exp)
145 | from $sst
146 | where evalue_mant != 0
147 | ";
148 | my $minEvalueExp;
149 | if ($clean ne 'only' && $clean ne 'all' && !$skipPast) {
150 | print LOGFILE localtime() . " Find min evalue exp (OrthoMCL-DB V2 took ??? for this step)\n";
151 | my $stmt = $dbh->prepare("$sql") or die DBI::errstr;
152 | $stmt->execute() or die DBI::errstr;
153 | ($minEvalueExp) = $stmt->fetchrow_array();
154 | print LOGFILE localtime() . " done\n";
155 | }
156 | # Replace every 0 evalue_exp in $sst with one less than the smallest exponent found above.
157 | $sql = "
158 | update $sst
159 | set evalue_exp = ${minEvalueExp}-1
160 | where evalue_exp = 0
161 | ";
162 | runSql($sql, "updating $sst, setting 0 evalue_exp to underflow value (${minEvalueExp} - 1)",
163 | 'updateMinimumEvalueExponent', '25 min', undef);
164 | # NOTE(review): when the preprocess is skipped, $minEvalueExp is undef in the SQL text above; this relies on runSql not executing the statement in those cases -- confirm for the cleanup modes.
165 | ##########################################################################
166 | # BestQueryTaxonScore: for each (query_id, subject_taxon_id) in the $interTaxonMatch view, the lowest evalue_exp and the lowest evalue_mant among rows having that exponent.
167 | $sql = "
168 | create table BestQueryTaxonScore $oracleNoLogging as
169 | select im.query_id, im.subject_taxon_id, low_exp.evalue_exp, min(im.evalue_mant) as evalue_mant
170 | from $interTaxonMatch im,
171 | (select query_id, subject_taxon_id, min(evalue_exp) as evalue_exp
172 | from $interTaxonMatch
173 | group by query_id, subject_taxon_id) low_exp
174 | where im.query_id = low_exp.query_id
175 | and im.subject_taxon_id = low_exp.subject_taxon_id
176 | and im.evalue_exp = low_exp.evalue_exp
177 | group by im.query_id, im.subject_taxon_id, low_exp.evalue_exp
178 | ";
179 |
180 | runSql($sql, "create BestQueryTaxonScore", 'bestQueryTaxonScore', '1.5 hours', undef);
181 |
182 | ################################################################################
183 | # Unique index over all four BestQueryTaxonScore columns; the table name is passed as runSql's tableToAnalyze argument.
184 | $sql = "
185 | create unique index qtscore_ix on BestQueryTaxonScore(query_id, subject_taxon_id, evalue_exp, evalue_mant)
186 | ";
187 |
188 | runSql($sql, "create qtscore_ix index on BestQueryTaxonScore", 'qtscore_ix', '15 min', 'BestQueryTaxonScore');
189 | }
190 |
191 |
192 | ################################################################################
193 | ############################### Orthologs #####################################
194 | ################################################################################
195 | sub orthologs { # issues the "Ortholog" steps: bestHit through orthologsNormalization
196 | print LOGFILE localtime() . " Constructing ortholog tables\n"
197 | unless $clean eq 'only' || $clean eq 'all';
198 | # Takes no arguments; the two cutoffs come from the config and are interpolated into the SQL as-is.
199 | my $evalueExpThreshold = $base->getConfig("evalueExponentCutoff");
200 | my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");
201 | # BestHit: inter-taxon rows of $sst that pass both cutoffs and either have evalue_mant < 0.01 or equal the query's best score for that subject taxon (SQL "and" binds tighter than "or").
202 | my $sql = "
203 | create table BestHit $oracleNoLogging as
204 | select s.query_id, s.subject_id,
205 | s.query_taxon_id, s.subject_taxon_id,
206 | s.evalue_exp, s.evalue_mant
207 | from $sst s, BestQueryTaxonScore cutoff
208 | where s.query_id = cutoff.query_id
209 | and s.subject_taxon_id = cutoff.subject_taxon_id
210 | and s.query_taxon_id != s.subject_taxon_id
211 | and s.evalue_exp <= $evalueExpThreshold $andTaxonFilter
212 | and s.percent_match >= $percentMatchThreshold
213 | and (s.evalue_mant < 0.01
214 | or s.evalue_exp = cutoff.evalue_exp
215 | and s.evalue_mant = cutoff.evalue_mant)
216 | ";
217 |
218 | runSql($sql, "create BestHit", 'bestHit', '1.5 hours', undef);
219 |
220 | ######################################################################
221 | # At most one BestHit row per (query_id, subject_id), enforced by the unique index.
222 | $sql = "
223 | create unique index best_hit_ix on BestHit(query_id,subject_id)
224 | ";
225 |
226 | runSql($sql, "create best_hit_ix index on BestHit", 'best_hit_ix', '15 min', 'BestHit');
227 |
228 | ######################################################################
229 | # OrthologTemp: reciprocal BestHit pairs (a->b and b->a), kept once with sequence_id_a < sequence_id_b, scored as the mean of the two -log10(evalue).
230 | $sql = "
231 | create table OrthologTemp $oracleNoLogging as
232 | select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,
233 | bh1.query_taxon_id as taxon_id_a, bh1.subject_taxon_id as taxon_id_b,
234 | case -- don't try to calculate log(0) -- use rigged exponents of SimSeq
235 | when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01
236 | then (bh1.evalue_exp + bh2.evalue_exp) / -2
237 | else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
238 | (log(10, bh1.evalue_mant * bh2.evalue_mant)
239 | + bh1.evalue_exp + bh2.evalue_exp) / -2
240 | end as unnormalized_score
241 | from BestHit bh1, BestHit bh2
242 | where bh1.query_id < bh1.subject_id
243 | and bh1.query_id = bh2.subject_id
244 | and bh1.subject_id = bh2.query_id
245 | ";
246 |
247 | runSql($sql, "create OrthologTemp table", 'ortholog', '5 min', 'OrthologTemp');
248 |
249 | ######################################################################
250 | # An empty argument selects the plain (non-"Co") table names and step tags in the two shared helpers.
251 | orthologTaxonSub('');
252 |
253 | ######################################################################
254 | # Populate the configured ortholog table with unnormalized and normalized scores.
255 | normalizeOrthologsSub('', $base->getConfig("orthologTable"));
256 | }
257 |
258 |
259 | ################################################################################
260 | ############################### InParalogs ####################################
261 | ################################################################################
262 | sub inparalogs { # issues the "InParalog" steps: bestInterTaxonScore through inParalogsNormalization
263 | print LOGFILE localtime() . " Constructing inParalog tables\n"
264 | unless $clean eq 'only' || $clean eq 'all';
265 | # Takes no arguments; table names and cutoffs come from the config and are interpolated into the SQL as-is.
266 | my $inParalogTable = $base->getConfig("inParalogTable");
267 | my $orthologTable = $base->getConfig("orthologTable");
268 | my $evalueExpThreshold = $base->getConfig("evalueExponentCutoff");
269 | my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");
270 | # BestInterTaxonScore: for each query_id, the lowest evalue_exp over all its BestQueryTaxonScore rows and the lowest evalue_mant among rows with that exponent.
271 | my $sql = "
272 | create table BestInterTaxonScore $oracleNoLogging as
273 | select im.query_id, low_exp.evalue_exp, min(im.evalue_mant) as evalue_mant
274 | from BestQueryTaxonScore im,
275 | (select query_id, min(evalue_exp) as evalue_exp
276 | from BestQueryTaxonScore
277 | group by query_id) low_exp
278 | where im.query_id = low_exp.query_id
279 | and im.evalue_exp = low_exp.evalue_exp
280 | group by im.query_id, low_exp.evalue_exp
281 | ";
282 |
283 | runSql($sql, "create BestInterTaxonScore", 'bestInterTaxonScore', '5 min', undef);
284 |
285 | ###########################################################################
286 | # One BestInterTaxonScore row per query_id, enforced by the unique index.
287 | $sql = "
288 | create unique index bis_uids_ix on BestInterTaxonScore(query_id)
289 | ";
290 | # The index lives on BestInterTaxonScore, so that is the table named in the log message and passed as tableToAnalyze.
291 | runSql($sql, "create bis_uids_ix index on BestInterTaxonScore", 'bis_uids_ix', '1 min', 'BestInterTaxonScore');
292 |
293 | ###########################################################################
294 | # UniqueSimSeqsQueryId: the distinct query_ids present in $sst, after the optional taxon filter.
295 | $sql = "
296 | create table UniqueSimSeqsQueryId $oracleNoLogging as
297 | select distinct s.query_id from $sst s $whereTaxonFilter
298 | ";
299 |
300 | runSql($sql, "create UniqueSimSeqsQueryId", 'uniqueSimSeqsQueryId', '25 min', undef);
301 |
302 | ###########################################################################
303 |
304 | $sql = "
305 | create unique index ust_qids_ix on UniqueSimSeqsQueryId(query_id)
306 | ";
307 |
308 | runSql($sql, "create ust_qids_ix index on UniqueSimSeqsQueryId", 'ust_qids_ix', '1 min', 'UniqueSimSeqsQueryId');
309 |
310 |
311 | ###########################################################################
312 | # BetterHit: same-taxon hits passing the cutoffs that are at least as good as the query's best inter-taxon score (or have evalue_mant < 0.001), plus, via the union, passing same-taxon hits of queries with no BestInterTaxonScore row.
313 | $sql = "
314 | create table BetterHit $oracleNoLogging as
315 | select s.query_id, s.subject_id,
316 | s.query_taxon_id as taxon_id,
317 | s.evalue_exp, s.evalue_mant
318 | from $sst s, BestInterTaxonScore bis
319 | where s.query_id != s.subject_id $andTaxonFilter
320 | and s.query_taxon_id = s.subject_taxon_id
321 | and s.query_id = bis.query_id
322 | and s.evalue_exp <= $evalueExpThreshold
323 | and s.percent_match >= $percentMatchThreshold
324 | and (s.evalue_mant < 0.001
325 | or s.evalue_exp < bis.evalue_exp
326 | or (s.evalue_exp = bis.evalue_exp and s.evalue_mant <= bis.evalue_mant))
327 | -- . . . or Similarity for a protein with no BestInterTaxonScore
328 | -- (i.e. an intrataxon match for a protein with no intertaxon
329 | -- match in the database)
330 | union
331 | select s.query_id, s.subject_id, s.query_taxon_id as taxon_id, s.evalue_exp, s.evalue_mant
332 | from $sst s
333 | where s.query_taxon_id = s.subject_taxon_id $andTaxonFilter
334 | and s.evalue_exp <= $evalueExpThreshold
335 | and s.percent_match >= $percentMatchThreshold
336 | and s.query_id in
337 | (SELECT distinct ust.query_id
338 | from UniqueSimSeqsQueryId ust
339 | LEFT OUTER JOIN BestInterTaxonScore bis ON bis.query_id = ust.query_id
340 | WHERE bis.query_id IS NULL)
341 | ";
342 |
343 | runSql($sql, "create BetterHit table", 'betterHit', '3 hours', undef);
344 |
345 | ###########################################################################
346 |
347 | $sql = "
348 | create unique index better_hit_ix on BetterHit(query_id,subject_id)
349 | ";
350 |
351 | runSql($sql, "create better_hit_ix index on BetterHit", 'better_hit_ix', '25 min', 'BetterHit');
352 |
353 | ###########################################################################
354 | # InParalogTemp: reciprocal BetterHit pairs (a->b and b->a), kept once with sequence_id_a < sequence_id_b, scored as the mean of the two -log10(evalue).
355 | $sql = "
356 | create table InParalogTemp $oracleNoLogging as
357 | select bh1.query_id as sequence_id_a, bh1.subject_id as sequence_id_b,
358 | bh1.taxon_id,
359 | case -- don't try to calculate log(0) -- use rigged exponents of SimSeq
360 | when bh1.evalue_mant < 0.01 or bh2.evalue_mant < 0.01
361 | then (bh1.evalue_exp + bh2.evalue_exp) / -2
362 | else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
363 | (log(10, bh1.evalue_mant * bh2.evalue_mant)
364 | + bh1.evalue_exp + bh2.evalue_exp) / -2
365 | end as unnormalized_score
366 | from BetterHit bh1, BetterHit bh2
367 | where bh1.query_id < bh1.subject_id
368 | and bh1.query_id = bh2.subject_id
369 | and bh1.subject_id = bh2.query_id
370 | ";
371 |
372 | runSql($sql, "create InParalogTemp table", 'inParalog', '15 min', undef);
373 |
374 | ################################################################
375 | # InParalogTaxonAvg: average unnormalized score of all in-paralog pairs, per taxon.
376 | $sql = "
377 | create table InParalogTaxonAvg $oracleNoLogging as
378 | select avg(i.unnormalized_score) average, i.taxon_id
379 | from InParalogTemp i
380 | group by i.taxon_id
381 | ";
382 |
383 | runSql($sql, "create InParalogTaxonAvg table", 'inParalogTaxonAvg', '1 min', undef);
384 |
385 | ################################################################
386 | # OrthologUniqueId: every sequence id that appears on either side of a pair in $orthologTable.
387 | $sql = "
388 | create table OrthologUniqueId $oracleNoLogging as
389 | select distinct(sequence_id) from (
390 | select sequence_id_a as sequence_id from $orthologTable
391 | union
392 | select sequence_id_b as sequence_id from $orthologTable) i
393 | ";
394 |
395 | runSql($sql, "create OrthologUniqueId table", 'orthologUniqueId', '5 min', undef);
396 |
397 | ################################################################
398 |
399 | $sql = "create unique index ortholog_unique_id_ix on OrthologUniqueId(sequence_id)";
400 |
401 |
402 | runSql($sql, "create unique ortholog_unique_id_ix index", 'orthologUniqueIdIndex', '1 min', 'OrthologUniqueId');
403 |
404 | ################################################################
405 | # InParalogOrthologTaxonAvg: the per-taxon average restricted to in-paralog pairs where at least one member has an ortholog.
406 | $sql = "
407 | create table InParalogOrthologTaxonAvg $oracleNoLogging as
408 |
409 | select avg(i.unnormalized_score) average, i.taxon_id
410 | from InParalogTemp i
411 | where i.sequence_id_a in
412 | (select sequence_id from OrthologUniqueId)
413 | or i.sequence_id_b in
414 | (select sequence_id from OrthologUniqueId)
415 | group by i.taxon_id
416 | ";
417 |
418 | runSql($sql, "create InParalogOrthologTaxonAvg table", 'inParalogOrthologTaxonAvg', '10 min', undef);
419 |
420 | ################################################################
421 | # InParalogAvgScore: prefer the ortholog-restricted average; fall back to the all-pairs average for taxa that have none.
422 | $sql = "
423 | create table InParalogAvgScore $oracleNoLogging as
424 | select case
425 | when orth_i.average is NULL
426 | then all_i.average
427 | else orth_i.average
428 | end as avg_score,
429 | all_i.taxon_id
430 | from InParalogTaxonAvg all_i LEFT OUTER JOIN InParalogOrthologTaxonAvg orth_i
431 | ON all_i.taxon_id = orth_i.taxon_id
432 | ";
433 |
434 | runSql($sql, "create InParalogAvgScore table", 'inParalogAvg', '1 min', undef);
435 |
436 | ################################################################
437 |
438 | $sql = "create unique index inparalog_avg_ix on InParalogAvgScore(taxon_id,avg_score)";
439 |
440 |
441 | runSql($sql, "create InParalogAvgScore index", 'inParalogAvgIndex', '1 min', 'InParalogAvgScore');
442 |
443 | ################################################################
444 | # Final table: normalized_score is unnormalized_score divided by the taxon's avg_score.
445 | $sql = "
446 | insert into $inParalogTable (sequence_id_a, sequence_id_b, taxon_id, unnormalized_score, normalized_score)
447 | select it.sequence_id_a, it.sequence_id_b, it.taxon_id, it.unnormalized_score, it.unnormalized_score/a.avg_score
448 | from InParalogTemp it, InParalogAvgScore a
449 | where it.taxon_id = a.taxon_id
450 | ";
451 |
452 | runSql($sql, "populate $inParalogTable table, including normalized_score", 'inParalogsNormalization', '3 min', "$inParalogTable");
453 |
454 | ################################################################
455 | }
456 |
457 | ################################################################################
458 | ############################### CoOrthologs ###################################
459 | ################################################################################
460 | sub coorthologs { # issues the "CoOrtholog" steps: inParalog2Way through coOrthologsNormalization
461 | print LOGFILE localtime() . " Constructing coOrtholog tables\n"
462 | unless $clean eq 'only' || $clean eq 'all';
463 | # Takes no arguments; table names and cutoffs come from the config and are interpolated into the SQL as-is.
464 | my $inParalogTable = $base->getConfig("inParalogTable");
465 | my $orthologTable = $base->getConfig("orthologTable");
466 | my $coOrthologTable = $base->getConfig("coOrthologTable"); # NOTE(review): not used below; the name is fetched again for normalizeOrthologsSub
467 | my $evalueExpThreshold = $base->getConfig("evalueExponentCutoff");
468 | my $percentMatchThreshold = $base->getConfig("percentMatchCutoff");
469 | # InParalog2Way: every pair of $inParalogTable in both orientations.
470 | my $sql = "
471 | create table InParalog2Way $oracleNoLogging as
472 | select sequence_id_a, sequence_id_b from $inParalogTable
473 | union
474 | select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from $inParalogTable
475 | ";
476 |
477 | runSql($sql, "create InParalog2Way", 'inParalog2Way', '1.5 hours', undef);
478 |
479 | ######################################################################
480 | # Two unique indexes on InParalog2Way, one leading with each column.
481 | $sql = "
482 | create unique index in2a_ix on InParalog2Way(sequence_id_a, sequence_id_b)
483 | ";
484 |
485 | runSql($sql, "index in2a_ix", 'in2a_ix', '45 min', undef);
486 |
487 | ######################################################################
488 |
489 | $sql = "
490 | create unique index in2b_ix on InParalog2Way(sequence_id_b, sequence_id_a)
491 | ";
492 |
493 | runSql($sql, "index in2b_ix", 'in2b_ix', '45 min', 'InParalog2Way');
494 |
495 | ######################################################################
496 | # Ortholog2Way: every pair of $orthologTable in both orientations.
497 | $sql = "
498 | create table Ortholog2Way $oracleNoLogging as
499 | -- symmetric closure of Ortholog
500 | select sequence_id_a, sequence_id_b from $orthologTable
501 | union
502 | select sequence_id_b as sequence_id_a, sequence_id_a as sequence_id_b from $orthologTable
503 | ";
504 |
505 | runSql($sql, "create Ortholog2Way", 'ortholog2Way', '1 hours', undef);
506 |
507 | ######################################################################
508 |
509 | $sql = "
510 | create unique index ortholog2way_ix on Ortholog2Way(sequence_id_a, sequence_id_b)
511 | ";
512 |
513 | runSql($sql, "index ortholog2way_ix", 'ortholog2WayIndex', '5 min', 'Ortholog2Way');
514 |
515 | ######################################################################
516 | # InParalogOrthologInParalog: pairs (x, y) linked by in-paralog(x, a), ortholog(a, b), in-paralog(b, y).
517 | $sql = "
518 | create table InParalogOrthologInParalog $oracleNoLogging as
519 | select ip1.sequence_id_a, ip2.sequence_id_b
520 | from Ortholog2Way o, InParalog2Way ip2, InParalog2Way ip1
521 | where ip1.sequence_id_b = o.sequence_id_a
522 | and o.sequence_id_b = ip2.sequence_id_a
523 | ";
524 |
525 | runSql($sql, "create InParalogOrthologInParalog", 'inParalogOrthologInParalog', '20 min', undef);
526 |
527 | ##################################################################
528 | # InParalogOrtholog: pairs (x, b) linked by in-paralog(x, a), ortholog(a, b).
529 | $sql = "
530 | create table InParalogOrtholog $oracleNoLogging as
531 | select ip.sequence_id_a, o.sequence_id_b
532 | from InParalog2Way ip, Ortholog2Way o
533 | where ip.sequence_id_b = o.sequence_id_a
534 | ";
535 |
536 | runSql($sql, "create InParalogOrtholog", 'inParalogOrtholog', '15 min', undef);
537 |
538 | ##################################################################
539 | # CoOrthologCandidate: union of the two tables above, each pair ordered with least()/greatest() and de-duplicated.
540 | $sql = "
541 | create table CoOrthologCandidate $oracleNoLogging as
542 | select distinct
543 | least(sequence_id_a, sequence_id_b) as sequence_id_a,
544 | greatest(sequence_id_a, sequence_id_b) as sequence_id_b
545 | from (select sequence_id_a, sequence_id_b from InParalogOrthologInParalog
546 | union
547 | select sequence_id_a, sequence_id_b from InParalogOrtholog) t
548 | ";
549 |
550 | runSql($sql, "create CoOrthologCandidate", 'coOrthologCandidate', '1 hour', undef);
551 |
552 | ######################################################################
553 | # CoOrthologNotOrtholog: candidates with no matching row in $orthologTable (left outer join filtered on IS NULL).
554 | $sql = "
555 | create table CoOrthologNotOrtholog $oracleNoLogging as
556 | SELECT cc.sequence_id_a, cc.sequence_id_b
557 | FROM CoOrthologCandidate cc
558 | LEFT OUTER JOIN $orthologTable o
559 | ON cc.sequence_id_a = o.sequence_id_a
560 | AND cc.sequence_id_b = o.sequence_id_b
561 | WHERE o.sequence_id_a IS NULL
562 | ";
563 |
564 | runSql($sql, "create CoOrthologNotOrtholog table", 'coOrthologNotOrtholog', '10 min', undef);
565 |
566 |
567 | #####################################################################
568 |
569 | $sql = "
570 | create index coortholog_not_ortholog_ix on CoOrthologNotOrtholog(sequence_id_a,sequence_id_b)
571 | ";
572 |
573 | runSql($sql, "index coortholog_not_ortholog_ix", 'coOrthologNotOrthologIndex', '1 min', 'CoOrthologNotOrtholog');
574 |
575 |
576 | ######################################################################
577 | # Optional taxon filter for both similarity rows; $tf stays undefined (interpolating as an empty string) when no taxonFilter= was given.
578 | my $tf;
579 | if ($taxonFilterTaxon) {
580 | $tf = "and ab.query_taxon_id != '$taxonFilterTaxon' and ab.subject_taxon_id != '$taxonFilterTaxon' and ba.query_taxon_id != '$taxonFilterTaxon' and ba.subject_taxon_id != '$taxonFilterTaxon'";
581 | }
582 | # CoOrthologTemp: score each remaining candidate from its two reciprocal $sst rows (a->b and b->a), both of which must pass the cutoffs.
583 | $sql = "
584 | create table CoOrthologTemp $oracleNoLogging as
585 | select candidate.sequence_id_a, candidate.sequence_id_b,
586 | ab.query_taxon_id as taxon_id_a, ab.subject_taxon_id as taxon_id_b,
587 | case -- in case of 0 evalue, use rigged exponent
588 | when ab.evalue_mant < 0.00001 or ba.evalue_mant < 0.00001
589 | then (ab.evalue_exp + ba.evalue_exp) / -2
590 | else -- score = ( -log10(evalue1) - log10(evalue2) ) / 2
591 | (log(10, ab.evalue_mant * ba.evalue_mant)
592 | + ab.evalue_exp + ba.evalue_exp) / -2
593 | end as unnormalized_score
594 | from $sst ab, $sst ba, CoOrthologNotOrtholog candidate
595 | where ab.query_id = candidate.sequence_id_a $tf
596 | and ab.subject_id = candidate.sequence_id_b
597 | and ab.evalue_exp <= $evalueExpThreshold
598 | and ab.percent_match >= $percentMatchThreshold
599 | and ba.query_id = candidate.sequence_id_b
600 | and ba.subject_id = candidate.sequence_id_a
601 | and ba.evalue_exp <= $evalueExpThreshold
602 | and ba.percent_match >= $percentMatchThreshold
603 | ";
604 |
605 | runSql($sql, "create CoOrthologTemp table", 'coOrtholog', '2 hours', undef);
606 |
607 | ######################################################################
608 | # Any true argument selects the "Co" table names and coO... step tags in the two shared helpers.
609 | orthologTaxonSub('co');
610 |
611 | ######################################################################
612 | # Populate the configured co-ortholog table with unnormalized and normalized scores.
613 | normalizeOrthologsSub("Co", $base->getConfig("coOrthologTable"));
614 | }
615 |
616 |
617 | sub orthologTaxonSub {
618 |   # Creates OrthologTaxon (or CoOrthologTaxon when $co is true) from the matching ...Temp table, storing each pair's two taxon ids as smaller/bigger.
619 |   my ($co) = @_;
620 |   my ($tablePrefix, $tagPrefix) = $co ? ("Co", "coO") : ("", "o");
621 |   # $tablePrefix goes into table names; $tagPrefix completes the step tag (orthologTaxon or coOrthologTaxon).
622 |
623 |   my $createTaxonTable = "create table ${tablePrefix}OrthologTaxon $oracleNoLogging as
624 | select case
625 | when taxon_id_a < taxon_id_b
626 | then taxon_id_a
627 | else taxon_id_b
628 | end as smaller_tax_id,
629 | case
630 | when taxon_id_a < taxon_id_b
631 | then taxon_id_b
632 | else taxon_id_a
633 | end as bigger_tax_id,
634 | unnormalized_score
635 | from ${tablePrefix}OrthologTemp";
636 |
637 |   runSql($createTaxonTable, "create ${tablePrefix}OrthologTaxon table", "${tagPrefix}rthologTaxon", '1 min', undef);
638 | }
639 |
640 | sub normalizeOrthologsSub {
641 |   my ($co, $orthologTable) = @_;
642 |   # Averages scores per taxon pair, indexes the averages, then fills $orthologTable with unnormalized and normalized scores; a true $co selects the "Co" tables and coO... step tags.
643 |   my ($tablePrefix, $tagPrefix) = $co ? ("Co", "coO") : ("", "o");
644 |
645 |   # Step 1: average unnormalized score for each (smaller_tax_id, bigger_tax_id) pair.
646 |   my $avgTableSql = "
647 | create table ${tablePrefix}OrthologAvgScore $oracleNoLogging as
648 | select smaller_tax_id, bigger_tax_id, avg(unnormalized_score) avg_score
649 | from ${tablePrefix}OrthologTaxon
650 | group by smaller_tax_id, bigger_tax_id
651 | ";
652 |
653 |   runSql($avgTableSql, "create ${tablePrefix}OrthologAvgScore table", "${tagPrefix}rthologAvg", '1 min', undef);
654 |
655 |   # Step 2: unique index on the averages table.
656 |
657 |   my $avgIndexSql = "create unique index ${tagPrefix}orthoAvg_ix on ${tablePrefix}OrthologAvgScore(smaller_tax_id,bigger_tax_id,avg_score)";
658 |
659 |   runSql($avgIndexSql, "create ${tablePrefix}OrthologAvgScore index", "${tagPrefix}rthologAvgIndex", '1 min', "${tablePrefix}OrthologAvgScore");
660 |
661 |   # Step 3: copy the temp rows into the final table, dividing each score by its taxon pair's average.
662 |
663 |   my $populateSql = "
664 | insert into $orthologTable (sequence_id_a, sequence_id_b, taxon_id_a, taxon_id_b, unnormalized_score, normalized_score)
665 | select ot.sequence_id_a, ot.sequence_id_b, ot.taxon_id_a, ot.taxon_id_b, ot.unnormalized_score, ot.unnormalized_score/a.avg_score
666 | from ${tablePrefix}OrthologTemp ot, ${tablePrefix}OrthologAvgScore a
667 | where least(ot.taxon_id_a, ot.taxon_id_b) = a.smaller_tax_id
668 | and greatest(ot.taxon_id_a, ot.taxon_id_b) = a.bigger_tax_id
669 | ";
670 |
671 |   runSql($populateSql, "populate $orthologTable table, including normalized_score", "${tagPrefix}rthologsNormalization", '2 min', "$orthologTable");
672 | }
673 |
# Run one tagged step: execute its SQL, optionally analyze a table, log the
# elapsed time, then run the step's cleanup statements.
#
#   $sql            - SQL statement to execute
#   $msg            - description of the step written to the log
#   $tag            - step name; must have a (true) entry in $stepsHash
#   $sampleTime     - benchmark duration, only echoed in the log message
#   $tableToAnalyze - table handed to analyzeStats() once the statement has
#                     run; pass undef/false to skip the analyze
#
# Behaviour driven by file-level settings:
#   - $debug true: the SQL is written to the log first;
#   - step number <= $skipPast: the step is logged as skipped and nothing
#     else happens (no SQL, no cleanup);
#   - $clean is 'only' or 'all': the SQL is not executed, only clean($tag);
#   - $clean is 'no': the SQL is executed and clean($tag) is not called.
#
# Dies on an unknown tag, or when prepare/execute fails.
sub runSql {
    my ($sql, $msg, $tag, $sampleTime, $tableToAnalyze) = @_;

    print LOGFILE "$sql\n\n" if $debug;

    my $stepNumber = $stepsHash->{$tag};
    die "invalid tag '$tag'" unless $stepNumber;

    if ($skipPast >= $stepNumber) {
        print LOGFILE "... skipping '$tag'...\n\n";
        return;
    }

    if ($clean ne 'only' && $clean ne 'all') {
        my $t = time();

        print LOGFILE localtime() . " $msg (Benchmark dataset took $sampleTime for this step)\n";

        # Report the error recorded on the handle that failed.  The previous
        # bareword "DBI::errstr" is not the documented $DBI::errstr variable
        # nor a handle method, so the real database message was not reliably
        # shown.
        my $stmt = $dbh->prepare($sql)
            or die "step '$tag': prepare failed: " . $dbh->errstr;
        $stmt->execute()
            or die "step '$tag': execute failed: " . $stmt->errstr;

        analyzeStats($tableToAnalyze) if ($tableToAnalyze);

        # Elapsed time is reported in whole hours/minutes, never less than
        # "1 mins".
        my $tt = time() - $t;
        my $hours = int($tt / 3600);
        my $mins = int($tt / 60) % 60;
        $mins = 1 if ($hours == 0 && $mins == 0);
        my $hoursStr = $hours ? "$hours hours and " : "";
        print LOGFILE localtime() . " step '$tag' done ($hoursStr$mins mins)\n\n";
    }

    clean($tag) unless ($clean eq 'no');
}
707 |
# Refresh the database's statistics for one table.
#
# When the configured dbVendor is 'oracle' two statements are run:
#   analyze table T compute statistics
#   analyze table T compute statistics for all indexes
# For any other vendor a single "analyze table T" is run.
#
#   $tableToAnalyze - name of the table to analyze
#
# Dies if a statement cannot be prepared or executed.
sub analyzeStats {
    my ($tableToAnalyze) = @_;

    my @statements;
    if ($base->getConfig("dbVendor") eq 'oracle') {
        my $sql = "analyze table $tableToAnalyze compute statistics";
        @statements = ($sql, "$sql for all indexes");
    } else {
        @statements = ("analyze table $tableToAnalyze");
    }

    foreach my $sql (@statements) {
        # Use the failing handle's errstr; the bareword "DBI::errstr" used
        # before is not the documented DBI error accessor.
        my $stmt = $dbh->prepare($sql)
            or die "prepare failed for '$sql': " . $dbh->errstr;
        $stmt->execute()
            or die "execute failed for '$sql': " . $stmt->errstr;
    }
}
723 |
# Run the cleanup statements registered for one step.
#
# $cleanHash->{$tag} is read as a reference to a list of SQL strings; empty
# entries are ignored.  Each statement must contain "<action> table <name>"
# (matched case-insensitively).  A drop is skipped when
# tableAlreadyDropped() reports that the table no longer exists; every other
# statement is executed and logged.
#
#   $tag - step name used as the key into $cleanHash
#
# Dies on a statement that does not match the expected shape, or when
# prepare/execute fails.
sub clean {
    my ($tag) = @_;

    my $cleanSqls = $cleanHash->{$tag};
    foreach my $cleanSql (@$cleanSqls) {
        next unless $cleanSql;

        $cleanSql =~ /(\w+) table (\w+)/i
            or die "invalid clean sql '$cleanSql'";
        my ($action, $table) = ($1, $2);

        # The pattern above ignores case, so the action must be compared the
        # same way; otherwise "DROP TABLE x" would bypass the guard and fail
        # on a table that is already gone.
        next if (lc($action) eq 'drop' && tableAlreadyDropped($table));

        my $stmt = $dbh->prepare($cleanSql)
            or die "prepare failed for '$cleanSql': " . $dbh->errstr;
        print LOGFILE localtime() . " cleaning: $cleanSql\n";
        $stmt->execute()
            or die "execute failed for '$cleanSql': " . $stmt->errstr;
        print LOGFILE localtime() . " done\n";
    }
}
741 |
# Tell whether a table is absent from the database.
#
# The generic names 'Ortholog', 'CoOrtholog' and 'InParalog' are first
# replaced by the table names configured as orthologTable, coOrthologTable
# and inParalogTable.  For dbVendor 'oracle' the upper-cased name is looked
# up in all_tables; otherwise "show tables like" is used.
#
#   $table - table name (or one of the three generic names above)
#
# Returns 1 when the lookup yields no row (table is gone), 0 when it yields
# at least one row.  Dies if the lookup cannot be prepared or executed.
#
# NOTE(review): all_tables is not restricted to one schema, and "like"
# treats '_' in the name as a wildcard - both could report a table as
# present when it is not; confirm whether that matters for this schema.
sub tableAlreadyDropped {
    my ($table) = @_;

    my $orthologTable = $base->getConfig("orthologTable");
    my $coOrthologTable = $base->getConfig("coOrthologTable");
    my $inParalogTable = $base->getConfig("inParalogTable");

    $table = $orthologTable if $table eq 'Ortholog';
    $table = $coOrthologTable if $table eq 'CoOrtholog';
    $table = $inParalogTable if $table eq 'InParalog';

    my $sql;
    if ($base->getConfig("dbVendor") eq 'oracle') {
        $table = uc($table);
        $sql = "select table_name from all_tables where table_name = '$table'";
    } else {
        $sql = "show tables like '$table'";
    }

    # prepare() was previously unchecked, unlike every other statement in
    # this file; a failure surfaced only as a method call on undef.
    my $stmt = $dbh->prepare($sql)
        or die "prepare failed for '$sql': " . $dbh->errstr;
    $stmt->execute()
        or die "execute failed for '$sql': " . $stmt->errstr;

    # One row is enough to know the table exists.  Fetch in list context and
    # finish the handle so it is not left active after an early answer.
    my @row = $stmt->fetchrow_array();
    $stmt->finish();

    return @row ? 0 : 1;
}
764 |
765 |
# Run the cleanup statements of every step that has an entry in $cleanHash.
# Steps are visited in hash-key order, i.e. in no particular order.
sub cleanall {
    my @allTags = keys %$cleanHash;
    for my $stepTag (@allTags) {
        clean($stepTag);
    }
}
771 |
# Print the command-line help, followed by the list of step tags taken from
# the first element of each entry in @steps, then exit with status 1.
sub usage {
    # One " <tag>\n" line per step.
    my $stepsString;
    for my $step (@steps) {
        $stepsString .= " $step->[0]\n";
    }

    # The help text starts with an empty line and ends right after the tag
    # list, exactly as before.
    print <<"END_USAGE";

Find pairs for OrthoMCL.

usage: orthomclPairs config_file log_file cleanup=[yes|no|only|all]

where:
config_file : see below
cleanup : clean up temp tables?
yes=clean as we go;
no=don't clean as we go;
only=just clean, do nothing else;
all=just clean, plus clean InParalog, Ortholog and CoOrtholog tables.
startAfter : optionally start after a previously completed step. see below for TAGs

Database Input:
- SimilarSequences table containing all-v-all BLAST hits
- InParalog, Ortholog, CoOrtholog tables - created but empty

Database Output:
- Populated InParalog, Ortholog and CoOrtholog tables

Standard Error:
- logging info

NOTE: the database login in the config file must have update/insert/truncate privileges on the tables specified in the config file.

EXAMPLE: orthomclSoftware/bin/orthomclPairs my_orthomcl_dir/orthomcl.config my_orthomcl_dir/orthomcl_pairs.log cleanup=no


Sample Config File:

dbVendor=oracle (or mysql)
dbConnectString=dbi:Oracle:orthomcl
dbLogin=my_db_login
dbPassword=my_db_password
similarSequencesTable=SimilarSequences
orthologTable=Ortholog
inParalogTable=InParalog
coOrthologTable=CoOrtholog
interTaxonMatchView=InterTaxonMatch
percentMatchCutoff=50
evalueExponentCutoff=-5

Names of TAGs to use in startAfter (look in log file to see last one run)
$stepsString
END_USAGE

    exit(1);
}
824 |
825 |
--------------------------------------------------------------------------------