MAXLOOP) break;
82 | qmin = u1+i-1+length-MAXLOOP;
83 | if (qminMAXLOOP) continue;
91 | for (energy = s=0; sMLclosing;
123 | if (fm
41 | #include
42 | #include
43 | #include
44 |
45 | #include "global.h"
46 |
47 | int verbose=0, verboseMemory=0, quiet=0, userSeed;
48 | long totalMem=0;
49 | unsigned long randomSeed;
50 |
51 | /* functions */
52 |
53 | /*************************************/
54 | void *AllocMem(long n, char *name, char *func, int showInfo)
55 | {
56 | void *P;
57 |
58 | if ( (P=malloc(n))==NULL ) {
59 | fprintf(stderr, "Out of memory allocating '%s': %s()\n", name, func);
60 | exit(0);
61 | }
62 |
63 | totalMem+=n;
64 |
65 | if (showInfo && verboseMemory)
66 | fprintf(stderr, "%s in %s() - %ld bytes\n", name, func, n);
67 |
68 | return P;
69 | }
70 |
71 |
72 | /*************************************/
73 | void *CAllocMem(long n, char *name, char *func, int showInfo)
74 | {
75 | void *P;
76 |
77 | if ( (P=calloc(n, 1))==NULL ) {
78 | fprintf(stderr, "Out of memory allocating '%s': %s()\n", name, func);
79 | exit(0);
80 | }
81 |
82 | totalMem+=n;
83 | if (showInfo && verboseMemory)
84 | fprintf(stderr, "%s in %s() - %ld bytes\n", name, func, n);
85 |
86 | return P;
87 | }
88 |
89 |
90 | /*************************************/
91 | int GetDoubleParams(int argc, char **argv, int *argn, char *pos, int numParams, double *params)
92 | {
93 | int i;
94 | char *st, buf[256];
95 |
96 | i=0;
97 | strcpy(buf, pos);
98 | st=strtok(buf, "\t,/");
99 | do {
100 | if (st==NULL) {
101 | if ((*argn)+190$\%) identity
53 | they only contribute little new information. Our benchmarks showed
54 | that alignments with 5--10 sequences and a mean pairwise identity
55 | $<90$\% give good results.
56 |
57 | There are different possible scenarios how to obtain alignments for
58 | your sequences.
59 |
60 | \subsection{Download pre-made alignments}
61 |
62 | These days, for many organisms the complete genomic sequence is
63 | known. Moreover, for many organisms also related species have been
64 | sequenced and pre-calculated multiple alignments are available for
65 | download. Well-known resources for major model organisms are, for
66 | example, \url{genome.ucsc.edu} or \url{ensembl.org}, but also many
67 | independent smaller genome projects provide multiple alignments.
68 |
69 | \subsection{Create alignments of long genomic regions}
70 |
71 | If you want to analyze longer genomic regions ($>1kb$) but cannot find
72 | pre-made alignments, we recommend using the \texttt{MultiZ} program
73 | suite to create alignments. It can be downloaded here
74 | \url{http://www.bx.psu.edu/miller_lab/dist} and comes with excellent
75 | documentation.
76 |
77 | You will need homologous genomic sequences for your region in other
78 | species. For example, you can align the complete genomes of bacteria
79 | or align homologous loci in the megabase range of higher organisms
80 | with \texttt{MultiZ}.
81 |
82 | \subsection{Create alignments of individual short regions}
83 |
84 | If your sequence of interest is relatively short (a few hundred
85 | nucleotides) we recommend using a simple global alignment program like
86 | for example \texttt{ClustalW}. Use \texttt{Blast} to find homologous
87 | sequences in a sequence database (e.g. GenBank,
88 | \url{www.ncbi.nlm.nih.gov/genbank/}). Collect the significant hits
89 | that match to your region of interest and align the sequences
90 | afterwards with an alignment program.
91 |
92 | \section{Formatting the alignments}
93 |
94 | \texttt{RNAcode} can process alignments in two different formats: MAF
95 | and CLUSTAL W. You have to make sure that your alignment is in one of
96 | these formats before you can use \texttt{RNAcode}.
97 |
98 | The MAF format was popularized by the UCSC genome browser and is very
99 | useful to represent genome-wide alignments. The detailed specification
100 | can be found here: \url{http://genome.ucsc.edu/FAQ/FAQformat.html}. If
101 | you download alignments from a UCSC resource, they are usually formatted
102 | as MAF. Also if you align your sequences using \texttt{MultiZ} the
103 | default output format is MAF.
104 |
105 | For shorter alignments of individual regions, the CLUSTAL W format is
106 | useful. Apart from CLUSTAL W, many other alignment programs output
107 | their alignments in this format.
108 |
109 | \section{Pre-processing alignments}
110 |
111 | If your alignments contain blocks of long genomic regions it is
112 | usually not reasonable to score these long regions as a whole. The
113 | \texttt{tar.gz} package contains a script \texttt{breakMAF.pl} that
114 | allows you to easily pre-process your MAF files:
115 |
116 | \begin{verbatim}
117 | # scripts/breakMAF.pl examples/genomic.maf > genomic-preprocessed.maf
118 | \end{verbatim}
119 |
120 | This command breaks blocks longer than 400 into shorter blocks of an
121 | average size of 200.
122 |
123 | \section{Running RNAcode}
124 |
125 | Analyze alignment with standard options and print detailed results
126 | page:
127 |
128 | \begin{verbatim}
129 |
130 | # RNAcode examples/coding.aln
131 |
132 | \end{verbatim}
133 |
134 | Analyze alignment and show best non-overlapping hits below a $p$-value
135 | cutoff of 0.01 in gtf format:
136 |
137 | \begin{verbatim}
138 | # RNAcode --outfile out.gtf --gtf --best-only --cutoff 0.01 genomic-preprocessed.maf
139 | \end{verbatim}
140 |
141 | Create color annotations for high scoring coding segments:
142 |
143 | \begin{verbatim}
144 |
145 | # RNAcode --eps coding.aln
146 |
147 | \end{verbatim}
148 |
149 | Please refer to the more detailed README file that explains all
150 | options of \texttt{RNAcode} and how to interpret the results.
151 |
152 | For details on the methodology refer to the paper:
153 |
154 | \textbf{RNAcode: robust discrimination of coding and noncoding regions in
155 | comparative sequence data}\\
156 | Washietl S, Findei\ss\ S, M{\"u}ller S, Kalkhof S, von~Bergen M, Hofacker IL, Stadler PF, Goldman N\\
157 | \emph{RNA} (2011), in revision
158 |
159 | \end{document}
--------------------------------------------------------------------------------
/phyml/optimiz.h:
--------------------------------------------------------------------------------
1 | /*
2 |
3 | PHYML : a program that computes maximum likelihood phylogenies from
4 | DNA or AA homologous sequences
5 |
6 | Copyright (C) Stephane Guindon. Oct 2003 onward
7 |
8 | All parts of the source except where indicated are distributed under
9 | the GNU public licence. See http://www.opensource.org for details.
10 |
11 | */
12 |
13 | #ifndef OPTIMIZ_H
14 | #define OPTIMIZ_H
15 |
16 | void Optimiz_Ext_Br(arbre *tree);
17 | void Optimize_Alpha(arbre *tree);
18 | void Optimize_Kappa(arbre *tree);
19 | void Optimize_Lambda(arbre *tree);
20 | void Optimize_Param_Parall(arbre *tree);
21 | phydbl Optimize_Branch_Quad(arbre *tree, allseq *alldata, edge *b_fcus);
22 | void Optimize_After_Hide(arbre *tree, allseq *alldata, node *h);
23 | void Round_Optimize(arbre *tree, allseq *data);
24 | int Dist_Seq_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
25 | phydbl *fa, phydbl *fb, phydbl *fc,
26 | allseq *data, int num1, int num2, model *mod);
27 | phydbl Dist_Seq_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
28 | phydbl *xmin, allseq *data,
29 | int num1, int num2, model *mod);
30 | phydbl Kappa_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
31 | phydbl *xmin, arbre *tree, allseq *alldata);
32 | phydbl Lambda_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
33 | phydbl *xmin, arbre *tree, allseq *alldata);
34 | phydbl Alpha_Golden_Br_Opt(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
35 | phydbl *xmin, arbre *tree, allseq *alldata,
36 | int n_opt, phydbl *init_l);
37 | phydbl Alpha_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,phydbl *xmin,
38 | arbre *tree, allseq *alldata);
39 | phydbl Br_Len_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
40 | phydbl *xmin, edge *b_fcus, arbre *tree);
41 | phydbl Br_Len_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
42 | edge *b_fcus, arbre *tree, int n_iter_max);
43 | int Br_Len_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
44 | phydbl *fa, phydbl *fb, phydbl *fc,
45 | edge *b_fcus, arbre *tree);
46 | phydbl Optimize_Path_Length(model *mod, allseq *alldata, edge *a,
47 | int lra, edge *b, int lrb, phydbl i_len);
48 | void Optimize_Param_Serie(node *a, node *d, edge *b_fcus, arbre *tree,
49 | allseq *alldata, int n_passes);
50 | phydbl Optimize_Dist(model *mod, phydbl init, allseq *twoseqs);
51 | phydbl Pinvar_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
52 | phydbl *xmin, arbre *tree, allseq *alldata, int n_iter_max);
53 | void Optimize_Pinvar(arbre *tree);
54 | int Lambda_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
55 | phydbl *fa, phydbl *fb, phydbl *fc,
56 | arbre *tree);
57 | int Kappa_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
58 | phydbl *fa, phydbl *fb, phydbl *fc,
59 | arbre *tree);
60 | int Alpha_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
61 | phydbl *fa, phydbl *fb, phydbl *fc,
62 | arbre *tree);
63 | int Pinvar_Brak(phydbl *ax, phydbl *bx, phydbl *cx,
64 | phydbl *fa, phydbl *fb, phydbl *fc,
65 | arbre *tree);
66 | void Optimiz_All_Free_Param(arbre *tree, int verbose);
67 | void Optimiz_RRparam_GTR(arbre *tree, int num_param);
68 | phydbl RRparam_GTR_Golden(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
69 | phydbl *xmin, arbre *tree, allseq *alldata, phydbl *param, int n_iter_max);
70 |
/* Multi-dimensional minimisation helpers operating on a parameter vector p
   of dimension n, with xi as a second vector of the same role in every
   prototype below (presumably a search direction -- confirm in optimiz.c). */
71 | int Powell_GTR_Param(arbre *tree, phydbl *p, int n, phydbl ftol);
72 | phydbl Linmin_GTR_Param(arbre *tree,phydbl *p, phydbl *xi, int n);
/* NOTE(review): the three *_1dim/F1dim prototypes declare the trailing n as
   phydbl, whereas Powell_GTR_Param and Linmin_GTR_Param above declare it as
   int.  Callers passing an int get an implicit conversion; confirm whether
   the floating-point type is intentional. */
73 | phydbl F1dim(arbre *tree, phydbl x, phydbl *p, phydbl *xi, phydbl n);
74 | int Mnbrak_1dim(phydbl *ax, phydbl *bx, phydbl *cx,
75 | phydbl *fa, phydbl *fb, phydbl *fc,
76 | arbre *tree,
77 | phydbl *p, phydbl *xi, phydbl n);
78 | phydbl Brent_1dim(phydbl ax, phydbl bx, phydbl cx,
79 | phydbl tol, phydbl *xmin,
80 | arbre *tree,
81 | phydbl *p, phydbl *xi, phydbl n);
82 |
/* NOTE(review): func, dfunc, linmin and lnsrch are declared with empty
   parameter lists "()", i.e. without a prototype, so the compiler cannot
   check the arguments passed through these pointers. */
83 | int Min_With_Derivatives(arbre *tree, phydbl *p, int n, phydbl ftol, phydbl step_size,
84 | phydbl (*func) (), void (*dfunc)(), phydbl (*linmin)());
/* *failed is an output flag written by BFGS (exact meaning: see optimiz.c). */
85 | void BFGS(arbre *tree, phydbl *p, int n, phydbl gtol, phydbl step_size,
86 | phydbl(*func)(), void (*dfunc)(), void (*lnsrch)(),int *failed);
/* Same parameter list as Lnsrch_Nucleotide_Frequencies and
   Lnsrch_RR_Cov_Param declared in this header. */
87 | void Lnsrch_RR_Param(arbre *tree, int n, phydbl *xold, phydbl fold, phydbl *g, phydbl *p, phydbl *x,
88 | phydbl *f, phydbl stpmax, int *check);
89 | void Optimize_Single_Param_Generic(arbre *tree, phydbl *param, phydbl lim_inf, phydbl lim_sup, phydbl tol, int n_max_iter);
90 | int Generic_Brak(phydbl *param,
91 | phydbl *ax, phydbl *bx, phydbl *cx,
92 | phydbl *fa, phydbl *fb, phydbl *fc,
93 | phydbl lim_inf, phydbl lim_sup,
94 | arbre *tree);
95 | phydbl Generic_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
96 | phydbl *xmin, arbre *tree, int n_iter_max);
97 | void Optimize_Br_Len_Serie(node *a, node *d, edge *b_fcus, arbre *tree,allseq *alldata);
98 | void Lnsrch_Nucleotide_Frequencies(arbre *tree, int n, phydbl *xold,
99 | phydbl fold, phydbl *g, phydbl *p, phydbl *x,
100 | phydbl *f, phydbl stpmax, int *check);
101 |
102 | void Optimize_Global_Rate(arbre *tree);
103 | phydbl Br_Len_Brent_Default(edge *b_fcus, arbre *tree);
104 |
105 | void EM_Dist(model *mod, allseq *data);
106 | phydbl Dist_F_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol, int n_iter_max,
107 | phydbl *param, phydbl *F, model *mod);
108 | int Dist_F_Brak(phydbl *ax, phydbl *bx, phydbl *cx, phydbl *F, phydbl *param, model *mod);
109 | void Opt_Dist_F(phydbl *dist, phydbl *F, model *mod);
110 | phydbl Missing_Dist_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol, int n_iter_max,
111 | int x, int y, matrix *mat);
112 | int Missing_Dist_Brak(phydbl *ax, phydbl *bx, phydbl *cx, int x, int y, matrix *mat);
113 | void Opt_Missing_Dist(int x, int y, matrix *mat);
114 | int Optimiz_Alpha_And_Pinv(arbre *tree);
115 | void Lnsrch_RR_Cov_Param(arbre *tree, int n, phydbl *xold, phydbl fold,
116 | phydbl *g, phydbl *p, phydbl *x,
117 | phydbl *f, phydbl stpmax, int *check);
118 | phydbl Node_Time_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
119 | node *anc, node *des, arbre *tree, int n_iter_max);
120 | phydbl Time_Stamps_Mult_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
121 | arbre *tree, int n_iter_max);
122 | phydbl Branch_Rate_Shape_Brent(phydbl ax, phydbl bx, phydbl cx, phydbl tol,
123 | phydbl *xmin, arbre *tree, int n_iter_max);
124 |
125 |
126 | #endif
127 |
128 |
--------------------------------------------------------------------------------
/src/getopt.h:
--------------------------------------------------------------------------------
1 | /* Declarations for getopt.
2 | Copyright (C) 1989,90,91,92,93,94,96,97,98 Free Software Foundation, Inc.
3 | This file is part of the GNU C Library.
4 |
5 | The GNU C Library is free software; you can redistribute it and/or
6 | modify it under the terms of the GNU Library General Public License as
7 | published by the Free Software Foundation; either version 2 of the
8 | License, or (at your option) any later version.
9 |
10 | The GNU C Library is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 | Library General Public License for more details.
14 |
15 | You should have received a copy of the GNU Library General Public
16 | License along with the GNU C Library; see the file COPYING.LIB. If not,
17 | write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
18 | Boston, MA 02111-1307, USA. */
19 |
20 | #ifndef _GETOPT_H
21 |
22 | #ifndef __need_getopt
23 | # define _GETOPT_H 1
24 | #endif
25 |
26 | #ifdef __cplusplus
27 | extern "C" {
28 | #endif
29 |
30 | /* For communication from `getopt' to the caller.
31 | When `getopt' finds an option that takes an argument,
32 | the argument value is returned here.
33 | Also, when `ordering' is RETURN_IN_ORDER,
34 | each non-option ARGV-element is returned here. */
35 |
36 | extern char *optarg;
37 |
38 | /* Index in ARGV of the next element to be scanned.
39 | This is used for communication to and from the caller
40 | and for communication between successive calls to `getopt'.
41 |
42 | On entry to `getopt', zero means this is the first call; initialize.
43 |
44 | When `getopt' returns -1, this is the index of the first of the
45 | non-option elements that the caller should itself scan.
46 |
47 | Otherwise, `optind' communicates from one call to the next
48 | how much of ARGV has been scanned so far. */
49 |
50 | extern int optind;
51 |
52 | /* Callers store zero here to inhibit the error message `getopt' prints
53 | for unrecognized options. */
54 |
55 | extern int opterr;
56 |
57 | /* Set to an option character which was unrecognized. */
58 |
59 | extern int optopt;
60 |
61 | #ifndef __need_getopt
62 | /* Describe the long-named options requested by the application.
63 | The LONG_OPTIONS argument to getopt_long or getopt_long_only is a vector
64 | of `struct option' terminated by an element containing a name which is
65 | zero.
66 |
67 | The field `has_arg' is:
68 | no_argument (or 0) if the option does not take an argument,
69 | required_argument (or 1) if the option requires an argument,
70 | optional_argument (or 2) if the option takes an optional argument.
71 |
72 | If the field `flag' is not NULL, it points to a variable that is set
73 | to the value given in the field `val' when the option is found, but
74 | left unchanged if the option is not found.
75 |
76 | To have a long-named option do something other than set an `int' to
77 | a compiled-in constant, such as set a value from `optarg', set the
78 | option's `flag' field to zero and its `val' field to a nonzero
79 | value (the equivalent single-letter option character, if there is
80 | one). For long options that have a zero `flag' field, `getopt'
81 | returns the contents of the `val' field. */
82 |
/* One entry of the long-option table passed to getopt_long() and
   getopt_long_only(); the table ends with an entry whose name is zero. */
83 | struct option
84 | {
/* Long option name; const-qualified only when compiling as standard C. */
85 | # if defined __STDC__ && __STDC__
86 | const char *name;
87 | # else
88 | char *name;
89 | # endif
90 | /* has_arg can't be an enum because some compilers complain about
91 | type mismatches in all the code that assumes it is an int. */
/* One of no_argument (0), required_argument (1) or optional_argument (2). */
92 | int has_arg;
/* If not NULL: variable that is set to `val' when the option is found and
   left unchanged otherwise. */
93 | int *flag;
/* Value stored through `flag', or returned by getopt when `flag' is zero. */
94 | int val;
95 | };
96 |
97 | /* Names for the values of the `has_arg' field of `struct option'. */
98 |
99 | # define no_argument 0
100 | # define required_argument 1
101 | # define optional_argument 2
102 | #endif /* need getopt */
103 |
104 |
105 | /* Get definitions and prototypes for functions to process the
106 | arguments in ARGV (ARGC of them, minus the program name) for
107 | options given in OPTS.
108 |
109 | Return the option character from OPTS just read. Return -1 when
110 | there are no more options. For unrecognized options, or options
111 | missing arguments, `optopt' is set to the option letter, and '?' is
112 | returned.
113 |
114 | The OPTS string is a list of characters which are recognized option
115 | letters, optionally followed by colons, specifying that that letter
116 | takes an argument, to be placed in `optarg'.
117 |
118 | If a letter in OPTS is followed by two colons, its argument is
119 | optional. This behavior is specific to the GNU `getopt'.
120 |
121 | The argument `--' causes premature termination of argument
122 | scanning, explicitly telling `getopt' that there are no more
123 | options.
124 |
125 | If OPTS begins with `--', then non-option arguments are treated as
126 | arguments to the option '\0'. This behavior is specific to the GNU
127 | `getopt'. */
128 |
129 | #if defined __STDC__ && __STDC__
130 | # ifdef __GNU_LIBRARY__
131 | /* Many other libraries have conflicting prototypes for getopt, with
132 | differences in the consts, in stdlib.h. To avoid compilation
133 | errors, only prototype getopt for the GNU C library. */
134 | extern int getopt (int __argc, char *const *__argv, const char *__shortopts);
135 | # else /* not __GNU_LIBRARY__ */
136 | extern int getopt ();
137 | # endif /* __GNU_LIBRARY__ */
138 |
139 | # ifndef __need_getopt
140 | extern int getopt_long (int __argc, char *const *__argv, const char *__shortopts,
141 | const struct option *__longopts, int *__longind);
142 | extern int getopt_long_only (int __argc, char *const *__argv,
143 | const char *__shortopts,
144 | const struct option *__longopts, int *__longind);
145 |
146 | /* Internal only. Users should not call this directly. */
147 | extern int _getopt_internal (int __argc, char *const *__argv,
148 | const char *__shortopts,
149 | const struct option *__longopts, int *__longind,
150 | int __long_only);
151 | # endif
152 | #else /* not __STDC__ */
153 | extern int getopt ();
154 | # ifndef __need_getopt
155 | extern int getopt_long ();
156 | extern int getopt_long_only ();
157 |
158 | extern int _getopt_internal ();
159 | # endif
160 | #endif /* __STDC__ */
161 |
162 | #ifdef __cplusplus
163 | }
164 | #endif
165 |
166 | /* Make sure we later can get all the definitions and declarations. */
167 | #undef __need_getopt
168 |
169 | #endif /* getopt.h */
170 |
--------------------------------------------------------------------------------
/src/treeML.c:
--------------------------------------------------------------------------------
1 | /* Copyright 2009, Stefan Washietl
2 |
3 | This file is part of RNAcode.
4 |
5 | RNAcode is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | RNAcode is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with RNAcode. If not, see <http://www.gnu.org/licenses/>. */
17 |
18 |
19 | #include "spr.h"
20 | #include "utilities.h"
21 | #include "lk.h"
22 | #include "optimiz.h"
23 | #include "bionj.h"
24 | #include "models.h"
25 | #include "free.h"
26 | #include "options.h"
27 | #include "simu.h"
28 | #include "eigen.h"
29 | #include "pars.h"
30 | #include "alrt.h"
31 | #include "rnaz_utils.h"
32 |
33 | seq **Get_Seq_local(seq**, option *io, int rw);
34 |
/* treeML: build a tree for an alignment with the bundled PhyML routines.
 *
 * alignment   NULL-terminated array of alignment entries; entry 0 is read
 *             unconditionally (strlen of its seq), so the array must hold at
 *             least one entry.
 * treeString  out: receives the pointer returned by Write_Tree().
 * kappa       out: receives io->mod->kappa after optimisation.
 *
 * Steps visible here: set up PhyML option/model structures, copy the
 * alignment into PhyML seq records, compute ML distances, build a BIONJ
 * tree, run Round_Optimize() with topology optimisation switched off, then
 * free the PhyML structures.
 *
 * Returns 1 on success and 0 when Init_Model() does not return 1.
 *
 * NOTE(review): the 0-return path leaves without calling any of the Free_*
 * routines used at the end of the function.
 * NOTE(review): t_end, hour and min are declared but never used below;
 * best_lnL and t_beg are written but never read.
 */
35 | int treeML(const struct aln *alignment[], char** treeString, float* kappa){
36 |
37 | int i;
38 | seq **data;
39 | allseq *alldata;
40 | option *io;
41 | arbre *tree;
42 | int n_otu;
43 | matrix *mat;
44 | model *mod;
45 | time_t t_beg,t_end;
46 | div_t hour,min;
47 | phydbl best_lnL;
48 | int L,r_seed;
49 |
50 | /* Initialize data structures */
51 |
52 | tree = NULL;
53 | mod = NULL;
54 | data = NULL;
55 | best_lnL = UNLIKELY;
56 |
57 | io = (option *)Make_Input();
58 | Set_Defaults_Input(io);
59 | Set_Defaults_Model(io->mod);
60 | Set_Defaults_Optimiz(io->mod->s_opt);
/* The C library RNG is re-seeded from the wall clock on every call. */
61 | r_seed = time(NULL);
62 | srand(r_seed);
63 | Make_Model_Complete(io->mod);
64 | mod = io->mod;
65 |
66 |
67 | /* Set options */
68 |
69 | io->mod->datatype=NT; /* Nucleotides not amino acids */
70 | io->mod->s_opt->print=0; /* Shut off verbose output */
71 | io->mod->s_opt->opt_topo = 0; /* Do not optimize topology */
72 | io->mod->s_opt->opt_bl = 1; /* Optimize branch lengths*/
73 | io->mod->s_opt->opt_num_param = 1; /* Optimize kappa, initialize with 4.0*/
74 | io->mod->s_opt->opt_kappa = 1;
75 | io->mod->kappa = 4.0;
76 |
77 | /* Manually fill data structure with the alignment */
78 |
79 | n_otu = 0;
80 |
/* L is the length of the first sequence and is used for every record. */
81 | L=strlen(alignment[0]->seq);
/* Count entries up to the terminating NULL. */
82 | for (n_otu=0; alignment[n_otu]!=NULL; n_otu++);
83 |
84 | io->mod->n_otu=n_otu;
85 |
86 | data = (seq **)mCalloc(n_otu,sizeof(seq *));
87 |
/* NOTE(review): the next line is damaged in this copy of the file (original
   lines 88-90 are fused); presumably it opened a loop over the n_otu entries
   and allocated data[i] -- restore from the upstream source. */
88 | for (i=0;ilen = L;
91 | data[i]->name = (char *)mCalloc(T_MAX_NAME,sizeof(char));
/* NOTE(review): strcpy without a length check against T_MAX_NAME. */
92 | strcpy(data[i]->name,alignment[i]->name);
93 | data[i]->state = (char *)mCalloc(T_MAX_SEQ,sizeof(char));
/* NOTE(review): strcpy without a length check against T_MAX_SEQ. */
94 | strcpy(data[i]->state,alignment[i]->seq);
95 | data[i]->is_ambigu = NULL;
96 | }
97 |
98 | /* Call modified version of Get_Seq, that does some processing */
99 | data=Get_Seq_local(data, io, 0);
100 | alldata = Compact_Seq(data,io);
101 | Free_Seq(data,alldata->n_otu);
102 | Check_Ambiguities(alldata,io->mod->datatype,io->mod->stepsize);
103 | if (Init_Model(alldata,mod)!=1){
104 | return(0);
105 | }
106 |
107 | /* Calculate pairwise distances and make BIONJ tree*/
108 | mat = ML_Dist(alldata,mod);
109 | Fill_Missing_Dist(mat);
110 | mat->tree = Make_Tree_From_Scratch(alldata->n_otu,alldata);
111 | Bionj(mat);
/* Cross-link the tree with the matrix, model, options and data. */
112 | tree = mat->tree;
113 | tree->mat = mat;
114 | tree->mod = mod;
115 | tree->io = io;
116 | tree->data = alldata;
117 | tree->both_sides = 1;
118 | tree->n_pattern = tree->data->crunch_len/tree->mod->stepsize;
119 | time(&t_beg);
120 | time(&(tree->t_beg));
121 |
122 | /* Prepare for optimization and optimize */
123 |
124 | Fill_Dir_Table(tree);
125 | Update_Dirs(tree);
126 | Make_Tree_4_Pars(tree,alldata,alldata->init_len);
127 | Make_Tree_4_Lk(tree,alldata,alldata->init_len);
128 | tree->triplet_struct = Make_Triplet_Struct(mod);
129 | Br_Len_Not_Involving_Invar(tree);
130 | Order_Tree_CSeq(tree,alldata);
131 | Round_Optimize(tree,tree->data);
132 |
133 | //treeString=(char *)mCalloc(T_MAX_LINE,sizeof(char));
134 |
/* Hand the results back through the output parameters. The string comes
   from Write_Tree(); presumably the caller must release it -- confirm. */
135 | *treeString=Write_Tree(tree);
136 | *kappa=io->mod->kappa;
137 |
/* Release the PhyML structures built above. */
138 | Free_Mat(tree->mat);
139 | Free_Triplet(tree->triplet_struct);
140 | Free_Tree_Pars(tree);
141 | Free_Tree_Lk(tree);
142 | Free_Tree(tree);
143 |
144 | Free_Cseq(alldata);
145 |
146 | Free_Model(mod);
147 |
148 | Free_Input(io);
149 |
150 | return(1);
151 |
152 | }
153 |
154 | /* Function from Seqgen (c) Andrew Rambaut & Nick Grassly */
155 |
156 | seq **Get_Seq_local(seq** data, option *io, int rw)
157 | {
158 | int i,j;
159 | char **buff;
160 | int n_unkn,n_removed,pos;
161 | int *remove;
162 |
163 |
164 | /* rewind(fp_seq); */
165 |
166 |
167 | if(data)
168 | {
169 | buff = (char **)mCalloc(io->mod->n_otu,sizeof(char *));
170 | For(i,io->mod->n_otu) buff[i] = (char *)mCalloc(data[0]->len,sizeof(char));
171 | remove = (int *)mCalloc(data[0]->len,sizeof(int));
172 |
173 | n_removed = 0;
174 |
175 | For(i,data[0]->len)
176 | {
177 | For(j,io->mod->n_otu)
178 | {
179 | if((data[j]->state[i] == '?') || (data[j]->state[i] == '-')) data[j]->state[i] = 'X';
180 | if((io->mod->datatype == NT) && (data[j]->state[i] == 'N')) data[j]->state[i] = 'X';
181 | if(data[j]->state[i] == 'U') data[j]->state[i] = 'T';
182 | }
183 |
184 | n_unkn = 0;
185 | For(j,io->mod->n_otu) if(data[j]->state[i] == 'X') n_unkn++;
186 |
187 | if(n_unkn == io->mod->n_otu)
188 | {
189 | remove[i] = 1;
190 | n_removed++;
191 | }
192 |
193 | For(j,io->mod->n_otu) buff[j][i] = data[j]->state[i];
194 | }
195 |
196 | if(n_removed > 0)
197 | {
198 | if(io->mod->datatype == NT)
199 | printf("\n. %d sites are made from completely undetermined states ('X', '-', '?' or 'N')...\n",n_removed);
200 | else
201 | printf("\n. %d sites are made from completely undetermined states ('X', '-', '?')...\n",n_removed);
202 | }
203 |
204 | pos = 0;
205 | For(i,data[0]->len)
206 | {
207 | /* if(!remove[i]) */
208 | /* { */
209 | For(j,io->mod->n_otu) data[j]->state[pos] = buff[j][i];
210 | pos++;
211 | /* } */
212 | }
213 |
214 | For(i,io->mod->n_otu) data[i]->len = pos;
215 | For(i,io->mod->n_otu) Free(buff[i]);
216 | Free(buff);
217 | Free(remove);
218 | }
219 | return data;
220 | }
221 |
--------------------------------------------------------------------------------
/src/treeSimulate.c:
--------------------------------------------------------------------------------
1 | /* Copyright 2009, Stefan Washietl
2 |
3 | This file is part of RNAcode.
4 |
5 | RNAcode is free software: you can redistribute it and/or modify
6 | it under the terms of the GNU General Public License as published by
7 | the Free Software Foundation, either version 3 of the License, or
8 | (at your option) any later version.
9 |
10 | RNAcode is distributed in the hope that it will be useful,
11 | but WITHOUT ANY WARRANTY; without even the implied warranty of
12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 | GNU General Public License for more details.
14 |
15 | You should have received a copy of the GNU General Public License
16 | along with RNAcode. If not, see <http://www.gnu.org/licenses/>. */
17 |
18 |
19 |
20 | #include
21 | #include
22 | #include
23 | #include
24 | #include
25 | #include
26 |
27 | #include "rnaz_utils.h"
28 |
29 | #include "treeSimulate.h"
30 |
31 | #include "global.h"
32 | #include "treefile.h"
33 | #include "evolve.h"
34 | #include "model.h"
35 | #include "nucmodels.h"
36 | #include "aamodels.h"
37 | #include "progress.h"
38 | #include "twister.h"
39 |
40 | extern int model;
41 | extern int numStates;
42 | extern int isNucModel;
43 | extern int userFreqs;
44 | extern int equalFreqs;
45 | extern int numSites;
46 | extern double tstv;
47 | extern double nucFreq[NUM_NUC];
48 | extern int numTaxa;
49 | char* P;
50 |
51 |
52 | void simulateTree(TTree* tree, float freqs[], float kap, int L){
53 |
54 | int i,j,k;
55 | int* tmp;
56 | double** matrix;
57 | double freqR,freqY,freqAG,freqCT;
58 | double freqA, freqC, freqG, freqT;
59 |
60 | isNucModel = 1;
61 | numStates = 4;
62 | model=0; /*0 means HKY, take care model constants are a tricky in
63 | seqgen and if set to defined constant HKY this does not
64 | work*/
65 |
66 | equalFreqs = 0;
67 | equalTstv = 0;
68 |
69 |
70 | freqA=nucFreq[0]=(double)freqs[0];
71 | freqC=nucFreq[1]=(double)freqs[1];
72 | freqG=nucFreq[2]=(double)freqs[2];
73 | freqT=nucFreq[3]=(double)freqs[3];
74 |
75 | freqR=freqA+freqG;
76 | freqY=freqC+freqT;
77 | freqAG=freqA*freqG;
78 | freqCT=freqC*freqT;
79 |
80 | tstv=(double)kap*(freqAG+freqCT)/(freqR*freqY);
81 |
82 | numSites=L;
83 |
84 | randomSeed = CreateSeed();
85 | SetSeed(randomSeed);
86 | CreateRates();
87 |
88 | numTaxa=tree->numTips;
89 |
90 | CreateSequences(tree, numSites);
91 | SetModel(model);
92 |
93 | EvolveSequences(tree, 0, numSites, 1.0, NULL);
94 |
95 | FreeRates();
96 |
97 | }
98 |
99 |
100 | TTree* string2tree(char* treeString){
101 |
102 | FILE * tmpfh;
103 | TTree* tree;
104 | int dummy1;
105 | double dummy2;
106 |
107 | tmpfh = tmpfile();
108 | fprintf(tmpfh,"%s",treeString);
109 | rewind(tmpfh);
110 | tree=NewTree();
111 | ReadTree(tmpfh, tree, 0, 0, NULL, &dummy1, &dummy2);
112 | fclose(tmpfh);
113 | return tree;
114 |
115 | }
116 |
117 |
/* getDistanceMatrix: pairwise tree distances between the alignment entries.
 *
 * tree       seq-gen tree whose tip names are matched against the entries
 * alignment  NULL-terminated array of alignment entries
 *
 * For each pair (i,j) the tips named alignment[i]->name and
 * alignment[j]->name are looked up, their last common ancestor is obtained
 * from getLCA(), and the length0 values on both paths up to that ancestor
 * are summed.  The result is stored symmetrically in matrix[i][j] and
 * matrix[j][i].
 *
 * Returns the malloc'ed matrix (array of N row pointers).
 *
 * NOTE(review): the malloc result is not checked.
 */
118 | float** getDistanceMatrix(TTree* tree, struct aln *alignment[]){
119 |
120 | TNode* LCA;
121 | TNode* nodeA=NULL;
122 | TNode* nodeB=NULL;
123 | float sum=0.0;
124 | float** matrix;
125 | int N,i,j,k;
126 |
/* Count entries up to the terminating NULL. */
127 | for (N=0; alignment[N]!=NULL; N++);
128 |
129 | matrix=(float**)malloc(sizeof(float*)*N);
130 |
/* NOTE(review): the next line is damaged in this copy of the file (original
   lines 131-145 are fused); the row allocation and the heads of the i/j
   loops are missing -- restore from the upstream source. */
131 | for (i=0;ij) continue;
146 |
/* Find the tip whose name equals entry i.
   NOTE(review): the loop bound below is damaged in this copy (the
   comparison operator and the tree-> prefix are missing). */
147 | for (k=0;knumTips;k++){
148 | if (strcmp(tree->names[k],alignment[i]->name)==0){
149 | nodeA=tree->tips[k];
150 | break;
151 | }
152 | }
153 |
/* Find the tip whose name equals entry j (same damage in the loop bound).
   NOTE(review): if no name matches, nodeA/nodeB keep whatever they held
   before (NULL on the first pair, the previous pair's ancestor later). */
154 | for (k=0;knumTips;k++){
155 | if (strcmp(tree->names[k],alignment[j]->name)==0){
156 | nodeB=tree->tips[k];
157 | break;
158 | }
159 | }
160 |
161 | LCA=NULL;
162 |
163 | LCA=getLCA(tree,nodeA, nodeB);
164 | sum=0.0;
165 |
/* Climb from tip A to the common ancestor, adding each length0. */
166 | while (1){
167 | if (nodeA != LCA){
168 | sum+=nodeA->length0;
169 | nodeA=nodeA->branch0;
170 | } else {
171 | break;
172 | }
173 | }
174 |
/* Same climb from tip B. */
175 | while (1){
176 | if (nodeB != LCA){
177 | sum+=nodeB->length0;
178 | nodeB=nodeB->branch0;
179 | } else {
180 | break;
181 | }
182 | }
183 |
184 | matrix[i][j]=sum;
185 | matrix[j][i]=sum;
186 |
187 | }
188 | }
189 | return matrix;
190 | }
191 |
192 |
193 |
194 |
195 |
/* getLCA: find the last common ancestor of nodeA and nodeB in tree.
 *
 * Both nodes are followed upwards through branch0 until tree->root is
 * reached; the visited nodes (starting node first, root last) are recorded
 * in listA and listB.  The two lists are then ordered so that listA is the
 * one with no more entries than listB.
 *
 * NOTE(review): the comparison of the two lists, the return statement and
 * the release of listA/listB are not visible in this copy: the last line of
 * this block is fused with code from a later function.  Restore from the
 * upstream source.
 * NOTE(review): the malloc results are not checked.
 */
196 | TNode* getLCA(TTree* tree, TNode* nodeA, TNode* nodeB){
197 |
198 | TNode** listA;
199 | TNode** listB;
200 | TNode** tmp;
201 | TNode* lca;
202 | int i,j;
203 | int numAncestorsA,numAncestorsB;
204 | int minNumAncestors,maxNumAncestors;
205 |
/* Each list is sized for tree->numNodes entries. */
206 | listA=(TNode**)malloc(sizeof(TNode*)*tree->numNodes);
207 | listB=(TNode**)malloc(sizeof(TNode*)*tree->numNodes);
208 |
209 | numAncestorsA=0;
210 |
/* Record nodeA and its ancestors; the root is appended after the loop.
   The do/while stores the start node before testing it against the root. */
211 | do {
212 | listA[numAncestorsA++]=nodeA;
213 | nodeA=nodeA->branch0;
214 | } while (nodeA != tree->root);
215 | listA[numAncestorsA++]=tree->root;
216 |
217 |
218 | numAncestorsB=0;
219 |
/* Same for nodeB. */
220 | do {
221 | listB[numAncestorsB++]=nodeB;
222 | nodeB=nodeB->branch0;
223 | } while (nodeB != tree->root);
224 | listB[numAncestorsB++]=tree->root;
225 |
/* Make listA the list with the smaller (or equal) number of entries. */
226 | if (numAncestorsA < numAncestorsB){
227 | minNumAncestors=numAncestorsA;
228 | maxNumAncestors=numAncestorsB;
229 |
230 | } else {
231 | minNumAncestors=numAncestorsB;
232 | maxNumAncestors=numAncestorsA;
233 |
234 | tmp=listA;
235 | listA=listB;
236 | listB=tmp;
237 | }
238 |
/* NOTE(review): damaged line -- original lines 239-261 are fused here. */
239 | for (i=0;inumTips; i++) {
262 |
263 | seq=(char*)malloc((sizeof(char))*(numSites+1));
264 |
265 | encodedSeq=tree->tips[i]->sequence;
266 |
267 | for (j=0; j %s\n",seq);
275 |
276 |
277 | alignment[i]=createAlnEntry(strdup(tree->names[i]),
278 | seq,0,0,0,'+');
279 |
280 | }
281 |
282 | alignment[i]=NULL;
283 | }
284 |
285 | /* tree is free'd manually since the function "FreeTree" in seq-gen
286 | do not seem to free everything */
287 |
288 | void freeSeqgenTree(TTree* tree){
289 |
290 | int i;
291 | TNode *node, *nextNode;
292 |
293 | for (i=0;icapacity;i++){
294 | free(tree->names[i]);
295 | }
296 |
297 | node=tree->nodeList;
298 | while (node != NULL){
299 | nextNode=node->next;
300 | free(node->sequence);
301 | free(node);
302 | node=nextNode;
303 | }
304 |
305 | free(tree->names);
306 | free(tree->tips);
307 | free(tree);
308 |
309 | }
310 |
--------------------------------------------------------------------------------
/levmar/lm.h:
--------------------------------------------------------------------------------
1 | /////////////////////////////////////////////////////////////////////////////////
2 | //
3 | // Prototypes and definitions for the Levenberg - Marquardt minimization algorithm
4 | // Copyright (C) 2004 Manolis Lourakis (lourakis@ics.forth.gr)
5 | // Institute of Computer Science, Foundation for Research & Technology - Hellas
6 | // Heraklion, Crete, Greece.
7 | //
8 | // This program is free software; you can redistribute it and/or modify
9 | // it under the terms of the GNU General Public License as published by
10 | // the Free Software Foundation; either version 2 of the License, or
11 | // (at your option) any later version.
12 | //
13 | // This program is distributed in the hope that it will be useful,
14 | // but WITHOUT ANY WARRANTY; without even the implied warranty of
15 | // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 | // GNU General Public License for more details.
17 | //
18 | /////////////////////////////////////////////////////////////////////////////////
19 |
20 | #ifndef _LM_H_
21 | #define _LM_H_
22 |
23 | #undef HAVE_LAPACK // HAVE_LAPACK is force-undefined here, so only the LAPACK-free declarations below are visible; comment this line out to honor a HAVE_LAPACK set by the build
24 |
25 | #define LINSOLVERS_RETAIN_MEMORY // comment this out if you don't want the routines in Axb.c to retain working memory between calls
26 |
27 | /* no changes necessary beyond this point */
28 |
29 | #ifdef __cplusplus
30 | extern "C" {
31 | #endif
32 |
33 |
34 | #define FABS(x) (((x)>=0.0)? (x) : -(x)) /* absolute value; NOTE: x is evaluated twice, so do not pass an argument with side effects */
35 |
36 | /* work arrays size for LM with & without jacobian, should be multiplied by sizeof(double)
37 | * or sizeof(float) to be converted to bytes
38 | */
39 | #define LM_DER_WORKSZ(npar, nmeas) (2*(nmeas) + 4*(npar) + (nmeas)*(npar) + (npar)*(npar))
40 | #define LM_DIF_WORKSZ(npar, nmeas) (3*(nmeas) + 4*(npar) + (nmeas)*(npar) + (npar)*(npar))
41 |
42 | #define LM_OPTS_SZ 5 /* max(4, 5) */
43 | #define LM_INFO_SZ 9
44 | #define LM_INIT_MU 1E-03
45 | #define LM_STOP_THRESH 1E-17
46 | #define LM_DIFF_DELTA 1E-06
47 | #define LM_VERSION "2.1.2 (Jan. 2006)"
48 |
49 | /* double precision LM, with & without jacobian */
50 | /* unconstrained minimization */
51 | extern int dlevmar_der(
52 | void (*func)(double *p, double *hx, int m, int n, void *adata),
53 | void (*jacf)(double *p, double *j, int m, int n, void *adata),
54 | double *p, double *x, int m, int n, int itmax, double *opts,
55 | double *info, double *work, double *covar, void *adata);
56 |
57 | extern int dlevmar_dif(
58 | void (*func)(double *p, double *hx, int m, int n, void *adata),
59 | double *p, double *x, int m, int n, int itmax, double *opts,
60 | double *info, double *work, double *covar, void *adata);
61 |
62 | /* box-constrained minimization */
63 | extern int dlevmar_bc_der(
64 | void (*func)(double *p, double *hx, int m, int n, void *adata),
65 | void (*jacf)(double *p, double *j, int m, int n, void *adata),
66 | double *p, double *x, int m, int n, double *lb, double *ub,
67 | int itmax, double *opts, double *info, double *work, double *covar, void *adata);
68 |
69 | extern int dlevmar_bc_dif(
70 | void (*func)(double *p, double *hx, int m, int n, void *adata),
71 | double *p, double *x, int m, int n, double *lb, double *ub,
72 | int itmax, double *opts, double *info, double *work, double *covar, void *adata);
73 |
74 | #ifdef HAVE_LAPACK
75 | /* linear equation constrained minimization */
76 | extern int dlevmar_lec_der(
77 | void (*func)(double *p, double *hx, int m, int n, void *adata),
78 | void (*jacf)(double *p, double *j, int m, int n, void *adata),
79 | double *p, double *x, int m, int n, double *A, double *b, int k,
80 | int itmax, double *opts, double *info, double *work, double *covar, void *adata);
81 |
82 | extern int dlevmar_lec_dif(
83 | void (*func)(double *p, double *hx, int m, int n, void *adata),
84 | double *p, double *x, int m, int n, double *A, double *b, int k,
85 | int itmax, double *opts, double *info, double *work, double *covar, void *adata);
86 | #endif /* HAVE_LAPACK */
87 |
88 |
89 | /* single precision LM, with & without jacobian */
90 | /* unconstrained minimization */
91 | extern int slevmar_der(
92 | void (*func)(float *p, float *hx, int m, int n, void *adata),
93 | void (*jacf)(float *p, float *j, int m, int n, void *adata),
94 | float *p, float *x, int m, int n, int itmax, float *opts,
95 | float *info, float *work, float *covar, void *adata);
96 |
97 | extern int slevmar_dif(
98 | void (*func)(float *p, float *hx, int m, int n, void *adata),
99 | float *p, float *x, int m, int n, int itmax, float *opts,
100 | float *info, float *work, float *covar, void *adata);
101 |
102 | /* box-constrained minimization */
103 | extern int slevmar_bc_der(
104 | void (*func)(float *p, float *hx, int m, int n, void *adata),
105 | void (*jacf)(float *p, float *j, int m, int n, void *adata),
106 | float *p, float *x, int m, int n, float *lb, float *ub,
107 | int itmax, float *opts, float *info, float *work, float *covar, void *adata);
108 |
109 | extern int slevmar_bc_dif(
110 | void (*func)(float *p, float *hx, int m, int n, void *adata),
111 | float *p, float *x, int m, int n, float *lb, float *ub,
112 | int itmax, float *opts, float *info, float *work, float *covar, void *adata);
113 |
114 | #ifdef HAVE_LAPACK
115 | /* linear equation constrained minimization */
116 | extern int slevmar_lec_der(
117 | void (*func)(float *p, float *hx, int m, int n, void *adata),
118 | void (*jacf)(float *p, float *j, int m, int n, void *adata),
119 | float *p, float *x, int m, int n, float *A, float *b, int k,
120 | int itmax, float *opts, float *info, float *work, float *covar, void *adata);
121 |
122 | extern int slevmar_lec_dif(
123 | void (*func)(float *p, float *hx, int m, int n, void *adata),
124 | float *p, float *x, int m, int n, float *A, float *b, int k,
125 | int itmax, float *opts, float *info, float *work, float *covar, void *adata);
126 | #endif /* HAVE_LAPACK */
127 |
128 | /* linear system solvers */
129 | #ifdef HAVE_LAPACK
130 | extern int dAx_eq_b_QR(double *A, double *B, double *x, int m);
131 | extern int dAx_eq_b_QRLS(double *A, double *B, double *x, int m, int n);
132 | extern int dAx_eq_b_Chol(double *A, double *B, double *x, int m);
133 | extern int dAx_eq_b_LU(double *A, double *B, double *x, int m);
134 | extern int dAx_eq_b_SVD(double *A, double *B, double *x, int m);
135 |
136 | extern int sAx_eq_b_QR(float *A, float *B, float *x, int m);
137 | extern int sAx_eq_b_QRLS(float *A, float *B, float *x, int m, int n);
138 | extern int sAx_eq_b_Chol(float *A, float *B, float *x, int m);
139 | extern int sAx_eq_b_LU(float *A, float *B, float *x, int m);
140 | extern int sAx_eq_b_SVD(float *A, float *B, float *x, int m);
141 | #else // no LAPACK
142 | extern int dAx_eq_b_LU_noLapack(double *A, double *B, double *x, int n);
143 |
144 | extern int sAx_eq_b_LU_noLapack(float *A, float *B, float *x, int n);
145 | #endif /* HAVE_LAPACK */
146 |
147 | /* jacobian verification, double & single precision */
148 | extern void dlevmar_chkjac(
149 | void (*func)(double *p, double *hx, int m, int n, void *adata),
150 | void (*jacf)(double *p, double *j, int m, int n, void *adata),
151 | double *p, int m, int n, void *adata, double *err);
152 |
153 | extern void slevmar_chkjac(
154 | void (*func)(float *p, float *hx, int m, int n, void *adata),
155 | void (*jacf)(float *p, float *j, int m, int n, void *adata),
156 | float *p, int m, int n, void *adata, float *err);
157 |
158 | #ifdef __cplusplus
159 | }
160 | #endif
161 |
162 | #endif /* _LM_H_ */
163 |
--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | RNAcode -- Analyze the protein coding potential in multiple sequence alignments
2 |
3 | 0. About RNAcode
4 | ================
5 |
6 | RNAcode predicts protein coding regions in an alignment of homologous
7 | nucleotide sequences. The prediction is based on evolutionary
8 | signatures typical for protein genes, i.e. the presence of
9 | synonymous/conservative nucleotide mutations, conservation of the
10 | reading frame and absence of stop codons.
11 |
12 | RNAcode does not rely on any species specific sequence characteristics
13 | whatsoever and does not use any machine learning techniques. The only
14 | input required for RNAcode is a multiple sequence alignment either in
15 | MAF or Clustal W format. RNAcode reports local regions of unusually high
16 | coding potential together with an associated p-value.
17 |
18 |
19 | 1. Installation
20 | ===============
21 |
22 | You can compile and install RNAcode like this:
23 |
24 | # ./configure
25 | # make
26 | # make install (as root)
27 |
28 | See INSTALL for details and more advanced installation options.
29 |
30 |
31 | 2. Usage
32 | ========
33 |
34 | 2.1 Synopsis
35 | ------------
36 |
37 | Analyze alignment (Clustal W or MAF form) with standard options and
38 | print detailed results page:
39 |
40 | # RNAcode data.aln
41 |
42 | Analyze alignment and show best non-overlapping hits below a p-value
43 | cutoff of 0.01 in gtf format:
44 |
45 | # RNAcode --outfile results.gtf --gtf --best-region --cutoff 0.01 data.aln
46 |
47 | Create color annotations for high scoring coding segments:
48 |
49 | # RNAcode --eps data.aln
50 |
51 |
52 | 2.2 Input alignment
53 | -------------------
54 |
55 | The input alignment needs to be formatted in ClustalW format or MAF
56 | format (http://genome.ucsc.edu/FAQ/FAQformat#format5). The latter
57 | format allows genomic coordinates to be included, which can be used to
58 | produce annotation files.
59 |
60 | Important: RNAcode uses the first sequence as reference sequence,
61 | i.e. all results and reported coding regions apply to this reference
62 | sequence.
63 |
64 | Currently the alignment has to contain at least 3 sequences. Gaps
65 | have to be given as dash ('-'). Unspecified letters given as 'N' are
66 | allowed and treated neutrally during all calculations. No difference is
67 | made between uppercase or lowercase input, i.e. 'softly'-repeat masked
68 | sequences which use lowercase letters for masked regions are treated
69 | the same way as unmasked sequences.
70 |
71 |
72 | 2.3. Command line
73 | -----------------
74 |
75 | RNAcode is invoked as follows:
76 |
77 | # RNAcode [OPTIONS] alignment.aln
78 |
79 | alignment.aln is the alignment file and OPTIONS is any of the following
80 | command-line options either given in one-letter form with a single
81 | dash or as long option with double dash:
82 |
83 | --outfile -o (default: stdout)
84 |
85 | File to which the output is written. Defaults to standard output.
86 |
87 | --cutoff -p (default: 1.0)
88 |
89 | Show only regions that have a p-value below the given number. By
90 | default all hits are shown.
91 |
92 | --num-samples -n (default: 100)
93 |
94 | Number of random alignments that are sampled to calculate the
95 | p-value. RNAcode estimates the significance of a coding prediction by
96 | sampling a given number of random alignments. Default is 100 which
97 | gives reasonably stable p-values that are useful for assessing the
98 | relevance of a prediction.
99 |
100 | --stop-early -s
101 |
102 | Setting this option stops the sampling process as soon as it is clear
103 | that the best hit will not fall below the given p-value cutoff. For
104 | example, assume a p-value cutoff of 0.05 (see --cutoff) and a sample
105 | size of 1000 is given (see --num-samples). As soon as 50 random
106 | samples score better than the original alignment, the process is
107 | stopped and all hits in the original alignment are reported as p>0.05
108 | (or by convention as 1.0 in gtf and tabular output).
109 |
110 | --best-region -r Show only best non-overlapping hits
111 |
112 | By default all positive scoring segments are shown in the output if
113 | they fall below the given p-value cutoff. If two hits overlap
114 | (different frame or different strand) and --best-region is given only
115 | the hit with the highest score is shown. Strong coding regions often
116 | lead to statistically significant signals also in other frames. These
117 | hits are suppressed by this option and only the correct reading frame
118 | is reported.
119 |
120 | --best-only -b Show only best hit
121 |
122 | This option shows only the single best hit for each alignment.
123 |
124 | --pars -c
125 |
126 | Scoring parameters as comma separated string:
127 |
128 | "DELTA,OMEGA,omega,stop_penalty"
129 |
130 | See the appendix of the Paper for an explanation for the meaning of
131 | these parameters. Default: "-10.0,-4.0,-2.0,-8.0"
132 |
133 | --gtf -g
134 | --tabular -t
135 |
136 | Changes the default output to two different machine readable formats
137 | (see next section).
138 |
139 | --eps -e
140 |
141 | Create colored plots in EPS format. The generated plots are resolution
142 | independent vector graphics that can be included in any graphics
143 | software. For each high scoring segment below a given cutoff (see
144 | --eps-cutoff) a file named hss-N.eps is created (N is the running
145 | number of the high scoring segment). See documents/color-legend.pdf
146 | for an explanation of the color scheme.
147 |
148 | --eps-cutoff -i
149 |
150 | Create plots only for high scoring segments with p better than this
151 | cutoff (default: 0.05)
152 |
153 | --eps-dir -d
154 |
155 | Directory to save EPS files in. Default: "eps"
156 |
157 |
158 |
159 | 2.4. Output format
160 | ------------------
161 |
162 | In the default output each prediction is reported on one line by 10 fields.
163 |
164 |
165 | 1. HSS id Unique running number for each high scoring segment
166 | predicted in one RNAcode call
167 |
168 |
169 | 2. Frame: The reading frame phasing relative to the starting
170 | nucleotide position in the reference sequence. +1 means
171 | that the first nucleotide in the reference sequence is in
172 | the same frame as the predicted coding region. Negative
173 | frames indicate that the predicted regions are on the
174 | reverse complement strand.
175 |
176 | 3. Length: The length of the predicted region in amino acids
177 |
178 | 4. From: The position of the first/last amino acid in the translated
179 | 5. To: nucleotide sequence of the reference sequence starting
180 | with 1.
181 |
182 | 6. Name The name of the reference sequence as given in the input alignment.
183 |
184 | 7. Start The nucleotide position in the reference sequence of the
185 |
186 | 8. End predicted coding region. If no genomic coordinates are given
187 | (if you provide a CLUSTAL W alignment as input) the first nucleotide position in
188 | the reference sequence is set to 1, otherwise the positions are the
189 | 1-based genomic coordinates as given in the input MAF file.
190 |
191 | 9. Score The coding potential score. High scores indicate high coding potential.
192 |
193 | 10. P The p-value associated with the score. This is the probability
194 | that a random alignment with same properties contains an equally good
195 | or better hit.
196 |
197 | If --tabular is given, the output is printed as tab-delimited list
198 | without header or any other output. With --gtf the output is formatted
199 | as GTF genome annotation file.
200 |
201 |
202 | 3. Citing RNAcode
203 | =================
204 |
205 | RNAcode: robust discrimination of coding and noncoding regions in
206 | comparative sequence data
207 | Washietl S, Findeiss S, Muller S, Kalkhof S, von Bergen M, Hofacker IL, Stadler PF, Goldman N
208 | RNA (2011), in revision
209 |
210 | 4. Contact
211 | ==========
212 |
213 | Stefan Washietl
214 |
--------------------------------------------------------------------------------
/seqgen/twister.c:
--------------------------------------------------------------------------------
1 | /*
2 | Sequence Generator - seq-gen, version 1.3.2
3 | Andrew Rambaut & Nick Grassly
4 | Department of Zoology, University of Oxford
5 |
6 | The code in this file is covered by the license and copyright message
7 | given below.
8 |
9 | Any feedback is very welcome.
10 | http://evolve.zoo.ox.ac.uk/software/Seq-Gen/
11 | email: andrew.rambaut@zoo.ox.ac.uk
12 | */
13 |
14 | /*
15 | A C-program for MT19937, with initialization improved 2002/1/26.
16 | Coded by Takuji Nishimura and Makoto Matsumoto.
17 |
18 | Before using, initialize the state by using init_genrand(seed)
19 | or init_by_array(init_key, key_length).
20 |
21 | Copyright (C) 1997 - 2002, Makoto Matsumoto and Takuji Nishimura,
22 | All rights reserved.
23 |
24 | Redistribution and use in source and binary forms, with or without
25 | modification, are permitted provided that the following conditions
26 | are met:
27 |
28 | 1. Redistributions of source code must retain the above copyright
29 | notice, this list of conditions and the following disclaimer.
30 |
31 | 2. Redistributions in binary form must reproduce the above copyright
32 | notice, this list of conditions and the following disclaimer in the
33 | documentation and/or other materials provided with the distribution.
34 |
35 | 3. The names of its contributors may not be used to endorse or promote
36 | products derived from this software without specific prior written
37 | permission.
38 |
39 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
40 | "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
41 | LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
42 | A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
43 | CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
44 | EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
45 | PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
46 | PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
47 | LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
48 | NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
49 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
50 |
51 |
52 | Any feedback is very welcome.
53 | http://www.math.sci.hiroshima-u.ac.jp/~m-mat/MT/emt.html
54 | email: m-mat @ math.sci.hiroshima-u.ac.jp (remove space)
55 | */
56 |
#include <stdio.h>
#include <time.h>
#include <limits.h>
#include "twister.h"

/* Period parameters */
#define N 624
#define M 397
#define MATRIX_A 0x9908b0dfUL   /* constant vector a */
#define UPPER_MASK 0x80000000UL /* most significant w-r bits */
#define LOWER_MASK 0x7fffffffUL /* least significant r bits */

static unsigned long mt[N]; /* the array for the state vector */
static int mti=N+1;         /* mti==N+1 means mt[N] is not initialized */

/*
 * Initializes the state vector mt[N] from a single seed.
 * Only the low 32 bits of s are used. After the call mti == N, so the
 * next genrand_int32() regenerates the whole state block first.
 */
void init_genrand(unsigned long s)
{
    mt[0]= s & 0xffffffffUL;
    for (mti=1; mti<N; mti++) {
        mt[mti] =
            (1812433253UL * (mt[mti-1] ^ (mt[mti-1] >> 30)) + mti);
        /* See Knuth TAOCP Vol2. 3rd Ed. P.106 for multiplier. */
        /* In the previous versions, MSBs of the seed affect   */
        /* only MSBs of the array mt[].                        */
        /* 2002/01/09 modified by Makoto Matsumoto             */
        mt[mti] &= 0xffffffffUL;
        /* for >32 bit machines */
    }
}

/*
 * Initializes the state vector from an array of keys.
 *   init_key   - array of initializing keys
 *   key_length - number of entries in init_key; must be >= 1, because
 *                init_key[0] is read unconditionally
 * (slight change for C++, 2004/2/26)
 */
void init_by_array(unsigned long init_key[], int key_length)
{
    int i, j, k;

    init_genrand(19650218UL);
    i=1; j=0;
    k = (N>key_length ? N : key_length);
    for (; k; k--) {
        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1664525UL))
          + init_key[j] + j; /* non linear */
        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
        i++; j++;
        if (i>=N) { mt[0] = mt[N-1]; i=1; }
        if (j>=key_length) j=0;
    }
    for (k=N-1; k; k--) {
        mt[i] = (mt[i] ^ ((mt[i-1] ^ (mt[i-1] >> 30)) * 1566083941UL))
          - i; /* non linear */
        mt[i] &= 0xffffffffUL; /* for WORDSIZE > 32 machines */
        i++;
        if (i>=N) { mt[0] = mt[N-1]; i=1; }
    }

    mt[0] = 0x80000000UL; /* MSB is 1; assuring non-zero initial array */
}

/*
 * Generates a random number on the [0,0xffffffff] interval.
 * If the generator was never seeded, the default seed 5489 is used.
 */
unsigned long genrand_int32(void)
{
    unsigned long y;
    static const unsigned long mag01[2]={0x0UL, MATRIX_A};
    /* mag01[x] = x * MATRIX_A  for x=0,1 */

    if (mti >= N) { /* generate N words at one time */
        int kk;

        if (mti == N+1)           /* if init_genrand() has not been called, */
            init_genrand(5489UL); /* a default initial seed is used */

        for (kk=0;kk<N-M;kk++) {
            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
            mt[kk] = mt[kk+M] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        for (;kk<N-1;kk++) {
            y = (mt[kk]&UPPER_MASK)|(mt[kk+1]&LOWER_MASK);
            mt[kk] = mt[kk+(M-N)] ^ (y >> 1) ^ mag01[y & 0x1UL];
        }
        y = (mt[N-1]&UPPER_MASK)|(mt[0]&LOWER_MASK);
        mt[N-1] = mt[M-1] ^ (y >> 1) ^ mag01[y & 0x1UL];

        mti = 0;
    }

    y = mt[mti++];

    /* Tempering */
    y ^= (y >> 11);
    y ^= (y << 7) & 0x9d2c5680UL;
    y ^= (y << 15) & 0xefc60000UL;
    y ^= (y >> 18);

    return y;
}

/* Returns a random integer on the [0,0x7fffffff] interval: the next
   32-bit generator output with its lowest bit shifted out. */
long genrand_int31(void)
{
    const unsigned long word = genrand_int32();

    return (long)(word >> 1);
}

/* Returns a random real on the closed [0,1] interval. */
double genrand_real1(void)
{
    /* scaling by the reciprocal of 2^32-1 */
    const double scale = 1.0/4294967295.0;

    return genrand_int32()*scale;
}

/* Returns a random real on the half-open [0,1) interval. */
double genrand_real2(void)
{
    /* scaling by the reciprocal of 2^32 */
    const double scale = 1.0/4294967296.0;

    return genrand_int32()*scale;
}

/* Returns a random real on the open (0,1) interval. */
double genrand_real3(void)
{
    /* the half-step offset keeps the result strictly above 0 */
    const double shifted = ((double)genrand_int32()) + 0.5;

    /* scaling by the reciprocal of 2^32 */
    return shifted*(1.0/4294967296.0);
}

/* Returns a random real on [0,1) with 53-bit resolution, assembled from
   two consecutive 32-bit generator outputs. */
double genrand_res53(void)
{
    unsigned long hi;
    unsigned long lo;

    hi = genrand_int32()>>5; /* first draw, low 5 bits dropped */
    lo = genrand_int32()>>6; /* second draw, low 6 bits dropped */

    return (hi*67108864.0+lo)*(1.0/9007199254740992.0);
}
/* These real versions are due to Isaku Wada, 2002/01/09 added */

/* Seeds the generator with the given value by forwarding it to
   init_genrand(). */
void SetSeed(unsigned long seed)
{
    init_genrand(seed);
}

/* Folds the object representation of a value into one unsigned long,
   one byte at a time, using UCHAR_MAX + 2 as the radix. */
static unsigned long HashObjectBytes(const void *obj, size_t size)
{
    const unsigned char *bytes = (const unsigned char *) obj;
    unsigned long h = 0;
    size_t i;

    for( i = 0; i < size; ++i )
    {
        h *= UCHAR_MAX + 2U;
        h += bytes[i];
    }
    return h;
}

/*
 * Builds a seed from the current calendar time and the processor clock.
 *
 * Get a uint32 from t and c.
 * Better than uint32(x) in case x is floating point in [0,1].
 * Based on code by Lawrence Kirby (fred@genesis.demon.co.uk)
 *
 * A static counter is added on every call so that the time-based part of
 * the seed changes even when time() returns the same value again.
 */
unsigned long CreateSeed(void)
{
    static unsigned long differ = 0; /* incremented once per call */

    time_t t = time(NULL);
    clock_t c = clock();

    unsigned long h1 = HashObjectBytes(&t, sizeof(t));
    unsigned long h2 = HashObjectBytes(&c, sizeof(c));

    return ( h1 + differ++ ) ^ h2;
}

/* Returns a uniform random real on [0,1]; thin wrapper around
   genrand_real1(). */
double rndu(void)
{
    return genrand_real1();
}

--------------------------------------------------------------------------------