├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── examples
    ├── MSA_RF00167.fa
    ├── pydca_demo.ipynb
    ├── pydca_demo.py
    └── ref_RF00167.fa
├── how_to_docker.md
├── install.sh
├── pydca
    ├── __init__.py
    ├── config_dca
    │   ├── __init__.py
    │   └── config_log.py
    ├── contact_visualizer
    │   ├── __init__.py
    │   └── contact_visualizer.py
    ├── dca_utilities
    │   ├── __init__.py
    │   └── dca_utilities.py
    ├── extras
    │   └── a2m2aln.pl
    ├── fasta_reader
    │   ├── __init__.py
    │   └── fasta_reader.py
    ├── main.py
    ├── meanfield_dca
    │   ├── __init__.py
    │   ├── meanfield_dca.py
    │   └── msa_numerics.py
    ├── mfdca_main.py
    ├── msa_trimmer
    │   ├── __init__.py
    │   └── msa_trimmer.py
    ├── plmdca
    │   ├── __init__.py
    │   ├── include
    │   │   └── plmdca.h
    │   ├── lbfgs
    │   │   ├── include
    │   │   │   ├── arithmetic_ansi.h
    │   │   │   ├── arithmetic_sse_double.h
    │   │   │   ├── arithmetic_sse_float.h
    │   │   │   └── lbfgs.h
    │   │   └── lib
    │   │   │   └── lbfgs.cpp
    │   ├── msa_numerics.py
    │   ├── plmdca.py
    │   ├── plmdcaBackend.cpp
    │   └── plmdca_numerics.cpp
    ├── plmdca_main.py
    └── sequence_backmapper
    │   ├── __init__.py
    │   ├── scoring_matrix.py
    │   └── sequence_backmapper.py
├── requirements.txt
├── setup.py
└── tests
    ├── __init__.py
    ├── fasta_reader_test.py
    ├── input_files_path.py
    ├── meanfield_dca_test.py
    ├── sequence_backmapper_test.py
    └── tests_input
        ├── MSA_RF00059_trimmed_gap_treshold_50.fa
        ├── PF02826.faa
        ├── ref_seq_PF02826.faa
        ├── ref_seq_RF00059.faa
        ├── ref_seq_RF00059_test1.faa
        ├── ref_seq_RF00059_test2.faa
        ├── ref_seq_RF00059_test3.faa
        └── ref_seq_RF00059_test4.faa


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | Pipfile*
  7 | # C extensions
  8 | *.so
  9 | 
 10 | # Distribution / packaging
 11 | .Python
 12 | build/
 13 | develop-eggs/
 14 | dist/
 15 | downloads/
 16 | eggs/
 17 | .eggs/
 18 | lib/
 19 | lib64/
 20 | parts/
 21 | sdist/
 22 | var/
 23 | wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .coverage
 43 | .coverage.*
 44 | .cache
 45 | nosetests.xml
 46 | coverage.xml
 47 | *.cover
 48 | .hypothesis/
 49 | .pytest_cache/
 50 | 
 51 | # Translations
 52 | *.mo
 53 | *.pot
 54 | 
 55 | # Django stuff:
 56 | *.log
 57 | local_settings.py
 58 | db.sqlite3
 59 | 
 60 | # Flask stuff:
 61 | instance/
 62 | .webassets-cache
 63 | 
 64 | # Scrapy stuff:
 65 | .scrapy
 66 | 
 67 | # Sphinx documentation
 68 | docs/_build/
 69 | 
 70 | # PyBuilder
 71 | target/
 72 | 
 73 | # Jupyter Notebook
 74 | .ipynb_checkpoints
 75 | 
 76 | # pyenv
 77 | .python-version
 78 | 
 79 | # celery beat schedule file
 80 | celerybeat-schedule
 81 | 
 82 | # SageMath parsed files
 83 | *.sage.py
 84 | 
 85 | # Environments
 86 | .env
 87 | .venv
 88 | env/
 89 | venv/
 90 | ENV/
 91 | env.bak/
 92 | venv.bak/
 93 | 
 94 | # Spyder project settings
 95 | .spyderproject
 96 | .spyproject
 97 | 
 98 | # Rope project settings
 99 | .ropeproject
100 | 
101 | # mkdocs documentation
102 | /site
103 | 
104 | # mypy
105 | .mypy_cache/
106 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2019 Multiscale Biomolecular Simulation
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include pydca/plmdca/include/*
2 | include pydca/plmdca/lbfgs/include/*


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # About `pydca`
 2 | > [!CAUTION]
 3 | > Unfortunately, pyDCA is based on specific python versions and libraries. If you want to use it, we recommend utilizing a Docker container. An example setup is shown [here](how_to_docker.md).
 4 | 
 5 | `pydca` is a Python implementation of direct coupling analysis (DCA) of residue coevolution for protein and RNA sequence families using the **__mean-field__** and **__pseudolikelihood maximization__** algorithms. Given multiple sequence alignment (MSA) files in FASTA format, `pydca` computes the coevolutionary scores of pairs of sites in the alignment. In addition, when an optional file containing a reference sequence is supplied, scores corresponding to pairs of sites of this reference sequence are computed by mapping the reference sequence to the MSA. The software provides command line utilities, and it can also be used as a library. 
 6 | 
 7 | # Prerequisites
 8 | `pydca` is implemented mainly in Python with the pseudolikelihood maximization parameter inference part implemented using C++ backend for optimization. To install pydca and successfully carry out DCA computations, the following are required. 
 9 | * Python 3, version 3.5 or later.
10 | * C++ compiler that supports C++11 (e.g. the GNU compiler collection).
11 | * Optionally, OpenMP for multithreading support.
12 | 
13 | 
14 | # Installing
15 | To install the current version of `pydca` from PyPI, run on the command line
16 | ```bash
17 | $ pip install pydca
18 | ```
19 | or you can use the `install.sh` bash script as 
20 | ```bash 
21 | $ source install.sh
22 | ```
23 | 
24 | # Using `pydca` as a Python Library
25 | After installation, pydca can be imported into other Python source codes and used. 
26 | [Here is IPython Notebook example](https://github.com/KIT-MBS/pydca/blob/master/examples/pydca_demo.ipynb). 
27 | If you encounter a problem opening the IPython Notebook example, copy and paste the URL [here](https://nbviewer.jupyter.org/).
28 | 
29 | # Running `pydca` From Command Line
30 | When `pydca` is installed, it provides three main commands, namely `pydca`, `plmdca`, and `mfdca`. 
31 | The command `pydca` is used for tasks such as trimming alignment data before DCA computation, and 
32 | visualization of contact maps or true positive rates. The other two commands are associated with 
33 | DCA computation with the pseudolikelihood maximization algorithm (plmDCA) or the mean-field algorithm (mfDCA).
34 | Below we show some usage examples of all the three commands.
35 | ## Trimming MSA data 
36 | Trim gaps by reference sequence:
37 | ```bash
38 | $ pydca trim_by_refseq <biomolecule>  <alignment.fa>  <refseq_file.fa> --remove_all_gaps --verbose
39 | ```
40 | Trim by percentage of gaps in MSA columns:
41 | ```bash 
42 | $ pydca trim_by_gap_size <alignment.fa> --max_gap 0.9 --verbose
43 | ```
44 | ### DCA Computation
45 | #### Using `pydca`'s Pseudolikelihood Maximization Algorithm
46 | ```bash 
47 | $ plmdca compute_fn <biomolecule> <alignment.fa> --max_iterations 500 --num_threads 6 --apc --verbose 
48 | ```
49 | We can also set the values of the regularization parameters:
50 | ```bash
51 | $ plmdca compute_fn <biomolecule> <alignment.fa> --apc --lambda_h 1.0 --lambda_J 50.0 --verbose 
52 | ```
53 | The command `compute_fn` computes DCA scores obtained from the Frobenius norm of the couplings. `--apc` performs
54 | average product correction (APC). To obtain DCA scores from direct-information (DI) we replace the subcommand 
55 | `compute_fn` by `compute_di`. 
56 | #### Using `pydca`'s Mean-Field Algorithm 
57 | ```bash
58 | $ mfdca compute_fn <biomolecule> <alignment.fa> --apc --pseudocount 0.5 --verbose
59 | ```
60 | ### Contact Map Visualization 
61 | When protein/RNA sequence family has a resolved PDB structure, we can evaluate the 
62 | performance of `pydca` by contact map visualization. Example:
63 | ```bash
64 | $ pydca plot_contact_map <biomolecule> <PDB_chain_name> <PDB_id/PDB_file.PDB> <refseq.fa> <DCA_file.txt> --verbose  
65 | ```
66 | ### Plotting True Positive Rate
67 | In addition to contact map we can evaluate the performance of `pydca` by plotting 
68 | the true positive rate. 
69 | ```bash
70 | $ pydca plot_tp_rate <biomolecule> <PDB_chain_name> <PDB_id/PDB_file.PDB> <refseq.fa> <DCA_file.txt> --verbose
71 | ```
72 | To get help message about a (sub)command  we use, for example, 
73 | ```bash
74 | $ pydca --help
75 | ```
76 | ```bash
77 | $ plmdca compute_fn  --help
78 | ```
79 | 
80 | # References
81 | ### If you use pydca for your work please cite the following references
82 | 1. Zerihun, MB., Pucci, F, Peter, EK, and Schug, A. <br>
83 | pydca: v1.0: a comprehensive software for direct coupling analysis of RNA and protein sequences <br>
84 |  Bioinformatics, btz892, doi.org/10.1093/bioinformatics/btz892
85 | 
86 | 2. Morcos, F., Pagnani, A., Lunt, B., Bertolino, A., Marks, DS., Sander, C., Zecchina, R., Onuchic, JN., Hwa, T., and Weigt, M. <br>
87 | Direct-coupling analysis of residue coevolution captures native contacts across many protein families <br>
88 | PNAS December 6, 2011 108 (49) E1293-E1301, doi:10.1073/pnas.1111471108
89 | 
90 | 3. Ekeberg, M., Lövkvist, C., Lan, Y., Weigt, M., & Aurell, E. (2013). <br>
91 | Improved contact prediction in proteins: Using pseudolikelihoods to infer Potts models. <br>
92 | Physical Review E, 87(1), 012707, doi:10.1103/PhysRevE.87.012707
93 | 


--------------------------------------------------------------------------------
/examples/pydca_demo.py:
--------------------------------------------------------------------------------
 1 | #import pydca  or selected modules
 2 | import pydca 
 3 | from pydca.plmdca import plmdca 
 4 | from pydca.meanfield_dca import meanfield_dca 
 5 | from pydca.sequence_backmapper import sequence_backmapper
 6 | from pydca.msa_trimmer import msa_trimmer 
 7 | 
 8 | """Demonstrates usage of pydca within Python scripts
 9 | """
10 | 
# Paths to the input files used by the demo; fill these in before running.
protein_msa_file = ''
protein_refseq_file = ''
rna_msa_file = ''
rna_refseq_file = ''


# Creating PlmDCA instance and using it

# The alignment path must be the variable defined above (protein_msa_file);
# the name `msa_file` is not defined anywhere in this script.
plmdca_protein = plmdca.PlmDCA('protein', protein_msa_file,
    seqid=0.7,
    lambda_h=0.5,
    lambda_J=50.0,
    num_threads=6,
)

# compute sorted Frobenius norm of the couplings, average product corrected
fn_scores_apc = plmdca_protein.compute_sorted_FN_APC()

# compute sorted Frobenius norm of couplings, without average product correction
fn_scores_raw = plmdca_protein.compute_sorted_FN()

# compute DCA scores summarized by direct information (DI), average product corrected. 


--------------------------------------------------------------------------------
/examples/ref_RF00167.fa:
--------------------------------------------------------------------------------
1 | >0000|REFERENCE
2 | CGCUUCAUAUAAUCCUAAUGAUAUGGUUUGGGAGUUUCUACCAAGAGCCUUAAACUCUUGAUUAUGAAGUG
3 | 


--------------------------------------------------------------------------------
/how_to_docker.md:
--------------------------------------------------------------------------------
 1 | # pyDCA: Docker Workaround
 2 | 
 3 | Create a Dockerfile with the following content:
 4 | ```bash
 5 | FROM --platform=linux/amd64 python:3.7.8
 6 | 
 7 | RUN pip install pydca
 8 | WORKDIR /data
 9 | ```
10 | 
11 | Build your docker image in the terminal:
12 | ```bash
13 | $ docker build -t pydca:latest .
14 | ```
15 | 
16 | 
17 | Run a new container interactively in your terminal:
18 | ```bash
19 | $ mkdir -p data
20 | $ docker run -it --platform linux/amd64 --volume $(pwd)/data:/data pydca:latest bash
21 | ```
22 | 


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | #----------------------------------------
 4 | # Installs the current version of pydca from PyPI 
 5 | # into the current user's home directory
 6 | # ---------------------------------------
 7 | 
 8 | MINIMUM_PYTHON_VERSION='3.5'
 9 | MINIMUM_PYTHON_VERSION_COMPARE=35
10 | echo "Python 3 minimum version required: ${MINIMUM_PYTHON_VERSION}"
11 | if command -v python3 &> /dev/null; then
12 | 	PYTHON_VERSION=$(python -c 'import sys; version=sys.version_info[:3]; print("{0}.{1}".format(*version))')
13 | 	PYTHON_VERSION_COMPARE=$(python -c 'import sys; version=sys.version_info[:3]; print("{0}{1}".format(*version))')
14 | 	if [ "${PYTHON_VERSION_COMPARE}" -lt "${MINIMUM_PYTHON_VERSION_COMPARE}" ]; then 
15 | 			echo "ERROR: You have older Python version: version=${PYTHON_VERSION}"
16 | 	else
17 | 		echo "Python 3 version found: ${PYTHON_VERSION}"
18 | 		echo 'installing pydca'
19 | 		pip install pydca
20 | 	fi
21 | else 
22 | 	echo 'You need Python 3 version 3.5 or later to install pydca'
23 | fi
24 | 


--------------------------------------------------------------------------------
/pydca/__init__.py:
--------------------------------------------------------------------------------
1 | """pydca is a Python implementation of Direct Coupling Analysis for protein and RNA 
2 | sequences. It implements two flavors of DCA: mean-field and pseudolikelihood maximization.
3 | 
4 | Both the mean-field and pseudolikelihood maximization algorithms provide Python API. 
5 | The mean-field algorithm is implemented in Python whereas the pseudolikelihood parameter
6 | inference part is implemented using C++11 backend.
7 | """
8 | 


--------------------------------------------------------------------------------
/pydca/config_dca/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements configurations for DCA computation. Example, logging configuration.
2 | """
3 | 


--------------------------------------------------------------------------------
/pydca/config_dca/config_log.py:
--------------------------------------------------------------------------------
 1 | 
 2 | """A sample configuration for logging. This is a suggestion and you are free to
 3 | choose your own preferred way of configuring loggers.
 4 | """
 5 | 
 6 | LOGGING_CONFIG = {
 7 |     'version':1,
 8 |     'disable_existing_loggers':False,
 9 |     'formatters':{
10 |         'verbose':{
11 |             'format':'%(levelname)s %(asctime)s %(module)s %(funcName)s %(message)s'
12 |         },
13 |         'simple':{
14 |             'format':'%(levelname)s %(message)s'
15 |         },
16 |     },
17 |     'handlers':{
18 |         'console':{
19 | 		    'level':'INFO',
20 | 			'class':'logging.StreamHandler',
21 |             'formatter':'verbose',
22 | 		},
23 | 	},
24 | 	'loggers':{
25 | 		'':{
26 | 			'handlers':['console'],
27 | 			'level':'DEBUG',
28 | 			'propagate':True,
29 | 		},
30 | 	},
31 | 
32 | }
33 | 
class ConsoleColor:
    """Terminal escape sequences used to color logging messages.

    Attributes
    ----------
        nocolor : str
            Sequence that switches coloring off again.
        green : str
            Color for messages of logging level INFO.
        yellow : str
            Color for messages of logging level WARNING.
        red : str
            Color for messages of logging level ERROR.
    """
    nocolor = '\033[0;0m'   # reset
    green = '\033[0;32m'    # INFO
    yellow = '\033[0;33m'   # WARNING
    red = '\033[0;31m'      # ERROR
52 | 


--------------------------------------------------------------------------------
/pydca/contact_visualizer/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements visualization tools for contact map comparison and true positive
2 | plots.
3 | """
4 | 


--------------------------------------------------------------------------------
/pydca/dca_utilities/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements utilities that perform file input/output operations during DCA
2 | computations.
3 | """
4 | 


--------------------------------------------------------------------------------
/pydca/extras/a2m2aln.pl:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/perl -w
  2 | # Modified perl script to process alignment data. 
  3 | #
  4 | use strict;
  5 | use warnings;
  6 | use Carp qw| cluck :DEFAULT |;
  7 | use Getopt::Long qw(:config gnu_getopt auto_version auto_help);
  8 | use Pod::Usage;
  9 | BEGIN { our $VERSION = "1.0"; }
 10 | 
# Parse the command line into the hash reference $opt. Recognised options:
# --debug/--nodebug, --man, --query/-q <regexp> and --quiet/--noquiet.
# If option parsing fails, pod2usage(2) is called.
our $opt;
if( !GetOptions( $opt = { debug => 0, }, 'debug!', 'man!', 'query|q=s', 'quiet!' ) ) { pod2usage(2); }

# NOTE: $dbg is assigned here but not read anywhere else in this script.
my $dbg = $opt->{debug};
if($opt->{man}){ pod2usage(-verbose => 2); }

# --query is mandatory; without it an error message plus usage goes to STDERR.
if(!$opt->{query}) { pod2usage(  -message => "Error: required --query argument is missing!\n", -exitval => 1, -verbose => 1, -output  => \*STDERR ); }

# $query  : the [ description, sequence ] pair of the first record whose
#           description matches the --query regexp.
# $qstart : numeric value of the regexp's first capture group, if defined;
#           it is only used by the commented-out header code further down.
# @seqs   : all records read from STDIN, with the query moved to the front.
my $query;
my $qstart;
my $seqsinfile = _read_fasta_file();
my @seqs;

# $seq = [ 'QUERY/1-126', 'ADKELKFLVVDDFSTMRRIVRNLLKELGFNNVEEAEDG...' ]
foreach my $seq (@$seqsinfile)
{
    # Only the first matching record becomes the query; once $query is set,
    # every later record (matching or not) takes the else branch.
    if(!defined($query) && $seq->[0] =~ /$opt->{query}/o)
    {
        $qstart = ( defined($1) ? $1+0 : undef );
        unshift @seqs, ( $query = $seq ); # query may match multiple times, take the first only
    }
    else
    {
        push @seqs, $seq;
    }
}

# NOTE(review): there is no explicit check that a query record was found; if
# no description matched, $query is still undefined at this point -- confirm
# whether that case should be reported.

# Collect the column indices at which the query sequence has an upper-case
# letter A-Z; only these columns are written to the output.
my @querycols = ();
for( my $i = 0, my $s = $query->[1], my $l = length($query->[1]); $i < $l; ++$i )
{
    if(substr($s, $i, 1) =~ /[A-Z]/o){ push @querycols, $i; }
    #  if(substr($s, $i, 0) =~ /[>]/o){ push @querycols, $i; }
}

# The optional '# querystart=' and '# query=' header lines are disabled here.
#if(defined($qstart)) { print STDOUT "# querystart=$qstart\n"; }
#{
#    my $qnogaps = $query->[1]; $qnogaps =~ tr/[A-Za-z]//cd;
#    print STDOUT "# query=", $qnogaps, "\n";
#}
#
#
# Write every record whose aligned length equals the query's length: a header
# line '>.' followed by the description, then the characters found at the
# query columns. Records of a different length are skipped with a warning.
foreach my $seq (@seqs)
{

    if(length($query->[1]) != length($seq->[1]))
    {
        warn("length of aligned sequence '".$seq->[0]."' (".length($seq->[1]).") does not equal to query length (".length($query->[1])."), skipping sequence\n");
        next;
    }
    # we just do default METHOD_TO_RESOLVE_AMBIGUOUS_RESIDUES=2 of calculate_evolutionary_constraints.m: skip entire sequence
    # (the filter itself is commented out, so --quiet currently has no effect)
    #if($seq->[1] =~ /[BJOXZ]/o){ if(!$opt->{quiet}){ warn("skipped '".$seq->[0]."' because it has [BJOXZ] in it\n"); } next; }

    print STDOUT ">.$seq->[0]\n";
    # Split the sequence into single characters and take the slice at @querycols.
    print STDOUT @{[split(//o, $seq->[1])]}[ @querycols ], "\n"; # neat
}

exit(0);
 68 | 
# Reads FASTA-style records from STDIN.
#
# Returns a reference to an array of [ description, sequence ] pairs in input
# order. The description is the header line without its leading '>' and
# trailing newline; the sequence is the concatenation of the chomped lines
# that follow the header up to the next header or end of input.
# Lines appearing before the first '>' header are ignored.
sub                 _read_fasta_file
{
    my $ret = [];
    my( $desc, $seq ) = ( undef, '' );
    my $line = <STDIN>;
    # NOTE: the loops test the truth of $line, so they end at end of input
    # (undef); a final line consisting solely of "0" with no newline would
    # also be false and end the loop.
    while($line)
    {
        # Skip anything that is not a header line.
        if(substr($line,0,1) ne '>'){ $line = <STDIN>; next; }
        #
        # A new header begins: store the record collected so far, if any.
        if(defined($desc)){ push @$ret, [ $desc, $seq ]; $desc = undef; $seq = ''; }
        #
        chomp($line);
        $desc = substr($line, 1);
        $line = <STDIN>;
        # Accumulate sequence lines until the next header or end of input.
        while($line)
        {
            if(substr($line,0,1) ne '>'){ chomp($line); $seq .= $line; $line = <STDIN>; }
            else{ last; }
        }
    }
    # Store the last record, which is not followed by another header.
    if(defined($desc)){ push @$ret, [ $desc, $seq ]; }
    #
    return $ret;
}
 93 | 
 94 | =pod
 95 | 
 96 | =head1 NAME
 97 | 
 98 | a2m2aln - reformat A2M input to a simple alignment format
 99 | 
100 | =head1 SYNOPSIS
101 | 
102 | a2m2aln [OPTIONS]
103 | 
104 | a2m2aln --query '^RASH_HUMAN/(\d+)' --quiet < INFILE > OUTFILE
105 | 
106 | a2m2aln --man --help --version
107 | 
108 | =head1 DESCRIPTION
109 | 
110 | a2m2aln formats L<A2M|http://compbio.soe.ucsc.edu/a2m-desc.html> input to a simple alignment format used by freecontact(1):
111 | 
112 |  * Optional header line: '# querystart=<position>'.
113 |  * Optional header line: '# query=<SEQUENCE>'. Lowercase letters in <SEQUENCE> indicate insertions that were deleted from the alignment.
114 |  * One aligned sequence per line.
115 |  * The first sequence is the query.
116 | 
117 | All gaps and insertions - also query insertions - are removed from the alignment.  The 'query' header field helps reconstruct original query residue numbers.
118 | 
119 | =head1 OPTIONS
120 | 
121 | =over
122 | 
123 | =item -q, --query REGEXP
124 | 
125 | Query identifier, a regular expression, e.g. '^RASH_HUMAN\b' to match 'RASH_HUMAN/5-165'.  Required.
126 | 
127 | Use parentheses to match query start, e.g. '^RASH_HUMAN/(\d+)'. Matching the query start position is optional.
128 | 
129 | =item --debug
130 | 
131 | =item --nodebug
132 | 
133 | =item --help
134 | 
135 | =item --man
136 | 
137 | =item --quiet
138 | 
139 | Suppress printing of messages like 'sequences skipped for unusual residue letters'.
140 | 
141 | =item --version
142 | 
143 | =back
144 | 
145 | =head1 AUTHOR
146 | 
147 | Laszlo Kajan <lkajan@rostlab.org>
148 | 
149 | =head1 SEE ALSO
150 | 
151 | L<freecontact(1)>
152 | 
153 | =cut
154 | 
155 | #:vim:ts=4:et:ai:
156 | 


--------------------------------------------------------------------------------
/pydca/fasta_reader/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements input readers from FASTA formatted files.
2 | """
3 | 


--------------------------------------------------------------------------------
/pydca/fasta_reader/fasta_reader.py:
--------------------------------------------------------------------------------
  1 | import logging
  2 | from Bio import AlignIO
  3 | 
  4 | """Reads alignment data from FASTA files and can convert the residue representation
  5 | of the sequences from char to int or vice versa.
  6 | 
  7 | Author: Mehari B. Zerihun
  8 | """
  9 | 
 10 | __all__ = [
 11 |     'get_alignment_from_fasta_file',
 12 |     'get_alignment_int_form',
 13 |     'get_alignment_char_form',
 14 |     'sequences_to_char_form',
 15 | ]
 16 | 
 17 | logger = logging.getLogger(__name__)
 18 | 
 19 | #Protein residues  (the numbers below merely enumerate the residues; the
 20 | #integer codes actually used for conversion are those in RES_TO_INT_ALL)
 21 | #1. Alanine - ala - A, 2. Arginine - arg - R, 3. Asparagine - asn - N
 22 | #4. Aspartic acid - asp - D, 5. Cysteine -cys - C, 6. Glutamine - gln - Q
 23 | #7. Glutamic acid - glu - E, 8. Glycine - gly - G, 9. Histidine - his - H
 24 | #10. Isoleucine - ile - I, 11. Leucine - leu - L, 12. Lysine - lys - K
 25 | #13. Methionine - met - M, 14. Phenylalanine - phe - F, 15. Proline - pro - P
 26 | #16. Serine - ser - S, 17. Threonine - thr - T, 18. Tryptophan - trp - W
 27 | #19. Tyrosine - tyr - Y, 20. Valine - val - V
 28 | 
 29 | #RNA nucleotides
 30 | #====================================
 31 | #1. Adenine - A, 2. Cytosine - C
 32 | #3. Guanine - G, 4. Uracil - U
 33 | 
 34 | RES_TO_INT_ALL = {
 35 |     'PROTEIN':{
 36 |         'A': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5,
 37 |         'G': 6, 'H': 7, 'I': 8, 'K': 9, 'L': 10,
 38 |         'M': 11, 'N': 12, 'P': 13, 'Q': 14, 'R': 15,
 39 |         'S': 16, 'T': 17, 'V': 18, 'W':19, 'Y':20,
 40 |         '-':21, '.':21, '~':21,
 41 |     },
 42 |     'RNA':{
 43 |         'A':1, 'C':2, 'G':3, 'U':4, '-':5, '.':5, '~':5,
 44 |     },
 45 | }
 46 | 
 47 | 
 48 | class FastaReaderError(Exception):
 49 |     """Raise exceptions related to reading alignment data
 50 |     """
 51 | 
 52 | 
 53 | def res_to_char(biomolecule):
 54 |     """Creates a mapping for residues from char to int to
 55 |     int to char.
 56 | 
 57 |     Parameters
 58 |     ----------
 59 |         biomolecule : str
 60 |             Type of biomolecule residues represent
 61 |             (must be protein or rna)
 62 | 
 63 |     Returns
 64 |     -------
 65 |         RES_TO_CHAR : dict
 66 |             A dictionary of residues with the keys in
 67 |             int representation and the values in char representation
 68 |     """
 69 |     biomolecule = biomolecule.strip().upper()
 70 |     RES_TO_INT = RES_TO_INT_ALL[biomolecule]
 71 |     EXCLUDE = ['.', '~']
 72 | 
 73 |     RES_TO_CHAR = {
 74 |         val:key for key, val in RES_TO_INT.items() if key not in EXCLUDE
 75 |     }
 76 |     return RES_TO_CHAR
 77 | 
 78 | 
 79 | def get_alignment_from_fasta_file(file_name):
 80 |     """Read sequences from FASTA file using Bio.AlignIO.read()
 81 | 
 82 |     Parameters
 83 |     ----------
 84 |         file_name : str
 85 |             Path to FASTA formatted file.
 86 | 
 87 |     Returns
 88 |     -------
 89 |         alignment : list
 90 |             A list of biomolecular sequence strings.
 91 |     """
 92 |     alignment = []
 93 |     try:
 94 |         record_iterator = AlignIO.read(file_name, 'fasta')
 95 |         #biopython just reads the records if there are tags (>some key).
 96 |         #It doesn't know if the file is really a biological sequence or not
 97 |     except Exception as expt:
 98 |         error_msg='\n\tError occured while reading from fasta file: {}.' +\
 99 |             '\n\tError type:{}\n\tArguments:{!r}'
100 |         logger.error(error_msg.format(file_name, type(expt).__name__, expt.args))
101 |         raise
102 |     else:
103 |         if any(True for _ in record_iterator):
104 |             for record in record_iterator:
105 |                 seq = record.seq.strip()
106 |                 if seq: alignment.append(seq.upper())
107 |             if not alignment:
108 |                 logger.error(
109 |                     '\n\trecord_iterator returned by AlignIO.read()'
110 |                     ' has no sequences',
111 |                 )
112 |                 raise ValueError
113 | 
114 |         else:
115 |             logger.error(
116 |                 '\n\trecord_iterator returned by AlignIO.read() is empty',
117 |             )
118 |             raise ValueError
119 |     return alignment
120 | 
121 | 
def alignment_letter2int(alignment, biomolecule='protein'):
    """Convert aligned sequences from one-letter to integer representation.

    Each residue is looked up case-insensitively in the RES_TO_INT_ALL table
    of the given biomolecule. Symbols missing from the table are counted as
    non-standard residues and encoded with the highest state number (21 for
    protein, 5 for RNA), which is the code the table also assigns to gaps.
    A sequence whose integer form has already been seen is dropped, so the
    result holds unique sequences in order of first appearance.

    Parameters
    ----------
        alignment : iterable
            Sequences in one-letter (character) representation.
        biomolecule : str
            Type of biomolecule, protein or RNA (case and surrounding
            whitespace are ignored).

    Returns
    -------
        alignment_int_form : list
            Unique sequences, each one a list of integers.

    Raises
    ------
        ValueError
            If biomolecule is neither protein nor RNA, or if no sequence is
            left after the conversion.
    """
    biomolecule = biomolecule.strip().upper()
    if biomolecule not in ['PROTEIN', 'RNA']:
        logger.error(
            '\n\t{} entered. Biomolecule must be either PROTEIN or RNA'.format(
                biomolecule))
        raise ValueError('Biomolecule must be either PROTEIN or RNA')
    NUM_SITE_STATES = 21 if biomolecule == 'PROTEIN' else 5
    RES_TO_INT = RES_TO_INT_ALL[biomolecule]
    alignment_int_form = []
    # Tuples of the sequences already stored; set membership keeps the
    # de-duplication linear instead of scanning the result list every time.
    seen = set()

    num_seqs_with_non_standard_res = 0
    num_non_standard_res = 0
    total_num_seqs_in_msa = 0
    for seq in alignment:
        seq_int = []
        non_standard_in_seq = 0
        for res in seq:
            state = RES_TO_INT.get(res.upper())
            if state is None:
                # Unknown symbol: encode it like a gap.
                non_standard_in_seq += 1
                state = NUM_SITE_STATES
            seq_int.append(state)
        if non_standard_in_seq:
            num_seqs_with_non_standard_res += 1
            num_non_standard_res += non_standard_in_seq
        total_num_seqs_in_msa += 1
        key = tuple(seq_int)
        if key not in seen:
            seen.add(key)
            alignment_int_form.append(seq_int)
    if num_seqs_with_non_standard_res > 0:
        logger.info('\n\tFound {} non-standard residues in {} sequences'
            ''.format(num_non_standard_res, num_seqs_with_non_standard_res)
        )
    logger.info('\n\tTotal number of sequences read from file: {}'.format(total_num_seqs_in_msa))
    if not alignment_int_form:
        logger.error('\n\tNo data found in alignment in integer representation')
        raise ValueError('No data found in alignment in integer representation')
    return alignment_int_form
164 | 
165 | 
def get_alignment_int_form(file_name, biomolecule='protein'):
    """Read an alignment from a FASTA file and return it in integer form.

    The sequences are read with get_alignment_from_fasta_file(file_name) and
    then converted with alignment_letter2int().

    Parameters
    ----------
        file_name : str
            Fasta file name containing the alignment data.
        biomolecule : str
            The type of biomolecule the sequence data represents. This can be
            either protein or RNA in lower or upper cases.

    Returns
    -------
        alignment_int_form : list
            A list of sequences, each sequence being a list of integers.
    """
    return alignment_letter2int(
        get_alignment_from_fasta_file(file_name),
        biomolecule,
    )
189 | 
190 | 
def get_alignment_char_form(file_name, biomolecule = 'PROTEIN'):
    """Read an alignment from a FASTA file and return it in char form.

    The file is first converted to integer representation with
    get_alignment_int_form(file_name, biomolecule); the integers are then
    mapped back to characters using the mapping from res_to_char().

    Parameters
    ----------
        file_name : str
            FASTA file name
        biomolecule : str
            Type of biomolecule (protein or RNA)

    Returns
    -------
        seqs_char_form : list
            A list of sequences whose residues are represented by characters.
    """
    biomolecule = biomolecule.strip().upper()
    seqs_int_form = get_alignment_int_form(file_name, biomolecule=biomolecule)

    logger.info('\n\tConverting sequences back to character representation')

    int_to_char = res_to_char(biomolecule)
    return [
        ''.join([int_to_char[res] for res in seq])
        for seq in seqs_int_form
    ]
225 | 
226 | 
def sequences_to_char_form(seqs_lst, biomolecule):
    """Convert sequences from integer to character representation.

    Unlike get_alignment_char_form(), this function works on sequences that
    are already in memory; it does not read from FASTA files.

    Parameters
    ----------
        seqs_lst : list
            List of sequences each in int representation
        biomolecule : str
            Type of biomolecule (protein or RNA)

    Returns
    -------
        seqs_char_lst : list
            List of sequences each in char representation
    """
    int_to_char = res_to_char(biomolecule.strip().upper())
    return [
        ''.join(int_to_char[res] for res in seq_int)
        for seq_int in seqs_lst
    ]
250 | 
251 | 
if __name__ == '__main__':
    # NOTE: the body below is a single string literal, so running this module
    # as a script currently does nothing. The string holds a disabled
    # command line driver (argument parsing, a pdb breakpoint and a printout of
    # the integer/character forms of the sequences) kept for reference.
    """
    from argparse import ArgumentParser
    import pdb
    pdb.set_trace()
    logging.basicConfig()
    parser = ArgumentParser(description='Parse args when loading fasta_reader as __main__')
    parser.add_argument('msa_file', help='alignment file in FASTA format')
    parser.add_argument('biomolecule', help='name of biomolecule', choices=['protein', 'PROTEIN', 'rna', 'RNA'])
    args = parser.parse_args()
    sequences_int_form = get_alignment_int_form(args.msa_file, args.biomolecule)
    sequences_char_form = sequences_to_char_form(sequences_int_form, args.biomolecule)

    for seq_int, seq_char in zip(sequences_int_form, sequences_char_form):
        print(seq_int, len(seq_int))
        print(seq_char)
    """
269 | 


--------------------------------------------------------------------------------
/pydca/main.py:
--------------------------------------------------------------------------------
  1 | from pydca.meanfield_dca import meanfield_dca
  2 | from pydca.sequence_backmapper.sequence_backmapper import SequenceBackmapper
  3 | from pydca.dca_utilities import dca_utilities
  4 | from pydca.contact_visualizer.contact_visualizer import DCAVisualizer, PDBContent
  5 | from pydca.msa_trimmer.msa_trimmer import MSATrimmer
  6 | from argparse import ArgumentParser
  7 | import logging
  8 | import sys
  9 | import os
 10 | 
 11 | """Top level module for the pydca package. It implements command line
 12 | interfaces, including logging configuration, command line arguments and help
 13 | messages.
 14 | 
 15 | Author: Mehari B. Zerihun
 16 | """
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | 
 20 | def configure_logging():
 21 |     """Configures logging. When configured, the logging level is INFO and
 22 |     messages are logged to stream handler. Log level name are colored whenever
 23 |     the terminal supports that. INFO level is Green, WARNING level is Yellow and
 24 |     ERROR level is Red.
 25 |     """
 26 |     from pydca.config_dca.config_log import LOGGING_CONFIG
 27 |     from pydca.config_dca.config_log import ConsoleColor as c_color
 28 |     import logging.config
 29 | 
 30 |     logging.config.dictConfig(LOGGING_CONFIG)
 31 |     logging.addLevelName(logging.INFO, '{}{}{}'.format(
 32 |         c_color.green, logging.getLevelName(logging.INFO), c_color.nocolor))
 33 |     logging.addLevelName(logging.WARNING, '{}{}{}'.format(
 34 |         c_color.yellow, logging.getLevelName(logging.WARNING), c_color.nocolor))
 35 |     logging.addLevelName(logging.ERROR, '{}{}{}'.format(
 36 |         c_color.red, logging.getLevelName(logging.ERROR), c_color.nocolor))
 37 |     return None
 38 | 
 39 | class CmdArgs:
 40 |     """Defines variables related to command line parsing.
 41 |     """
 42 |     subcommand_name = 'subcommand_name'
 43 |     #variables for command line use
 44 |     output_dir_optional = '--output_dir'
 45 |     output_dir_help = """Directory path to which output results are written.
 46 |     If the directory is not existing, it will be created provided that the user
 47 |     has a privilege to do so. If this path is not provided, an output directory
 48 |     is created using the base name of the MSA file, with a prefix and/or postfix
 49 |     added to it.
 50 |     """
 51 |     verbose_optional = '--verbose'
 52 |     verbose_help = 'Show logging information on the terminal.'
 53 |     apc_optional = '--apc'
 54 |     apc_help = """Compute the average product corrected (APC) DCA score.
 55 |     """
 56 |     force_seq_type_optional = '--force_seq_type'
 57 |     force_seq_type_help = """Typically the anticipated number of residue types
 58 |     plus a gap is 21 for protein, and 5 for RNA sequences. If there is a significant
 59 |     deviation from these values, it is interpreted as the user has inadvertently
 60 |     entered a biomolecule type mismatch and an error may be raised.
 61 |     This can happen when the alignment data contains too many/few non-standared
 62 |     residues or when a wrong biomolecule type is entered. If you are sure about
 63 |     the biomolecule type the MSA data represents, use --force_seq_type to bypass
 64 |     this error.
 65 |     """
 66 |     msa_file = 'msa_file'
 67 |     msa_file_help = 'Multiple sequence alignment (MSA) file in FASTA format'
 68 | 
 69 |     biomolecule = 'biomolecule'
 70 |     biomolecule_help = """Type of biomolecule.
 71 |     It should be either protein or RNA in lower or upper case letters.
 72 |     """
 73 |     refseq_file = 'refseq_file'
 74 |     refseq_file_optional = '--refseq_file'
 75 |     refseq_file_help = """FASTA formatted file containing a reference sequence.
 76 |     The reference sequence should not contain gaps or non-standard residues.
 77 |     """
 78 | 
 79 |     pseudocount_optional = '--pseudocount'
 80 |     pseudocount_help = """Relative value of the pseudocount so as to regularize
 81 |     emperical single-site and pair-site frequencies obtained from alignment data.
 82 |     Note that this is not the actual value of the pseudocount, instead, it is
 83 |     the ratio X/(X + Meff) where X is the actual pseudocount and Meff is the
 84 |     effective number of sequences.
 85 |     """
 86 | 
 87 |     seqid_optional = '--seqid'
 88 |     seqid_help = """Cut-off value of sequences similarity above which they
 89 |     are lumped together.
 90 |     """
 91 |     pdb_file = 'pdb_file'
 92 |     pdb_file_help = """Path to a PDB file.
 93 |     """
 94 |     dca_file = 'dca_file'
 95 |     dca_file_help = """File containing the result of DCA computation. The first
 96 |     and second columns should contain site pairs (i, j) such that i < j. Optionally,
 97 |     a third column can contain the DCA scores. The DCA scores are not mandatory
 98 |     as the site-pairs are assumed to be sorted, in descending order of DCA score.
 99 |     """
100 |     rna_secstruct_file = 'rna_secstruct_file'
101 |     rna_secstruct_file_optional = '--rna_secstruct_file'
102 |     rna_secstruct_file_help = """File containing an RNA secondary structure. The
103 |     structure should be in one line and in dot bracket notation, i.e., the allowed
104 |     symbols are '.', '(', and ')'. Comments should start with # sign.
105 |     """
106 |     wc_neighbor_dist_optional = '--wc_neighbor_dist'
107 |     wc_neighbor_dist_help = """For RNAs, if two residues (i, j) form
108 |     secondary structure pair and the wc_neighbor_dist is n,  all (2n + 1)(2n + 1)
109 |     (neighboring plus WC pairs) are excluded. That is all pairs (-n + i, -n + j),
110 |     (-n + 1 + i, -n + j), ..., (i, j), ..., (i + n, j + n). Using
111 |     wc_neighbor_dist = 2 excludes 25 pairs.
112 |     """
113 |     pdb_chain_id = 'pdb_chain_id'
114 |     pdb_chain_id_help = """ID of a PDB chain. This helps to identify the desired
115 |     chain since PDB files can contain multiple protein/RNA chains or a
116 |     mix of protein and RNA chains in one file.
117 |     """
118 |     pdb_id_optional = '--pdb_id'
119 |     pdb_id_help = """The PDB ID in the PDB database.
120 |     """
121 | 
122 |     linear_dist = 'linear_dist'
123 |     linear_dist_optional = '--linear_dist'
124 |     linear_dist_help = """The distance between two residues in sequence above
125 |     which they are considered to be tertiary contacts. For RNAs see also
126 |     wc_neighbor_dist parameter.
127 |     """
128 |     contact_dist = 'contact_dist'
129 |     contact_dist_optional = '--contact_dist'
130 |     contact_dist_help = """Cut-off distance between two residues in a PDB structure
131 |     below which they are considered to be in contact. This distance is between two
132 |     heavy atoms of the residues.
133 |     """
134 |     num_dca_contacts = 'num_dca_contacts'
135 |     num_dca_contacts_optional = '--num_dca_contacts'
136 |     num_dca_contacts_help = """Number of DCA contacts to be taken among all DCA
137 |     ranked contacts. The counting is done after appropriate filtering of the DCA
138 |     ranked contacts is done if a filtering criteria (e.g., a non-zero linear distance)
139 |     is set.
140 |     """
141 |     max_gap_optional = '--max_gap'
142 |     max_gap_help = """The maximum fraction of gaps in MSA column. When an MSA
143 |     data is trimmed, columns containing gap fraction more than this value are
144 |     removed.
145 |     """
146 |     remove_all_gaps_optional = '--remove_all_gaps'
147 |     remove_all_gaps_help = """Removes columns in the MSA correponding to
148 |     the matching sequence's gap positions.
149 |     """
150 | ## end of class CmdArgs
151 | 
# Sub-commands grouped by the kind of task they perform.
DCA_VISUALIZATION_SUBCOMMANDS = ['plot_contact_map', 'plot_tp_rate']
FILE_CONTENT_SUBCOMMANDS = [
    'pdb_content', 'refseq_content', 'dca_content', 'rna_secstruct_content',
]
MSA_TRIMMING_SUBCOMMANDS = ['trim_by_refseq', 'trim_by_gap_size']

# All sub-commands: visualization first, then file content, then MSA trimming.
# Concatenation builds a new list, so the group lists are left untouched.
ALL_SUBCOMMANDS = (
    DCA_VISUALIZATION_SUBCOMMANDS
    + FILE_CONTENT_SUBCOMMANDS
    + MSA_TRIMMING_SUBCOMMANDS
)
163 | 
164 | 
def get_dcavisualizer_instance(biomolecule, pdb_chain_id, pdb_file, refseq_file,
        dca_file = None, rna_secstruct_file=None, linear_dist=None,
        contact_dist=None, num_dca_contacts=None,
        wc_neighbor_dist=None, pdb_id=None):
    """Builds a DCAVisualizer from the given arguments and returns it.

    The first three arguments are passed to DCAVisualizer positionally, all
    others are forwarded unchanged as keyword arguments.

    Parameters
    ----------
        biomolecule : str
            The biomolecule type (protein or RNA)
        pdb_chain_id : str
            Chain ID of a PDB chain in PDB file.
        pdb_file : str
            Path to PDB file
        refseq_file : str
            Path to FASTA formatted reference sequence file.
        dca_file : str
            Path to text file containing DCA ranked pairs.
        rna_secstruct_file : str
            Path to text file containing an RNA secondary structure.
        linear_dist : int
            Distance between two residues in a sequence.
        contact_dist : float
            Distance between two residues below which they are considered to be
            contacts.
        num_dca_contacts : int
            Number of DCA contacts to be taken from the ranked pairs.
        wc_neighbor_dist : int
            Neighborhood radius around RNA secondary structure pairs.
        pdb_id : str
            The PDB ID as obtained from PDB database.

    Returns
    -------
        dcavisualizer_inst : DCAVisualizer
            An instance of DCAVisualizer class.
    """
    keyword_settings = dict(
        refseq_file=refseq_file,
        dca_file=dca_file,
        rna_secstruct_file=rna_secstruct_file,
        linear_dist=linear_dist,
        contact_dist=contact_dist,
        num_dca_contacts=num_dca_contacts,
        wc_neighbor_dist=wc_neighbor_dist,
        pdb_id=pdb_id,
    )
    return DCAVisualizer(biomolecule, pdb_chain_id, pdb_file, **keyword_settings)
204 | 
205 | 
def get_pdb_content_instance(pdb_file):
    """Builds a PDBContent object for the given PDB file and returns it.

    Parameters
    ----------
        pdb_file : str
            Path to a PDB file.

    Returns
    -------
        pdb_content_instance : PDBContent
            An instance of PDBContent.
    """
    return PDBContent(pdb_file)
221 | 
222 | 
def add_args_to_subparser(the_parser, subcommand_name):
    """Registers on a (sub) parser the command line arguments that belong to
    the given sub-command.

    Every sub-command receives the --verbose flag. The remaining arguments
    depend on whether the sub-command is a DCA visualization command, a file
    content command or an MSA trimming command. Positional arguments are added
    in the order in which they must appear on the command line.

    Parameters
    ----------
        the_parser : ArgumentParser
            The (sub) parser to which the arguments are added.
        subcommand_name : str
            Name of the sub-command the parser belongs to.

    Returns
    -------
        None
    """
    add_arg = the_parser.add_argument

    add_arg(CmdArgs.verbose_optional, help=CmdArgs.verbose_help,
        action='store_true')

    if subcommand_name in DCA_VISUALIZATION_SUBCOMMANDS:
        # positional arguments first, in command line order
        add_arg(CmdArgs.biomolecule, help=CmdArgs.biomolecule_help)
        add_arg(CmdArgs.pdb_chain_id, help=CmdArgs.pdb_chain_id_help)
        add_arg(CmdArgs.pdb_file, help=CmdArgs.pdb_file_help)
        add_arg(CmdArgs.refseq_file, help=CmdArgs.refseq_file_help)
        add_arg(CmdArgs.dca_file, help=CmdArgs.dca_file_help)
        # optional arguments
        add_arg(CmdArgs.rna_secstruct_file_optional,
            help=CmdArgs.rna_secstruct_file_help)
        add_arg(CmdArgs.linear_dist_optional,
            help=CmdArgs.linear_dist_help, type=int)
        add_arg(CmdArgs.contact_dist_optional,
            help=CmdArgs.contact_dist_help, type=float)
        add_arg(CmdArgs.num_dca_contacts_optional,
            help=CmdArgs.num_dca_contacts_help, type=int)
        add_arg(CmdArgs.wc_neighbor_dist_optional, type=int,
            help=CmdArgs.wc_neighbor_dist_help)
        add_arg(CmdArgs.pdb_id_optional, help=CmdArgs.pdb_id_help)

    # among the file content sub-commands only 'pdb_content' takes arguments
    if subcommand_name in FILE_CONTENT_SUBCOMMANDS and subcommand_name == 'pdb_content':
        add_arg(CmdArgs.pdb_file, help=CmdArgs.pdb_file_help)

    if subcommand_name in MSA_TRIMMING_SUBCOMMANDS:
        add_arg(CmdArgs.max_gap_optional, type=float, help=CmdArgs.max_gap_help)
        if subcommand_name == 'trim_by_refseq':
            add_arg(CmdArgs.biomolecule, help=CmdArgs.biomolecule_help)
            add_arg(CmdArgs.msa_file, help=CmdArgs.msa_file_help)
            add_arg(CmdArgs.refseq_file, help=CmdArgs.refseq_file_help)
            add_arg(CmdArgs.remove_all_gaps_optional,
                help=CmdArgs.remove_all_gaps_help, action='store_true')
        if subcommand_name == 'trim_by_gap_size':
            add_arg(CmdArgs.msa_file, help=CmdArgs.msa_file_help)
    return None
281 | 
282 | 
def execute_from_command_line(msa_file=None, biomolecule=None, the_command=None, refseq_file=None,
        force_seq_type=False, verbose=False, output_dir=None,
        pdb_file=None, pdb_chain_id=None, dca_file=None, rna_secstruct_file=None,
        linear_dist=None, contact_dist=None, num_dca_contacts=None,
        wc_neighbor_dist=None, pdb_id=None, max_gap=None, remove_all_gaps=False):
    """Do computations according to the parameters passed on the command line

    Depending on the sub-command this plots a contact map or a true positive
    rate and writes the corresponding data file, displays the content of a PDB
    file, or trims an MSA and writes the trimmed alignment.

    Parameters
    ----------
        msa_file : str
            Path to MSA file.
        biomolecule : str
            Name of biomolecule (protein or RNA, case insensitive).
        the_command : str
            Name of the command passed from the command line. Surrounding
            whitespace is ignored.
        refseq_file : str
            Path to reference sequence file.
        force_seq_type : bool
            Force computation although there might be a biomolecule type mismatch.
            This value is accepted but not used by this function.
        verbose : bool
            Display logging message to the screen.
        output_dir : str
            Output directory to write output files. When not given, a directory
            name is derived from the base name of the PDB file (visualization
            commands) or of the MSA file (trimming commands).
        pdb_file : str
            PDB file path.
        pdb_chain_id : str
            The name of a PDB chain in PDB file.
        dca_file : str
            DCA file path.
        rna_secstruct_file : str
            RNA secondary structure file path.
        linear_dist : int
            Minimum separation between two residues in sequence.
        contact_dist : float
            Maximum distance between two residues in PDB to be considered as contacts.
        num_dca_contacts : int
            Number of DCA contacts to be taken from ranked pairs.
        wc_neighbor_dist : int
            Maximum radius for a residue to be considered as a neighbor of WC pairs.
        pdb_id : str
            The ID of a PDB as it is in PDB repository.
        max_gap : float
            Maximum fraction of gaps in an MSA column.
        remove_all_gaps : bool
            Remove all gaps in MSA corresponding to the best matching sequence to
            a reference.

    Returns
    -------
        None : None

    Raises
    ------
        ValueError
            If the_command is not one of the known sub-commands.
    """

    if verbose: configure_logging()

    # Normalize the command once so that the membership test and all later
    # comparisons agree; non-string values (e.g. None) are rejected below.
    command = the_command.strip() if isinstance(the_command, str) else the_command

    if command not in ALL_SUBCOMMANDS:
        msg = '\n\t{} is unknown command.'.format(the_command)
        logger.error(msg)
        raise ValueError(msg.strip())

    ### DCA visualization commands
    if command in DCA_VISUALIZATION_SUBCOMMANDS:
        # both visualization commands work on a DCAVisualizer and its metadata
        dcavisualizer = get_dcavisualizer_instance(
            biomolecule, pdb_chain_id, pdb_file, refseq_file=refseq_file,
            dca_file=dca_file, rna_secstruct_file=rna_secstruct_file,
            linear_dist=linear_dist, contact_dist=contact_dist,
            num_dca_contacts=num_dca_contacts,
            wc_neighbor_dist=wc_neighbor_dist,
            pdb_id=pdb_id,
        )
        dcavisualizer_metadata = dca_utilities.get_dcavisualizer_metadata(
            dcavisualizer
        )

        if command == 'plot_contact_map':
            contact_categories_dict = dcavisualizer.plot_contact_map()
            if not output_dir:
                pdb_file_basename, ext = os.path.splitext(os.path.basename(pdb_file))
                output_dir = 'contact_map_' + pdb_file_basename
            output_file_path = dca_utilities.get_dca_output_file_path(
                output_dir, pdb_file, prefix='contact_map', postfix='.txt'
            )
            dca_utilities.create_directories(output_dir)
            dca_utilities.write_contact_map(output_file_path,
                contact_categories_dict, metadata=dcavisualizer_metadata
            )

        if command == 'plot_tp_rate':
            true_positive_rates_dict = dcavisualizer.plot_true_positive_rates()
            if not output_dir:
                pdb_file_basename, ext = os.path.splitext(os.path.basename(pdb_file))
                output_dir = 'TPR_' + pdb_file_basename
            output_file_path = dca_utilities.get_dca_output_file_path(
                output_dir, pdb_file, prefix='TPR_', postfix='.txt'
            )
            tpr_metadata = ['\n# First column is DCA true positive rate per rank'
                '\n# Second column is the PDB true positive rate per rank'
            ]
            # only the first six metadata entries are kept for the TPR file
            metadata = dcavisualizer_metadata[:6] + tpr_metadata
            dca_utilities.create_directories(output_dir)
            dca_utilities.write_tp_rate(output_file_path,
                true_positive_rates_dict=true_positive_rates_dict,
                metadata=metadata,
            )

    ### file content commands; only 'pdb_content' performs an action here
    if command in FILE_CONTENT_SUBCOMMANDS:
        if command == 'pdb_content':
            pdb_content = get_pdb_content_instance(pdb_file)
            pdb_content.show_struct_info()

    ### MSA trimming commands
    if command in MSA_TRIMMING_SUBCOMMANDS:
        if command == 'trim_by_refseq':
            msa_trimmer = MSATrimmer(msa_file,
                biomolecule=biomolecule,
                refseq_file=refseq_file,
                max_gap=max_gap,
            )
            columns_to_remove = msa_trimmer.trim_by_refseq(
                remove_all_gaps=remove_all_gaps,
            )
        if command == 'trim_by_gap_size':
            msa_trimmer = MSATrimmer(msa_file, max_gap=max_gap)
            columns_to_remove = msa_trimmer.trim_by_gap_size()
        if not output_dir:
            msa_file_basename, ext = os.path.splitext(os.path.basename(msa_file))
            output_dir = 'Trimmed_' + msa_file_basename
        # Create the directory for a caller-supplied output_dir too, as the
        # visualization branches above do; previously it was only created when
        # the directory name was derived from the MSA file name.
        dca_utilities.create_directories(output_dir)
        output_file_path = dca_utilities.get_dca_output_file_path(
            output_dir, msa_file, prefix='Trimmed_', postfix='.fa'
        )
        dca_utilities.write_trimmed_msa(output_file_path, msa_trimmer=msa_trimmer,
            columns_to_remove=columns_to_remove,
        )
    return None
422 | 
423 | 
def run_pydca():
    """Entry point for the pydca command line interface.

    Builds the argument parser with one sub-parser per sub-command, parses the
    command line (showing the help message when no argument is given) and
    forwards the parsed values to execute_from_command_line().

    Parameters
    ----------
        All arguments to be used are captured from the command line.

    Returns
    -------
        None.
    """
    parser = ArgumentParser()
    subparsers = parser.add_subparsers(dest = CmdArgs.subcommand_name)

    # (sub-command name, help text) in the order the sub-parsers are created
    subcommand_help = (
        ('plot_contact_map',
            'Provides a quick contact map comparison of DCA computation results.'
            ' For a given PDB chain ID, the PDB contacts are extracted from PDB'
            ' file. Then top N ranked DCA pairs are used for the contact map'
            ' comparison. Use --help for more information.'),
        ('plot_tp_rate',
            ' Plots the true positive rate per rank of a DCA computation'
            ' result. The DCA file should contain ranked pairs (i, j) such that'
            ' i < j. If the biomolecule is RNA, a secondary structure file should'
            ' be provided if one is interested on tertiary contacts. Use --help'
            ' for more information.'),
        ('pdb_content',
            'Displays information about the contents of a PDB file.'
            ' Use --verbose optional argument to display the PDB summary'
            ' on the terminal.'),
        ('trim_by_refseq',
            'Removes MSA columns containing fraction of gaps more than'
            ' the value specified by {} (default 0.5) if these columns'
            ' do not correspond to residues of the sequence in MSA that matches'
            ' with the reference. Setting max_gap to zero removes all columns'
            ' except those corresponding to the residues of the matching sequence'
            ' to the reference.'.format(CmdArgs.max_gap_optional)),
        ('trim_by_gap_size',
            'Removes MSA columns containing gap fraction more than the value'
            ' specified by {} (default 0.5)'.format(CmdArgs.max_gap_optional)),
    )
    for name, help_text in subcommand_help:
        subparser = subparsers.add_parser(name, help = help_text)
        add_args_to_subparser(subparser, name)

    # display help if no argument is passed
    cli_args = None if sys.argv[1:] else ['--help']
    args_dict = vars(parser.parse_args(args=cli_args))

    # Values are looked up by name; arguments that the chosen sub-command does
    # not define are forwarded as None.
    forwarded_names = (
        'biomolecule', 'msa_file', 'refseq_file', 'force_seq_type', 'verbose',
        'output_dir', 'pdb_file', 'pdb_chain_id', 'dca_file',
        'rna_secstruct_file', 'linear_dist', 'contact_dist',
        'num_dca_contacts', 'wc_neighbor_dist', 'pdb_id', 'max_gap',
        'remove_all_gaps',
    )
    forwarded = {name: args_dict.get(name) for name in forwarded_names}
    execute_from_command_line(
        the_command = args_dict.get('subcommand_name'),
        **forwarded
    )
    logger.info('\n\tDONE')
    return None
506 | 
507 | 
if __name__ == '__main__':
    # Run as a script: hand control to the command line entry point.
    run_pydca()
512 | 


--------------------------------------------------------------------------------
/pydca/meanfield_dca/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements mean-field algorithm of direct coupling analysis (DCA) for protein
2 | and RNA families.
3 | """
4 | 


--------------------------------------------------------------------------------
/pydca/meanfield_dca/msa_numerics.py:
--------------------------------------------------------------------------------
  1 | 
  2 | import numpy as np
  3 | from numba import jit
  4 | from numba import prange as parallel_range
  5 | 
  6 | 
  7 | """This module implements computationally costly routines while performing
  8 | Direct Coupling Analysis.
  9 | 
 10 | Author : Mehari B. Zerihun
 11 | """
 12 | 
 13 | @jit(nopython=True, parallel=True)
 14 | def compute_sequences_weight(alignment_data=None, seqid=None):
 15 |     """Computes weight of sequences. The weights are calculated by lumping
 16 |     together sequences whose identity is greater that a particular threshold.
 17 |     For example, if there are m similar sequences, each of them will be assigned
 18 |     a weight of 1/m. Note that the effective number of sequences is the sum of
 19 |     these weights.
 20 | 
 21 |     Parameters
 22 |     ----------
 23 |         alignmnet_data : np.array()
 24 |             Numpy 2d array of the alignment data, after the alignment is put in
 25 |             integer representation
 26 |         seqid : float
 27 |             Value at which beyond this sequences are considered similar. Typical
 28 |             values could be 0.7, 0.8, 0.9 and so on
 29 | 
 30 |     Returns
 31 |     -------
 32 |         seqs_weight : np.array()
 33 |             A 1d numpy array containing computed weights. This array has a size
 34 |             of the number of sequences in the alignment data.
 35 |     """
 36 |     alignment_shape = alignment_data.shape
 37 |     num_seqs = alignment_shape[0]
 38 |     seqs_len = alignment_shape[1]
 39 |     seqs_weight = np.zeros((num_seqs,), dtype=np.float64)
 40 |     #count similar sequences
 41 |     for i in parallel_range(num_seqs):
 42 |         seq_i = alignment_data[i]
 43 |         for j in range(num_seqs):
 44 |             seq_j = alignment_data[j]
 45 |             iid = np.sum(seq_i==seq_j)
 46 |             if np.float64(iid)/np.float64(seqs_len) > seqid:
 47 |                 seqs_weight[i] += 1
 48 |     #compute the weight of each sequence in the alignment
 49 |     for i in range(num_seqs): seqs_weight[i] = 1.0/float(seqs_weight[i])
 50 |     return seqs_weight
 51 | 
 52 | 
 53 | @jit(nopython=True)
 54 | def compute_single_site_freqs(alignment_data=None,
 55 |         num_site_states=None, seqs_weight=None):
 56 |     """Computes single site frequency counts for a particular aligmnet data.
 57 | 
 58 |     Parameters
 59 |     ----------
 60 |         alignment_data : np.array()
 61 |             A 2d numpy array of alignment data represented in integer form.
 62 | 
 63 |         num_site_states : int
 64 |             An integer value fo the number of states a sequence site can have
 65 |             including a gap state. Typical value is 5 for RNAs and 21 for
 66 |             proteins.
 67 | 
 68 |         seqs_weight : np.array()
 69 |             A 1d numpy array of sequences weight
 70 | 
 71 |     Returns
 72 |     -------
 73 |         single_site_freqs : np.array()
 74 |             A 2d numpy array of of data type float64. The shape of this array is
 75 |             (seqs_len, num_site_states) where seqs_len is the length of sequences
 76 |             in the alignment data.
 77 |     """
 78 |     alignment_shape = alignment_data.shape
 79 |     #num_seqs = alignment_shape[0]
 80 |     seqs_len = alignment_shape[1]
 81 |     m_eff = np.sum(seqs_weight)
 82 |     single_site_freqs = np.zeros(shape = (seqs_len, num_site_states),
 83 |         dtype = np.float64)
 84 |     for i in range(seqs_len):
 85 |         for a in range(1, num_site_states + 1):#we need gap states single site freqs too
 86 |             column_i = alignment_data[:,i]
 87 |             freq_ia = np.sum((column_i==a)*seqs_weight)
 88 |             single_site_freqs[i, a-1] = freq_ia/m_eff
 89 |     return single_site_freqs
 90 | 
 91 | 
 92 | @jit(nopython=True)
 93 | def get_reg_single_site_freqs(single_site_freqs = None, seqs_len = None,
 94 |         num_site_states = None, pseudocount = None):
 95 |     """Regularizes single site frequencies.
 96 | 
 97 |     Parameters
 98 |     ----------
 99 |         single_site_freqs : np.array()
100 |             A 2d numpy array of single site frequencies of shape
101 |             (seqs_len, num_site_states). Note that gap state frequencies are
102 |             included in this data.
103 |         seqs_len : int
104 |             The length of sequences in the alignment data
105 |         num_site_states : int
106 |             Total number of states that a site in a sequence can accommodate. It
107 |             includes gap states.
108 |         pseudocount : float
109 |             This is the value of the relative pseudo count of type float.
110 |             theta = lambda/(meff + lambda), where meff is the effective number of
111 |             sequences and lambda is the real pseudo count.
112 | 
113 |     Returns
114 |     -------
115 |         reg_single_site_freqs : np.array()
116 |             A 2d numpy array of shape (seqs_len, num_site_states) of single site
117 |             frequencies after they are regularized.
118 |     """
119 |     reg_single_site_freqs = single_site_freqs
120 |     theta_by_q = np.float64(pseudocount)/np.float64(num_site_states)
121 |     for i in range(seqs_len):
122 |         for a in range(num_site_states):
123 |             reg_single_site_freqs[i, a] = theta_by_q + \
124 |                 (1.0 - pseudocount)*reg_single_site_freqs[i, a]
125 |     return reg_single_site_freqs
126 | 
127 | 
128 | # This function is replaced by the parallelized version below
129 | @jit(nopython=True)
def compute_pair_site_freqs_serial(alignment_data=None, num_site_states=None,
        seqs_weight=None):
    """Computes weighted pair site frequencies of an alignment, one site pair
    at a time.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array containing alignment data, one sequence per row.
            The residues in the alignment are in integer representation.
        num_site_states : int
            The number of possible states, including the gap state, that
            sequence sites can accommodate. It must be an integer.
        seqs_weight : np.array()
            A 1d numpy array of sequences weight.

    Returns
    -------
        pair_site_freqs : np.array()
            A 3d numpy array of shape
            (num_pairs, num_site_states - 1, num_site_states - 1) where
            num_pairs is the number of unique pairs we can form from sequence
            sites. Only states 1, ..., num_site_states - 1 are counted; the
            state with the value num_site_states is left out. The pairs are
            in the order (0, 1), (0, 2), ..., (0, L-1), (1, 2), ..., (L-2, L-1).
            This ordering is critical and any change must be documented.
    """
    seqs_len = alignment_data.shape[1]
    num_counted_states = num_site_states - 1
    num_site_pairs = seqs_len * (seqs_len - 1) // 2
    m_eff = np.sum(seqs_weight)
    pair_site_freqs = np.zeros(
        shape=(num_site_pairs, num_counted_states, num_counted_states),
        dtype = np.float64)
    pair_index = 0
    for first_site in range(seqs_len - 1):
        first_column = alignment_data[:, first_site]
        for second_site in range(first_site + 1, seqs_len):
            second_column = alignment_data[:, second_site]
            for a in range(1, num_site_states):
                first_has_a = first_column == a
                for b in range(1, num_site_states):
                    second_has_b = second_column == b
                    # sequences carrying state a at the first site and state b
                    # at the second site
                    both = first_has_a * second_has_b
                    weighted_count = np.sum(both*seqs_weight)
                    pair_site_freqs[pair_index, a-1, b-1] = weighted_count/m_eff
            # done with this site pair, advance to the next one
            pair_index += 1
    return pair_site_freqs
180 | 
181 | 
@jit(nopython=True, parallel=True)
def compute_pair_site_freqs(alignment_data=None, num_site_states=None, seqs_weight=None):
    """Computes weighted pair site frequencies of an alignment, with the loop
    over the first site of each pair run through parallel_range.

    Parameters
    ----------
        alignment_data : np.array()
            A 2d numpy array containing alignment data, one sequence per row.
            The residues in the alignment are in integer representation.
        num_site_states : int
            The number of possible states, including the gap state, that
            sequence sites can accommodate. It must be an integer.
        seqs_weight : np.array()
            A 1d numpy array of sequences weight.

    Returns
    -------
        pair_site_freqs : np.array()
            A 3d numpy array of shape
            (num_pairs, num_site_states - 1, num_site_states - 1) where
            num_pairs is the number of unique pairs we can form from sequence
            sites. Only states 1, ..., num_site_states - 1 are counted; the
            state with the value num_site_states is left out. The pairs are
            in the order (0, 1), (0, 2), ..., (0, L-1), (1, 2), ..., (L-2, L-1).
            This ordering is critical and any change must be documented.
    """
    seqs_len = alignment_data.shape[1]
    num_counted_states = num_site_states - 1
    num_site_pairs = np.int64((seqs_len - 1)*seqs_len/2)
    m_eff = np.sum(seqs_weight)
    pair_site_freqs = np.zeros(
        shape=(num_site_pairs, num_counted_states, num_counted_states),
        dtype = np.float64
    )
    for i in parallel_range(seqs_len - 1):
        column_i = alignment_data[:, i]
        for j in range(i+1, seqs_len):
            # Position of pair (i, j) in the (0, 1), (0, 2), ... ordering. It is
            # derived from i and j alone instead of a running counter, and the
            # arithmetic is kept in the original float-then-int() form.
            pairs_from_i_on = (seqs_len - i) * ((seqs_len - i) - 1)/2
            pair_site = int((seqs_len * (seqs_len - 1)/2) - pairs_from_i_on + j - i - 1)
            column_j = alignment_data[:, j]
            for a in range(1, num_site_states):
                i_has_a = column_i==a
                for b in range(1, num_site_states):
                    j_has_b = column_j==b
                    both = i_has_a * j_has_b
                    weighted_count = np.sum(both*seqs_weight)
                    pair_site_freqs[pair_site, a-1, b-1] += weighted_count/m_eff
    return pair_site_freqs
230 | 
231 | @jit(nopython=True)
def get_reg_pair_site_freqs(pair_site_freqs = None, seqs_len = None,
        num_site_states = None, pseudocount = None):
    """Regularizes pair site frequencies with a relative pseudocount.

    Every entry f_ij(a, b) is mapped to
        pseudocount/num_site_states**2 + (1 - pseudocount) * f_ij(a, b).

    Parameters
    ----------
        pair_site_freqs : np.array()
            A 3d numpy array of shape (num_unique_site_pairs,
            num_site_states - 1, num_site_states - 1) containing raw pair site
            frequencies, where num_unique_site_pairs is the total number of
            unique site pairs excluding self pairing. The pairs are expected
            in (0, 1), (0, 2), ..., (0, seqs_len-1), (1, 2), ... order. This
            data does not contain pairings with gap states. NOTE: this array
            is updated in place; no copy is made.
        seqs_len : int
            The length of sequences in the alignment.
        num_site_states : int
            The total number of states that a site in the sequences can
            accommodate. This includes gap states.
        pseudocount : float
            The relative pseudo count.

    Returns
    -------
        reg_pair_site_freqs : np.array()
            The same array object that was passed in as pair_site_freqs, now
            holding the regularized values.
    """
    num_counted_states = num_site_states - 1
    # the first seqs_len*(seqs_len - 1)/2 entries are the unique site pairs
    num_site_pairs = seqs_len * (seqs_len - 1) // 2
    theta_by_qsqrd = pseudocount/float(num_site_states * num_site_states)
    data_weight = 1.0 - pseudocount
    for pair in range(num_site_pairs):
        for a in range(num_counted_states):
            for b in range(num_counted_states):
                pair_site_freqs[pair, a, b] = theta_by_qsqrd + \
                    data_weight*pair_site_freqs[pair, a, b]
    return pair_site_freqs
268 | 
269 | 
270 | @jit(nopython=True)
def construct_corr_mat(reg_fi = None, reg_fij = None, seqs_len = None,
        num_site_states = None):
    """Builds the correlation matrix from regularized frequencies.

    Parameters
    ----------
        reg_fi : np.array()
            A 2d numpy array of shape (seqs_len, num_site_states) of regularized
            single site frequencies. Only reg_fi[:, 0:num_site_states-1] is
            used; the last column (gap state frequencies, according to the
            callers' convention) is ignored.
        reg_fij : np.array()
            A 3d numpy array of shape (num_unique_pairs, num_site_states - 1,
            num_site_states - 1), where num_unique_pairs is the total number of
            unique site pairs excluding self-pairings, ordered as
            (0, 1), (0, 2), ..., (0, L-1), (1, 2), ...
        seqs_len : int
            The length of sequences in the alignment
        num_site_states : int
            Total number of states a site in a sequence can accommodate.

    Returns
    -------
        corr_mat : np.array()
            A symmetric 2d numpy array of shape (N, N)
            where N = seqs_len * (num_site_states - 1)
    """
    block_size = num_site_states - 1
    corr_mat_len = seqs_len * block_size
    corr_mat = np.zeros((corr_mat_len, corr_mat_len), dtype=np.float64)
    pair_index = 0
    for i in range(seqs_len):
        offset_i = i * block_size
        # Block on the diagonal: correlations between states of the same site.
        for a in range(block_size):
            fia = reg_fi[i, a]
            for b in range(block_size):
                if a == b:
                    same_site_corr = fia*(1.0 - fia)
                else:
                    same_site_corr = -1.0*fia*reg_fi[i, b]
                corr_mat[offset_i + a, offset_i + b] = same_site_corr
                corr_mat[offset_i + b, offset_i + a] = same_site_corr
        # Blocks off the diagonal: f_ij(a, b) - f_i(a) f_j(b), mirrored so the
        # matrix is symmetric.
        for j in range(i + 1, seqs_len):
            offset_j = j * block_size
            for a in range(block_size):
                for b in range(block_size):
                    pair_corr = reg_fij[pair_index, a, b] - reg_fi[i, a] * reg_fi[j, b]
                    corr_mat[offset_i + a, offset_j + b] = pair_corr
                    corr_mat[offset_j + b, offset_i + a] = pair_corr
            pair_index += 1

    return corr_mat
319 | 
320 | 
321 | @jit(nopython=True)
def compute_couplings(corr_mat = None):
    """Computes the couplings as the negative inverse of the correlation
    matrix.

    Parameters
    ----------
        corr_mat : np.array()
            A numpy array of shape (N, N) where N = seqs_len *(num_site_states -1)
            where seqs_len  is the length of sequences in the alignment data and
            num_site_states is the total number of states a site in a sequence
            can accommodate, including gapped states.

    Returns
    -------
        couplings : np.array()
            A 2d numpy array of the same shape as the correlation matrix,
            equal to -1.0 times its inverse.
    """
    inverse_corr_mat = np.linalg.inv(corr_mat)
    return -1.0*inverse_corr_mat
343 | 
344 | 
345 | @jit(nopython=True)
def slice_couplings(couplings = None, site_pair = None, num_site_states=None):
    """Extracts the couplings of one site pair (i, j) and pads them with a
    zero row and a zero column for the last state.

    Parameters
    ----------
        couplings : np.array
            A 2d numpy array of couplings. It has a shape of (L(q-1), L(q-1))
            where L and q are the length of sequences in alignment data and total
            number of standard residues plus gap.
        site_pair : tuple
            A tuple of site pairs. Example (0, 1), (0, L-1), ..., (L-2, L-1).
        num_site_states : int
            The value of q.

    Returns
    -------
        couplings_ij : np.array
            A 2d numpy array of shape (q, q). The upper-left (q-1, q-1) part
            holds the couplings of the pair; the last row and the last
            column are zero.
    """
    block_size = num_site_states - 1
    first_site, second_site = site_pair
    row_start = first_site * block_size
    col_start = second_site * block_size
    padded = np.zeros((num_site_states, num_site_states), dtype = np.float64)
    padded[:block_size, :block_size] = couplings[
        row_start:row_start + block_size, col_start:col_start + block_size]
    return padded
375 | 
376 | 
@jit(nopython=True)
def compute_two_site_model_fields(couplings = None, reg_fi = None,
        seqs_len = None, num_site_states = None):
    """Computes the fields of the two-site model for every site pair by
    fixed-point iteration.

    For each pair the fields of both sites start from the uniform value
    1/num_site_states and are updated until the largest absolute change of
    any field between two consecutive iterations is at most 1.0e-4.

    Parameters
    ----------
        couplings : np.array
            A numpy array of couplings of shape (N, N) where
            N = seqs_len * (num_site_states - 1)

        reg_fi : np.array
            A numpy array of regularized single site frequencies of shape
            (seqs_len, num_site_states)

        seqs_len : int
            Length of sequences in alignment data

        num_site_states : int
            Total number of states a site in a sequence can accommodate,
            including gap state.

    Returns
    -------
        two_site_model_fields : np.array
            A numpy array of shape (P, 2, num_site_states), where P is the number
            of unique site pairs excluding self pairings,
            P = seqs_len * (seqs_len - 1)/2. Entry [p][0] holds the fields of
            the first site of pair p and entry [p][1] those of the second.
    """

    q = num_site_states
    num_unique_pairs = seqs_len * (seqs_len - 1) // 2
    two_site_model_fields = np.zeros((num_unique_pairs, 2, q), dtype=np.float64)
    TOLERANCE = 1.0e-4
    uniform_field = 1.0/np.float64(q)
    pair_index = 0
    for i in range(seqs_len - 1):
        freq_i = np.reshape(reg_fi[i], (q, 1))
        for j in range(i + 1, seqs_len):
            freq_j = np.reshape(reg_fi[j], (q, 1))
            exp_couplings_ij = np.exp(slice_couplings(couplings = couplings,
                site_pair = (i, j), num_site_states = q))
            fields_i_old = np.full((q, 1), uniform_field)
            fields_j_old = np.full((q, 1), uniform_field)
            # start above the tolerance so the loop body runs at least once
            max_fields_change = 10.0
            while max_fields_change > TOLERANCE:
                x_i = np.dot(exp_couplings_ij , fields_j_old)
                x_j = np.dot(np.transpose(exp_couplings_ij), fields_i_old)

                # new fields, each normalized to sum to one
                fields_i_new =  freq_i / x_i
                fields_i_new /= np.sum(fields_i_new)
                fields_j_new = freq_j / x_j
                fields_j_new /= np.sum(fields_j_new)

                change_i = np.max(np.absolute(fields_i_new - fields_i_old))
                change_j = np.max(np.absolute(fields_j_new - fields_j_old))
                max_fields_change = np.max(np.array([change_i, change_j]))

                fields_i_old = fields_i_new
                fields_j_old = fields_j_new
            # store the fields reached when the iteration stopped
            two_site_model_fields[pair_index][0] = fields_i_new.T
            two_site_model_fields[pair_index][1] = fields_j_new.T
            pair_index += 1
    return two_site_model_fields
443 | 
444 | 
@jit(nopython=True)
def compute_direct_info(couplings = None, fields_ij = None, reg_fi = None,
        seqs_len = None, num_site_states = None):
    """Computes the direct information (DI) of every site pair from the
    direct probabilities of the two-site model.

    Parameters
    ----------
        couplings : np.array
            A 2d numpy array of shape (L(q-1), L(q-1)), where L and q are the
            length of sequences in MSA and number of site-states respectively.
            Note that the couplings are the negative of the inverse of the
            correlation matrix.

        fields_ij : np.array
            A 3d numpy array of two-site model fields. The shape of this array
            is (P, 2, q). Where P is the number of unique site pairs and q is the
            total number of site states. The ordering of site-pairs is very
            important. For example index P=0 refers to site pairs (0, 1), and
            as p increase the pairs are (0, 2), ... ,(0, L-1), (1, 2), ...,
            (1, L-1), ..., (L-2, L-1). The first index of the second dimension
            refers to the first site in site pair. Example, fields_ij[0][0]
            contains the fields of site 0 when it is paired with site 1, and
            fields_ij[0][1] contains those of site 1 in the same pair, and so on.

        reg_fi : np.array
            A 2d numpy array of regularized single site frequencies. It has
            a shape of (L, q) where L and q are the length of the sequences
            in alignment data and number of total site states respectively.
            Example, reg_fi[0] contains the frequencies of the first column in
            MSA.

        seqs_len : int
            The length of sequences in MSA.

        num_site_states : int
            The total number of residues plus gap.

    Returns
    -------
        unsorted_DI : np.array
            A 1d numpy array of shape (P, ) containing the values of
            direct information (DI). P is the total number of unique site pairs.
            Example, index P = 0 contains DI of pair (0, 1), index P = 1 that
            of (0, 2) and so on. The last pair is (L-2, L-1). The direct
            probability Pdir of shape (q, q) is computed and normalized over
            all q states, with the couplings of the last state set to 0, but
            only Pdir[:q-1, :q-1] enters the sum that gives the direct
            information.
    """
    q = num_site_states
    num_unique_pairs = np.int64(seqs_len * (seqs_len - 1)/2)
    unsorted_DI = np.zeros(num_unique_pairs, dtype=np.float64)
    EPSILON = 1.0e-20
    pair_index = 0
    for i in range(seqs_len - 1):
        for j in range(i + 1, seqs_len):
            # outer product of the two-site model fields of sites i and j
            fields_i = fields_ij[pair_index][0]
            fields_j = fields_ij[pair_index][1]
            hij = np.outer(fields_i, fields_j)

            exp_couplings_ij = np.exp(slice_couplings(couplings = couplings,
                site_pair = (i, j), num_site_states = q),
            )
            # direct probability of the pair, normalized over all q x q states
            pdir_ij = exp_couplings_ij * hij
            pdir_ij /= np.sum(pdir_ij)
            # product of the single site frequencies of the two sites
            fij = np.outer(reg_fi[i], reg_fi[j])
            # Keep only the first q-1 states of each site. Adding EPSILON
            # creates new arrays and keeps the ratio and the logarithm finite.
            fij_residues = fij[:q-1, :q-1] + EPSILON
            pdir_ij_residues = pdir_ij[:q-1, :q-1] + EPSILON
            log_ratio = np.log(pdir_ij_residues/fij_residues)
            unsorted_DI[pair_index] = np.sum(pdir_ij_residues * log_ratio)
            # advance to the next site pair
            pair_index += 1

    return unsorted_DI
534 | 


--------------------------------------------------------------------------------
/pydca/mfdca_main.py:
--------------------------------------------------------------------------------
  1 | from pydca.meanfield_dca import meanfield_dca
  2 | from pydca.sequence_backmapper.sequence_backmapper import SequenceBackmapper
  3 | from pydca.dca_utilities import dca_utilities
  4 | from argparse import ArgumentParser
  5 | import logging
  6 | import sys
  7 | import os
  8 | 
  9 | """Top level module for the pydca package. It implements command line
interfaces, including logging configuration, command line arguments and help
 11 | messages.
 12 | 
 13 | Author: Mehari B. Zerihun
 14 | """
 15 | 
# Module-level logger, named after this module through __name__.
logger = logging.getLogger(__name__)
 17 | 
 18 | def configure_logging():
 19 |     """Configures logging. When configured, the logging level is INFO and
 20 |     messages are logged to stream handler. Log level name are colored whenever
 21 |     the terminal supports that. INFO level is Green, WARNING level is Yellow and
 22 |     ERROR level is Red.
 23 |     """
 24 |     from pydca.config_dca.config_log import LOGGING_CONFIG
 25 |     from pydca.config_dca.config_log import ConsoleColor as c_color
 26 |     import logging.config
 27 | 
 28 |     logging.config.dictConfig(LOGGING_CONFIG)
 29 |     logging.addLevelName(logging.INFO, '{}{}{}'.format(
 30 |         c_color.green, logging.getLevelName(logging.INFO), c_color.nocolor))
 31 |     logging.addLevelName(logging.WARNING, '{}{}{}'.format(
 32 |         c_color.yellow, logging.getLevelName(logging.WARNING), c_color.nocolor))
 33 |     logging.addLevelName(logging.ERROR, '{}{}{}'.format(
 34 |         c_color.red, logging.getLevelName(logging.ERROR), c_color.nocolor))
 35 |     return None
 36 | 
 37 | class CmdArgs:
 38 |     """Defines variables related to command line parsing.
 39 |     """
 40 |     subcommand_name = 'subcommand_name'
 41 |     #variables for command line use
 42 |     output_dir_optional = '--output_dir'
 43 |     output_dir_help = """Directory path to which output results are written.
 44 |     If the directory is not existing, it will be created provided that the user
 45 |     has a privilege to do so. If this path is not provided, an output directory
 46 |     is created using the base name of the MSA file, with a prefix and/or postfix
 47 |     added to it.
 48 |     """
 49 |     verbose_optional = '--verbose'
 50 |     verbose_help = 'Show logging information on the terminal.'
 51 |     apc_optional = '--apc'
 52 |     apc_help = """Compute the average product corrected (APC) DCA score.
 53 |     """
 54 | 
 55 |     msa_file = 'msa_file'
 56 |     msa_file_help = 'Multiple sequence alignment (MSA) file in FASTA format'
 57 | 
 58 |     biomolecule = 'biomolecule'
 59 |     biomolecule_help = """Type of biomolecule.
 60 |     It should be either protein or RNA in lower or upper case letters.
 61 |     """
 62 |     refseq_file = 'refseq_file'
 63 |     refseq_file_optional = '--refseq_file'
 64 |     refseq_file_help = """FASTA formatted file containing a reference sequence.
 65 |     The reference sequence should not contain gaps or non-standard residues.
 66 |     """
 67 | 
 68 |     pseudocount_optional = '--pseudocount'
 69 |     pseudocount_help = """Relative value of the pseudocount so as to regularize
 70 |     emperical single-site and pair-site frequencies obtained from alignment data.
 71 |     Note that this is not the actual value of the pseudocount, instead, it is
 72 |     the ratio X/(X + Meff) where X is the actual pseudocount and Meff is the
 73 |     effective number of sequences.
 74 |     """
 75 | 
 76 |     seqid_optional = '--seqid'
 77 |     seqid_help = """Cut-off value of sequences similarity above which they
 78 |     are lumped together.
 79 |     """
 80 | 
 81 |     ranked_by_optional = '--ranked_by'
 82 |     ranked_by_optional_help="""Method in which DCA scores are calculated. There are
 83 |     four options: direct information (DI), Frobenius norm (FN) and their average
 84 |     product corrected forms (DI_APC, FN_APC).
 85 |     """
 86 | 
 87 |     linear_dist_optional = '--linear_dist'
 88 |     linear_dist_help="""Minimum separation beteween site pairs in sequence. 
 89 |     """
 90 |     num_site_pairs_optional = '--num_site_pairs'
 91 |     num_site_pairs_help = """The maximum number of site pairs whose couplings are 
 92 |     to be extracted.
 93 |     """
 94 | ## end of class CmdArgs
 95 | 
 96 | DCA_COMPUTATION_SUBCOMMANDS = ['compute_di', 'compute_fn','compute_couplings',
 97 |     'compute_fields', 'compute_params','compute_fi', 'compute_fij',
 98 | ]
 99 | 
100 | 
def get_mfdca_instance(msa_file, biomolecule, force_seq_type=False, **kwargs):
    """Creates a MeanFieldDCA instance from the arguments passed on the
    command line.

    Parameters
    ----------
        msa_file : str
            Path to FASTA file containing MSA data.
        biomolecule : str
            Type of biomolecule (protein or RNA)
        force_seq_type : bool
            Accepted but not used by this function.
        **kwargs : dict
            Only the following keys are read; a missing key is passed on
            as None:
            i)      seqid : The sequence identity.
            ii)     pseudocount : The relative pseudo count.

    Returns
    -------
        mfdca_inst : MeanFieldDCA
            A MeanFieldDCA instance.
    """
    return meanfield_dca.MeanFieldDCA(
        msa_file,
        biomolecule,
        pseudocount=kwargs.get('pseudocount'),
        seqid=kwargs.get('seqid'),
    )
128 | 
129 | 
130 | def execute_from_command_line(msa_file=None, biomolecule=None, seqid=None,
131 |         pseudocount=None, the_command=None, refseq_file = None,
132 |         verbose=False, output_dir=None, apc=False, ranked_by=None,
133 |         linear_dist=None, num_site_pairs=None):
134 |     """Do computations according to the parameters passed on the command line
135 | 
136 |     Parameters
137 |     ----------
138 |         msa_file : str
139 |             Path to MSA file.
140 |         biomolecule : str
141 |             Name of biomolecule (protein or RNA, case insensitive).
142 |         seqid : float
143 |             Value of sequence identity.
144 |         pseudocount : float
145 |             Value of relative pseudo count.
146 |         the_command : str
147 |             Name of the command passed from the command line.
148 |         refseq_file : str
149 |             Path to reference sequence file.
150 |         verbsoe : bool
151 |             Display logging message to the screen.
152 |         ouput_dir : str
153 |             Output directory to write ouput files.
154 |         apc : bool
155 |             Perform average product correction.
156 | 
157 |     Returns
158 |     -------
159 |         None : None 
160 |     """
161 | 
162 | 
163 |     if verbose: configure_logging()
164 | 
165 |     if the_command.strip() in DCA_COMPUTATION_SUBCOMMANDS:
166 |         mfdca_instance = get_mfdca_instance(msa_file, biomolecule, seqid=seqid,
167 |             pseudocount=pseudocount,
168 |         )
169 |         seqbackmapper = None
170 |         # update mapped_sites  when refseq is provided
171 |         if refseq_file:# do backmapping when reference sequence file is provided
172 |             seqbackmapper = SequenceBackmapper(
173 |                 alignment_data=mfdca_instance.alignment,
174 |                 refseq_file = refseq_file,
175 |                 biomolecule = mfdca_instance.biomolecule
176 |             )
177 |         param_metadata = dca_utilities.mfdca_param_metadata(mfdca_instance)
178 |         #create path to output directory is not supplied by user
179 |         if not output_dir:
180 |             msa_file_base_name, ext = os.path.splitext(os.path.basename(msa_file))
181 |             output_dir = 'MFDCA_output_' + msa_file_base_name
182 |         # create dca coutput directory
183 |         dca_utilities.create_directories(output_dir)
184 |         # compute DCA score
185 |         if the_command.strip()=='compute_di':
186 |             if apc: # do average product correction if apc is passed from the command line
187 |                 sorted_DI = mfdca_instance.compute_sorted_DI_APC(seqbackmapper=seqbackmapper)
188 |                 score_type = ' MF DI average product corrected (APC)'
189 |                 di_file_path = dca_utilities.get_dca_output_file_path(output_dir,
190 |                     msa_file, prefix='MFDCA_apc_di_scores_', postfix='.txt'
191 |                 )
192 |             else: # compute raw DCA score if apc is not asked
193 |                 sorted_DI = mfdca_instance.compute_sorted_DI(seqbackmapper=seqbackmapper)
194 |                 score_type = 'raw DI'
195 |                 di_file_path = dca_utilities.get_dca_output_file_path(output_dir,
196 |                     msa_file, prefix='MFDCA_raw_di_scores_', postfix='.txt'
197 |                 )
198 |             
199 |             dca_utilities.write_sorted_dca_scores(di_file_path,sorted_DI,
200 |                 metadata=param_metadata,
201 |                 score_type = score_type
202 |             )
203 |         # compute Frobenius norm of couplings
204 |         if the_command.strip()=='compute_fn':
205 |             if apc:
206 |                 score_type = 'MFDCA Frobenius norm, average product corrected (APC)'
207 |                 sorted_FN = mfdca_instance.compute_sorted_FN_APC(seqbackmapper=seqbackmapper)
208 |                 fn_file_path = dca_utilities.get_dca_output_file_path(output_dir,
209 |                     msa_file, prefix = 'MFDCA_apc_fn_scores_', postfix='.txt'
210 |                 )
211 |             else:
212 |                 score_type = 'MFDCA raw Frobenius norm'
213 |                 sorted_FN = mfdca_instance.compute_sorted_FN(seqbackmapper=seqbackmapper)
214 |                 fn_file_path = dca_utilities.get_dca_output_file_path(output_dir,
215 |                     msa_file, prefix = 'MFDCA_raw_fn_scores_', postfix='.txt'
216 |                 )
217 |             dca_utilities.write_sorted_dca_scores(fn_file_path, sorted_FN,
218 |                 metadata = param_metadata,
219 |                 score_type = score_type
220 |             )
221 |         # compute global probability local fields
222 |         if the_command.strip()=='compute_fields':
223 |             fields = mfdca_instance.compute_fields()
224 |             residue_repr_metadata = dca_utilities.mfdca_residue_repr_metadata(
225 |                 mfdca_instance.biomolecule
226 |             )
227 |             metadata = param_metadata + residue_repr_metadata
228 |             fields_file_path = dca_utilities.get_dca_output_file_path(output_dir,
229 |                 msa_file, prefix = 'fields_', postfix='.txt'
230 |             )
231 |             dca_utilities.write_fields_csv(fields_file_path, fields, 
232 |                 metadata=metadata,
233 |             )
234 |         # compute fields and couplings
235 |         if the_command.strip() == 'compute_params':
236 |             fields, couplings = mfdca_instance.compute_params(
237 |                 seqbackmapper = seqbackmapper,
238 |                 ranked_by = ranked_by,
239 |                 linear_dist = linear_dist,
240 |                 num_site_pairs = num_site_pairs,
241 |             )
242 |             residue_repr_metadata = dca_utilities.mfdca_residue_repr_metadata(
243 |                 mfdca_instance.biomolecule
244 |             )
245 |             metadata = param_metadata + residue_repr_metadata
246 |             # write fields to text file
247 |             fields_file_path = dca_utilities.get_dca_output_file_path(output_dir,
248 |                     msa_file, prefix = 'fields_', postfix='.txt'
249 |                 )
250 |             param_metadata.append('#\tTotal number of sites whose fields are extracted: {}'.format(len(fields))) 
251 |             dca_utilities.write_fields_csv(fields_file_path, fields, metadata=param_metadata)
252 |             couplings_file_path = dca_utilities.get_dca_output_file_path(output_dir,
253 |                     msa_file, prefix = 'couplings_', postfix='.txt'
254 |                 )
255 |             param_metadata.pop() 
256 |             param_metadata.append('#\tTotal number of site pairs whose couplings are extracted: {}'.format(len(couplings)))
257 |             if ranked_by is None: # the default is FN_APC
258 |                 ranked_by = 'FN_APC'
259 |             param_metadata.append('#\tDCA ranking method used: {}'.format(ranked_by))
260 |             if linear_dist is None: # default is |i - j| > 4
261 |                 linear_dist = 4
262 |             param_metadata.append('#\tMinimum separation beteween site pairs in sequence: |i - j| > {}'.format(linear_dist))
263 |             dca_utilities.write_couplings_csv(couplings_file_path, couplings, metadata=param_metadata)
264 | 
265 |         #Compute single site frequencies
266 |         if the_command.strip() == 'compute_fi':
267 |             #pass --pseudocount 0.0 if raw frequencies are desired
268 |             fi = mfdca_instance.get_reg_single_site_freqs()
269 |             residue_repr_metadata = dca_utilities.mfdca_residue_repr_metadata(
270 |                 mfdca_instance.biomolecule)
271 |             metadata = param_metadata + residue_repr_metadata
272 |             fi_file_path = dca_utilities.get_dca_output_file_path(output_dir,
273 |                 msa_file, prefix='fi_', postfix='.txt')
274 |             dca_utilities.write_single_site_freqs(fi_file_path, fi,
275 |                 seqs_len = mfdca_instance.sequences_len,
276 |                 num_site_states = mfdca_instance.num_site_states,
277 |                 metadata = metadata)
278 | 
279 |         #Compute pair site frequencies
280 |         if the_command.strip() == 'compute_fij':
281 |             # pass --pseudocount 0.0 to compute raw fij
282 |             file_path = dca_utilities.get_dca_output_file_path(output_dir,
283 |                 msa_file, prefix='fij_', postfix='.txt',
284 |             )
285 |             residue_repr_metadata = dca_utilities.mfdca_residue_repr_metadata(
286 |                 mfdca_instance.biomolecule,
287 |             )
288 |             metadata = param_metadata + residue_repr_metadata
289 |             fij = mfdca_instance.get_reg_pair_site_freqs()
290 |             dca_utilities.write_pair_site_freqs(file_path, fij,
291 |                 seqs_len = mfdca_instance.sequences_len,
292 |                 num_site_states = mfdca_instance.num_site_states,
293 |                 metadata = metadata,
294 |             )
295 | 
296 |     return None
297 | 
298 | 
def run_meanfield_dca():
    """Command-line entry point of the mean-field DCA computations.

    Creates one sub-command per computation (compute_di, compute_fn,
    compute_params, compute_fi and compute_fij), parses the command line and
    hands the parsed values over to execute_from_command_line(). If the
    program is started without any command-line argument, the top-level help
    message is requested instead.

    Parameters
    ----------
        All arguments to be used are captured from the command line.

    Returns
    -------
        None.
    """
    def add_shared_arguments(subparser):
        """Registers the arguments that every sub-command accepts."""
        subparser.add_argument(CmdArgs.biomolecule, help = CmdArgs.biomolecule_help)
        subparser.add_argument(CmdArgs.msa_file, help = CmdArgs.msa_file_help)
        subparser.add_argument(CmdArgs.seqid_optional, help = CmdArgs.seqid_help, type = float)
        subparser.add_argument(CmdArgs.pseudocount_optional, help=CmdArgs.pseudocount_help, type = float)
        subparser.add_argument(CmdArgs.refseq_file_optional, help = CmdArgs.refseq_file_help)
        subparser.add_argument(CmdArgs.output_dir_optional, help=CmdArgs.output_dir_help)
        subparser.add_argument(CmdArgs.verbose_optional, help=CmdArgs.verbose_help, action='store_true')
        subparser.add_argument(CmdArgs.apc_optional, help=CmdArgs.apc_help, action='store_true')

    # (sub-command name, help text) in the order they are registered
    subcommand_specs = (
        ('compute_di',
            'Computes the direct information.'
            ' Example: mfdca compute_di <biomolecule> <MSA> --verbose, where'
            ' <biomolecule> takes a value protein or rna (case insensitive)'
            ' <MSA> takes path to the MSA file'),
        ('compute_fn',
            'Compute the Frobenius norm of couplings.'
            ' Example: see compute_di'),
        ('compute_params',
            'Computes the parameters of global probability model, i.e., '
            ' couplings and fields in one run.'),
        ('compute_fi',
            'Computes regularized single-site frequencies from MSA.'
            ' If raw frequencies are desired, use --pseudocount 0'),
        ('compute_fij',
            'Computes regularized pair-site frequencies from MSA. If raw'
            ' frequenceis are desired, set the pseudocount to zero. Use --help'
            ' for more information.'),
    )

    parser = ArgumentParser()
    subparsers = parser.add_subparsers(dest = CmdArgs.subcommand_name)

    subcommand_parsers = dict()
    for subcommand, help_text in subcommand_specs:
        current_parser = subparsers.add_parser(subcommand, help = help_text)
        add_shared_arguments(current_parser)
        subcommand_parsers[subcommand] = current_parser

    # compute_params additionally controls which couplings are extracted
    params_parser = subcommand_parsers['compute_params']
    params_parser.add_argument(CmdArgs.ranked_by_optional, help=CmdArgs.ranked_by_optional_help,
        choices= ('FN', 'FN_APC', 'DI', 'DI_APC', 'fn', 'fn_apc', 'di', 'di_apc')
    )
    params_parser.add_argument(CmdArgs.linear_dist_optional, help=CmdArgs.linear_dist_help, type=int)
    params_parser.add_argument(CmdArgs.num_site_pairs_optional, help=CmdArgs.num_site_pairs_help, type=int)

    # display help if no argument is passed
    cli_args = None if sys.argv[1:] else ['--help']
    parsed = vars(parser.parse_args(args=cli_args))

    # Do the computations based on the arguments passed from the command line;
    # entries a sub-command does not define are forwarded as None.
    forwarded_names = (
        'biomolecule', 'msa_file', 'seqid', 'pseudocount', 'refseq_file',
        'verbose', 'output_dir', 'apc', 'ranked_by', 'linear_dist',
        'num_site_pairs',
    )
    call_kwargs = {name: parsed.get(name) for name in forwarded_names}
    call_kwargs['the_command'] = parsed.get('subcommand_name')
    execute_from_command_line(**call_kwargs)

    logger.info('\n\tDONE')
    return None
416 | 
417 | 
if __name__ == '__main__':
    # Run the mean-field DCA command-line interface when this file is executed
    # as __main__ (i.e. not when it is merely imported).
    run_meanfield_dca()
421 | 


--------------------------------------------------------------------------------
/pydca/msa_trimmer/__init__.py:
--------------------------------------------------------------------------------
1 | """MSA trimmer module. Provides utilities to trim MSA data by reference sequence 
2 | and/or percentage of gaps in MSA columns.
3 | """


--------------------------------------------------------------------------------
/pydca/msa_trimmer/msa_trimmer.py:
--------------------------------------------------------------------------------
  1 | from Bio import AlignIO
  2 | from ..sequence_backmapper.sequence_backmapper import SequenceBackmapper
  3 | import logging
  4 | """Trims MSA data by gap percentage or removing all gaps corresponding to best
  5 | matching sequence to a reference sequence.
  6 | 
  7 | Author: Mehari B. Zerihun
  8 | """
  9 | 
 10 | logger = logging.getLogger(__name__)
 11 | 
class MSATrimmerException(Exception):
    """Exception type used to signal errors related to MSA trimming.

    It adds nothing to the built-in Exception; it only provides a distinct
    type that callers can catch.
    """
 15 | 
 16 | class MSATrimmer:
 17 | 
 18 |     def __init__(self, msa_file, biomolecule=None,max_gap=None, refseq_file=None):
 19 |         """
 20 |         Parameters
 21 |         ----------
 22 |             self : MSATrimmer
 23 |                 An instance of MSATrimmer class
 24 |             msa_file : str
 25 |                 Path to the FASTA formatted MSA file
 26 |             biomolecule : str
 27 |                 Type of biomolecule (protein or RNA)
 28 |         """
 29 |         self.__msa_file = msa_file
 30 |         self.__refseq_file = refseq_file
 31 |         self.__max_gap = 0.5 if max_gap is None else max_gap
 32 |         if self.__max_gap > 1.0 or self.__max_gap < 0.0:
 33 |             logger.error('\n\tThe value of max_gap should be between 0 and 1')
 34 |             raise MSATrimmerException
 35 |         if biomolecule is not None:
 36 |             self.__biomolecule = biomolecule.strip().upper()
 37 |         else:
 38 |             self.__biomolecule = biomolecule
 39 |         self.__alignment_data = list(AlignIO.read(self.__msa_file, 'fasta'))
 40 | 
 41 |         logger.info('\n\tMSA file: {0}'
 42 |             '\n\tReference sequence file: {1}'
 43 |             '\n\tbiomolecule: {2}'
 44 |             ''.format(self.__msa_file, self.__refseq_file,
 45 |                 self.__biomolecule,
 46 |             )
 47 |         )
 48 |         return None
 49 | 
 50 | 
 51 |     @property
 52 |     def alignment_data(self):
 53 |         """
 54 |         """
 55 |         return self.__alignment_data
 56 | 
 57 | 
 58 |     def compute_msa_columns_gap_size(self):
 59 |         """Computes the gap size of each column in MSA
 60 | 
 61 |         Parameters
 62 |         ----------
 63 |             self : MSATrimmer
 64 |                 Instance of MSATrimmer class
 65 | 
 66 |         Returns
 67 |         -------
 68 |             msa_columns_gap_size : tuple
 69 |                 A tuple of column gap sizes. The column gap size is computed as
 70 |                 the fraction of gaps in a particular MSA column.
 71 | 
 72 |         """
 73 |         logger.info('\n\tObtaining columns containing more than {}% of gaps'.format(
 74 |             self.__max_gap * 100)
 75 |         )
 76 |         seqs_len = len(self.__alignment_data[0].seq)
 77 |         num_seqs = len(self.__alignment_data)
 78 |         logger.info('\n\tTotal number of sequences read from MSA file:{}'
 79 |             '\n\tLength of the sequences:{}'.format(num_seqs, seqs_len)
 80 |         )
 81 |         msa_columns_gap_size = list()
 82 |         for i in range(seqs_len):
 83 |             num_gaps = 0
 84 |             for record in self.__alignment_data:
 85 |                 state_i = record.seq[i]
 86 |                 if state_i == '.' or state_i == '-': num_gaps += 1
 87 |             gap_fraction_i = float(num_gaps)/float(num_seqs)
 88 |             msa_columns_gap_size.append(gap_fraction_i)
 89 |         max_gap_size = max(msa_columns_gap_size)
 90 |         min_gap_size  = min(msa_columns_gap_size)
 91 |         logger.info('\n\tMinimum and maximum gap percentages, respectively:'
 92 |             '{0:.2f}% and {1:.2f}%'.format(max_gap_size * 100, min_gap_size * 100)
 93 |         )
 94 |         return tuple(msa_columns_gap_size)
 95 | 
 96 | 
 97 |     def msa_columns_beyond_max_gap(self):
 98 |         """Obtains the columns in MSA tha contain more than the given fraction of
 99 |         gaps treshold.
100 | 
101 |         Parameters
102 |         ----------
103 |             self : MSATrimmer
104 |                 An instance of MSATrimmer class
105 | 
106 |         Returns
107 |         -------
108 |             msa_columns_beyond_max_gap : tuple
109 |                 A tuple of MSA columns that contain fraction of gaps beyond the
110 |                 max_gap
111 |         """
112 |         columns_gap_size = self.compute_msa_columns_gap_size()
113 |         seqs_len = len(self.__alignment_data[0].seq)
114 |         msa_columns_beyond_max_gap = [
115 |             i for i in range(seqs_len) if columns_gap_size[i] > self.__max_gap
116 |         ]
117 |         return tuple(msa_columns_beyond_max_gap)
118 | 
119 | 
120 |     def trim_by_gap_size(self):
121 |         """Returns a tuple of MSA columns that have beyond self.__max_gap gap
122 |         fraction.
123 | 
124 |         Parameters
125 |         ---------
126 |             self : MSATrimmer
127 |                 An instance of MSATrimmer class
128 | 
129 |         Returns
130 |         -------
131 |             columns_to_remove : tuple
132 |                 A tuple containing columns that are going to to trimmed. These
133 |                 are MSA columns that have a gap fraction beyond self.__max_gap.
134 |         """
135 |         columns_to_remove = self.msa_columns_beyond_max_gap()
136 |         return tuple(columns_to_remove)
137 | 
138 | 
139 |     def trim_by_refseq(self, remove_all_gaps=False):
140 |         """Obtains columns in MSA that contain gaps more that the gap treshold
141 |         and do not involve residues in the best matchin sequence with reference.
142 |         If remove_all_gaps is set True, all columns involving gaps in the matching
143 |         sequence to reference are removed.
144 | 
145 |         Parameters
146 |         ----------
147 |             self : MSATrimmer
148 |                 An instance of MSATrimmer
149 |             remove_all_gaps : bool
150 |                 If set to True, all columns with gaps in the matching sequence
151 |                 with the reference are removed.
152 | 
153 |         Returns
154 |         -------
155 |             columns_to_remove : tuple
156 |                 A tuple of MSA column positions. These columns are going to
157 |                 be removed from the MSA.
158 |         """
159 |         seqbackmapper = SequenceBackmapper(msa_file = self.__msa_file,
160 |             refseq_file = self.__refseq_file,
161 |             biomolecule = self.__biomolecule,
162 |         )
163 |         matching_seqs = seqbackmapper.find_matching_seqs_from_alignment()
164 |         logger.info('\n\tRemoving gapped columns corresponding to best'
165 |             ' matching sequence to the reference'
166 |         )
167 |         first_matching_seq = matching_seqs[0]
168 |         logger.info('\n\tSequence in MSA that matches the reference'
169 |             '\n\t{}'.format(first_matching_seq)
170 |         )
171 | 
172 |         gap_symbols = ['-', '.']
173 |         if not remove_all_gaps:
174 |             candidate_columns_to_remove = self.msa_columns_beyond_max_gap()
175 |             # find out MSA columns that does correspond to gaps w.r.t the sequence
176 |             # in MSA that matches with the reference
177 |             logger.info('\n\tNumber of columns with more than {0:.2f}% gaps:{1}'
178 |                 ''.format(self.__max_gap* 100, len(candidate_columns_to_remove))
179 |             )
180 |             columns_to_remove = [
181 |                 i for i in candidate_columns_to_remove if first_matching_seq[i] in gap_symbols
182 |             ]
183 |             logger.info('\n\tNumber of columns to remove: {}'.format(len(columns_to_remove)))
184 |         else: # if remove all gaps
185 |             logger.info('\n\tRemoving all columns corresponding to gaps in the matching sequence')
186 |             seqs_len = len(self.__alignment_data[0].seq)
187 |             columns_to_remove = [
188 |                 i for i in range(seqs_len) if first_matching_seq[i] in gap_symbols
189 |             ]
190 |             logger.info('\n\tNumber of columns to be removed from MSA:{}'.format(
191 |                 len(columns_to_remove))
192 |             )
193 | 
194 |         return tuple(columns_to_remove)
195 | 
196 |     
197 |     def get_msa_trimmed_by_refseq(self, remove_all_gaps=False):
198 |         """
199 |         """
200 |         columns_to_remove = self.trim_by_refseq(remove_all_gaps=remove_all_gaps)
201 |         trimmed_msa = list()
202 |         for record in self.__alignment_data:
203 |             seq, seqid = record.seq, record.id
204 |             trimmed_seq = [seq[i] for i in range(len(seq)) if i not in columns_to_remove]
205 |             id_seq_pair = seqid, ''.join(trimmed_seq) 
206 |             trimmed_msa.append(id_seq_pair)
207 |         return trimmed_msa
208 | 
209 | 


--------------------------------------------------------------------------------
/pydca/plmdca/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | pseudolikelihood maximization direct coupling analysis (plmDCA) module for 
3 | protein and RNA multiple sequence alignments. 
4 | """


--------------------------------------------------------------------------------
/pydca/plmdca/include/plmdca.h:
--------------------------------------------------------------------------------
 1 | #ifndef PLMDCA_BACKEND_H
 2 | #define PLMDCA_BACKEND_H
 3 |     #include<cstdio>    
 4 |     #include<cstdlib>
 5 |     #include<fstream>
 6 |     #include<iostream>
 7 |     #include<string>
 8 |     #include<vector>
 9 |     #include<unordered_map>
10 |     #include<algorithm>
11 |     #include<cmath>
12 |     #include<numeric>
13 |     #include"../lbfgs/include/lbfgs.h"
14 | 
15 | 
    // Interface of the C++ backend of the plmDCA (pseudolikelihood
    // maximization direct coupling analysis) module. Only declarations live
    // here; the member function definitions are in other translation units, so
    // the notes below about what individual members do are assumptions drawn
    // from their names and signatures -- confirm against the definitions.
    class PlmDCA {

        public:
            
            
            // Constructor. The m_* parameters mirror the private data members
            // of the same name (msa_file, biomolecule, seqs_len, ...).
            // NOTE(review): msa_file is kept as a raw `const char*` member; if
            // the constructor stores the pointer as-is, the caller's string
            // must outlive this object -- confirm.
            PlmDCA(const char* m_msa_file, unsigned int m_biomolecule, unsigned int m_seqs_len, 
                unsigned int m_num_site_states, float m_seqid, float m_lambda_h, 
                float m_lambda_J, unsigned int num_threads
            );

            //float value(const TVector &fields_and_couplings);
            // Takes the current parameters and an output buffer `grad`;
            // presumably returns the objective value and fills `grad` with
            // its gradient -- TODO confirm.
            float gradient(const float* fields_and_couplings, float* grad);

            // Writes into the caller-provided parameter buffer (presumably
            // initial values for the optimization -- verify).
            void initFieldsAndCouplings(float* fields_and_couplings);
            // Frequency accessors: the return types show a 2-D table for
            // single-site frequencies, a flat vector for pair-site
            // frequencies and a 4-D nested vector for the "fragmented" form.
            std::vector<std::vector<float>> getSingleSiteFreqs();
            std::vector<float> getPairSiteFreqs();
            std::vector<std::vector<std::vector<std::vector<float>>>> getPairSiteFreqsFragmented();
            // Builds a 4-D nested vector from four sizes (initial element
            // values are not visible from this header).
            std::vector<std::vector<std::vector<std::vector<float>>>> fourDimVecFactory(
                unsigned int const vec_size_1, unsigned int const vec_size_2, 
                unsigned int const vec_size_3, unsigned int const vec_size_4
            );
            // print*/test* members look like debugging helpers -- verify.
            void printPairSiteFreqsMultiThreaded();
            void printPairSiteFreqsFragmented();
            // mapIndex* members take site/state indices and return an
            // unsigned offset; presumably the position inside the
            // corresponding flat array -- TODO confirm the layout.
            unsigned int mapIndexPairSiteFreqs(const unsigned int, const unsigned int,  
                const unsigned int, const unsigned int
            );
            unsigned int mapIndexPairSiteFreqsLocal(const unsigned int, const unsigned int,
                const unsigned int
            );
            void printMapIndexPairSiteFreqsLocal(const unsigned int);
            void testSingleSiteFreqs();
            // Returns the sequences as rows of unsigned integers (the same
            // type as the seqs_int_form member).
            std::vector<std::vector<unsigned int>> readSequencesFromFile();
            unsigned int mapIndexCouplings(const unsigned  int i, const unsigned int j, 
                const unsigned int a, const unsigned int b
            );
            unsigned int mapIndexCouplingsOneSite(const unsigned int j, 
                const unsigned int a, const unsigned int b
            );
            unsigned int mapIndexFields(const unsigned int i, const unsigned int a);
            void printIndexMappingFields();

            void printIndexMappingCouplings();

            // Returns one float per sequence (same type as seqs_weight).
            std::vector<float> computeSeqsWeight();
            void printWeights();
            void runPlmDCALocal(unsigned int num_iteration);
            // NOTE(review): returns a raw float*; ownership of the returned
            // buffer cannot be determined from this header -- confirm who
            // frees it.
            float* computeGradient();
            //void printDCAScores(float* h_and_J);
            // Returns an int, presumably a status code of the optimization
            // run -- verify.
            int runPlmDCA();

            void printSeqs();
            
        private:
            // Values handed to the constructor.
            const char* msa_file;
            unsigned int biomolecule;
            unsigned int seqs_len;
            unsigned int  num_site_states;
            float seqid;
            float lambda_h;
            float lambda_J;
            // State presumably derived from the MSA and the settings above
            // (how and when these are filled is not visible here).
            std::vector<std::vector<unsigned int>> seqs_int_form;
            unsigned int num_seqs;
            std::vector<float> seqs_weight;
            unsigned int  num_fields;
            unsigned int num_couplings;
            unsigned int num_fields_and_couplings;
            unsigned int  num_threads;
            float  Meff;
    };
85 | #endif
86 | 


--------------------------------------------------------------------------------
/pydca/plmdca/lbfgs/include/arithmetic_ansi.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *      ANSI C implementation of vector operations.
  3 |  *
  4 |  * Copyright (c) 2007-2010 Naoaki Okazaki
  5 |  * All rights reserved.
  6 |  *
  7 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  8 |  * of this software and associated documentation files (the "Software"), to deal
  9 |  * in the Software without restriction, including without limitation the rights
 10 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 |  * copies of the Software, and to permit persons to whom the Software is
 12 |  * furnished to do so, subject to the following conditions:
 13 |  *
 14 |  * The above copyright notice and this permission notice shall be included in
 15 |  * all copies or substantial portions of the Software.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 23 |  * THE SOFTWARE.
 24 |  */
 25 | 
 26 | /* $Id$ */
 27 | 
 28 | #include <stdlib.h>
 29 | #include <memory.h>
 30 | 
/*
 * fsigndiff(x, y): x and y are POINTERS to floating point values; the macro
 * evaluates to non-zero when the two values have different signs.
 *
 * - 32-bit IEEE build: the bit patterns are reinterpreted as uint32_t, XOR-ed
 *   and masked with the sign bit (0x80000000).
 * - otherwise: *x is multiplied by (*y / |*y|), i.e. by +1 or -1 according to
 *   the sign of *y, and the product is tested for being negative.
 *   NOTE(review): this branch divides by fabs(*y); for *y == 0 the quotient
 *   is 0/0 -- confirm callers never pass a zero there.
 *
 * Both arguments are evaluated more than once in the second branch. The macro
 * relies on uint32_t / fabs being declared by the including file.
 */
#if     LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
#endif/*LBFGS_IEEE_FLOAT*/
 36 | 
 37 | inline static void* vecalloc(size_t size)
 38 | {
 39 |     void *memblock = malloc(size);
 40 |     if (memblock) {
 41 |         memset(memblock, 0, size);
 42 |     }
 43 |     return memblock;
 44 | }
 45 | 
/*
 * Releases a block obtained from vecalloc(). Forwards directly to free(),
 * so passing NULL is a no-op.
 */
inline static void vecfree(void *memblock)
{
    free(memblock);
}
 50 | 
 51 | inline static void vecset(lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
 52 | {
 53 |     int i;
 54 |     
 55 |     for (i = 0;i < n;++i) {
 56 |         x[i] = c;
 57 |     }
 58 | }
 59 | 
 60 | inline static void veccpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
 61 | {
 62 |     int i;
 63 | 
 64 |     for (i = 0;i < n;++i) {
 65 |         y[i] = x[i];
 66 |     }
 67 | }
 68 | 
 69 | inline static void vecncpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
 70 | {
 71 |     int i;
 72 | 
 73 |     for (i = 0;i < n;++i) {
 74 |         y[i] = -x[i];
 75 |     }
 76 | }
 77 | 
 78 | inline static void vecadd(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
 79 | {
 80 |     int i;
 81 | 
 82 |     for (i = 0;i < n;++i) {
 83 |         y[i] += c * x[i];
 84 |     }
 85 | }
 86 | 
 87 | inline static void vecdiff(lbfgsfloatval_t *z, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
 88 | {
 89 |     int i;
 90 | 
 91 |     for (i = 0;i < n;++i) {
 92 |         z[i] = x[i] - y[i];
 93 |     }
 94 | }
 95 | 
 96 | inline static void vecscale(lbfgsfloatval_t *y, const lbfgsfloatval_t c, const int n)
 97 | {
 98 |     int i;
 99 | 
100 |     for (i = 0;i < n;++i) {
101 |         y[i] *= c;
102 |     }
103 | }
104 | 
105 | inline static void vecmul(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
106 | {
107 |     int i;
108 | 
109 |     for (i = 0;i < n;++i) {
110 |         y[i] *= x[i];
111 |     }
112 | }
113 | 
114 | inline static void vecdot(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
115 | {
116 |     int i;
117 |     *s = 0.;
118 |     for (i = 0;i < n;++i) {
119 |         *s += x[i] * y[i];
120 |     }
121 | }
122 | 
/*
 * Euclidean (2-) norm: stores sqrt(x . x) over the first n elements into *s.
 * The dot product is computed by vecdot(); sqrt() is applied to the result
 * and cast back to lbfgsfloatval_t.
 */
inline static void vec2norm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vecdot(s, x, x, n);
    *s = (lbfgsfloatval_t)sqrt(*s);
}
128 | 
/*
 * Reciprocal of the Euclidean norm: stores 1 / ||x|| (first n elements)
 * into *s, using vec2norm() for the norm.
 * NOTE(review): there is no guard for a zero norm (all-zero x or n <= 0), in
 * which case this divides by zero -- confirm callers exclude that case.
 */
inline static void vec2norminv(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    vec2norm(s, x, n);
    *s = (lbfgsfloatval_t)(1.0 / *s);
}
134 | 


--------------------------------------------------------------------------------
/pydca/plmdca/lbfgs/include/arithmetic_sse_double.h:
--------------------------------------------------------------------------------
  1 | /*
 *      SSE2 implementation of vector operations (64bit double).
  3 |  *
  4 |  * Copyright (c) 2007-2010 Naoaki Okazaki
  5 |  * All rights reserved.
  6 |  *
  7 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  8 |  * of this software and associated documentation files (the "Software"), to deal
  9 |  * in the Software without restriction, including without limitation the rights
 10 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 |  * copies of the Software, and to permit persons to whom the Software is
 12 |  * furnished to do so, subject to the following conditions:
 13 |  *
 14 |  * The above copyright notice and this permission notice shall be included in
 15 |  * all copies or substantial portions of the Software.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 23 |  * THE SOFTWARE.
 24 |  */
 25 | 
 26 | /* $Id$ */
 27 | 
 28 | #include <stdlib.h>
 29 | #ifndef __APPLE__
 30 | #include <malloc.h>
 31 | #endif
 32 | #include <memory.h>
 33 | 
 34 | #if     1400 <= _MSC_VER
 35 | #include <intrin.h>
 36 | #endif/*1400 <= _MSC_VER*/
 37 | 
 38 | #if     HAVE_EMMINTRIN_H
 39 | #include <emmintrin.h>
 40 | #endif/*HAVE_EMMINTRIN_H*/
 41 | 
 42 | inline static void* vecalloc(size_t size)
 43 | {
 44 | #if     defined(_MSC_VER)
 45 |     void *memblock = _aligned_malloc(size, 16);
 46 | #elif   defined(__APPLE__)  /* OS X always aligns on 16-byte boundaries */
 47 |     void *memblock = malloc(size);
 48 | #else
 49 |     void *memblock = NULL, *p = NULL;
 50 |     if (posix_memalign(&p, 16, size) == 0) {
 51 |         memblock = p;
 52 |     }
 53 | #endif
 54 |     if (memblock != NULL) {
 55 |         memset(memblock, 0, size);
 56 |     }
 57 |     return memblock;
 58 | }
 59 | 
/*
 * Releases a block obtained from vecalloc(). On MSVC the block comes from
 * _aligned_malloc() and therefore has to go through _aligned_free(); on all
 * other platforms free() is used.
 */
inline static void vecfree(void *memblock)
{
#ifdef	_MSC_VER
    _aligned_free(memblock);
#else
    free(memblock);
#endif
}
 68 | 
/*
 * fsigndiff(x, y): x and y are POINTERS to doubles; evaluates to non-zero
 * when the two values have different sign bits.
 * _mm_movemask_pd() collects the two sign bits into a 2-bit mask (0..3).
 * After adding 1, bit 1 (0x002) is set only for the masks 1 and 2, i.e. when
 * exactly one of the two sign bits is set.
 */
#define fsigndiff(x, y) \
    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)
 71 | 
/*
 * vecset(x, c, n): fills x with the double c using SSE2 stores.
 *
 * The loop advances 8 doubles per iteration (four 128-bit stores) and runs
 * while i < n, so it writes up to the next multiple of 8 at or above n; the
 * buffer must be padded accordingly. _mm_store_pd is the aligned store, so x
 * is expected to be 16-byte aligned (as returned by vecalloc).
 * The macro declares its own `int i` and XMM0 inside a block; argument
 * expressions must not refer to variables with those names.
 */
#define vecset(x, c, n) \
{ \
    int i; \
    __m128d XMM0 = _mm_set1_pd(c); \
    for (i = 0;i < (n);i += 8) { \
        _mm_store_pd((x)+i  , XMM0); \
        _mm_store_pd((x)+i+2, XMM0); \
        _mm_store_pd((x)+i+4, XMM0); \
        _mm_store_pd((x)+i+6, XMM0); \
    } \
}
 83 | 
 84 | #define veccpy(y, x, n) \
 85 | { \
 86 |     int i; \
 87 |     for (i = 0;i < (n);i += 8) { \
 88 |         __m128d XMM0 = _mm_load_pd((x)+i  ); \
 89 |         __m128d XMM1 = _mm_load_pd((x)+i+2); \
 90 |         __m128d XMM2 = _mm_load_pd((x)+i+4); \
 91 |         __m128d XMM3 = _mm_load_pd((x)+i+6); \
 92 |         _mm_store_pd((y)+i  , XMM0); \
 93 |         _mm_store_pd((y)+i+2, XMM1); \
 94 |         _mm_store_pd((y)+i+4, XMM2); \
 95 |         _mm_store_pd((y)+i+6, XMM3); \
 96 |     } \
 97 | }
 98 | 
 99 | #define vecncpy(y, x, n) \
100 | { \
101 |     int i; \
102 |     for (i = 0;i < (n);i += 8) { \
103 |         __m128d XMM0 = _mm_setzero_pd(); \
104 |         __m128d XMM1 = _mm_setzero_pd(); \
105 |         __m128d XMM2 = _mm_setzero_pd(); \
106 |         __m128d XMM3 = _mm_setzero_pd(); \
107 |         __m128d XMM4 = _mm_load_pd((x)+i  ); \
108 |         __m128d XMM5 = _mm_load_pd((x)+i+2); \
109 |         __m128d XMM6 = _mm_load_pd((x)+i+4); \
110 |         __m128d XMM7 = _mm_load_pd((x)+i+6); \
111 |         XMM0 = _mm_sub_pd(XMM0, XMM4); \
112 |         XMM1 = _mm_sub_pd(XMM1, XMM5); \
113 |         XMM2 = _mm_sub_pd(XMM2, XMM6); \
114 |         XMM3 = _mm_sub_pd(XMM3, XMM7); \
115 |         _mm_store_pd((y)+i  , XMM0); \
116 |         _mm_store_pd((y)+i+2, XMM1); \
117 |         _mm_store_pd((y)+i+4, XMM2); \
118 |         _mm_store_pd((y)+i+6, XMM3); \
119 |     } \
120 | }
121 | 
/*
 * vecadd(y, x, c, n): y[k] += c * x[k] for k in [0, n).
 *
 * Both pointers must be 16-byte aligned. Each pass handles 4 doubles, so n is
 * expected to be a multiple of 4.
 */
#define vecadd(y, x, c, n) \
{ \
    int k_; \
    const __m128d scale_ = _mm_set1_pd(c); \
    for (k_ = 0; k_ < (n); k_ += 4) { \
        const __m128d xs0_ = _mm_load_pd((x)+k_  ); \
        const __m128d xs1_ = _mm_load_pd((x)+k_+2); \
        const __m128d ys0_ = _mm_load_pd((y)+k_  ); \
        const __m128d ys1_ = _mm_load_pd((y)+k_+2); \
        _mm_store_pd((y)+k_  , _mm_add_pd(ys0_, _mm_mul_pd(xs0_, scale_))); \
        _mm_store_pd((y)+k_+2, _mm_add_pd(ys1_, _mm_mul_pd(xs1_, scale_))); \
    } \
}
139 | 
/*
 * vecdiff(z, x, y, n): z[k] = x[k] - y[k] for k in [0, n).
 *
 * All three pointers must be 16-byte aligned. Each pass handles 8 doubles
 * (all loads happen before the stores), so n is expected to be a multiple
 * of 8.
 */
#define vecdiff(z, x, y, n) \
{ \
    int k_; \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128d lhs0_ = _mm_load_pd((x)+k_  ); \
        const __m128d lhs1_ = _mm_load_pd((x)+k_+2); \
        const __m128d lhs2_ = _mm_load_pd((x)+k_+4); \
        const __m128d lhs3_ = _mm_load_pd((x)+k_+6); \
        const __m128d rhs0_ = _mm_load_pd((y)+k_  ); \
        const __m128d rhs1_ = _mm_load_pd((y)+k_+2); \
        const __m128d rhs2_ = _mm_load_pd((y)+k_+4); \
        const __m128d rhs3_ = _mm_load_pd((y)+k_+6); \
        _mm_store_pd((z)+k_  , _mm_sub_pd(lhs0_, rhs0_)); \
        _mm_store_pd((z)+k_+2, _mm_sub_pd(lhs1_, rhs1_)); \
        _mm_store_pd((z)+k_+4, _mm_sub_pd(lhs2_, rhs2_)); \
        _mm_store_pd((z)+k_+6, _mm_sub_pd(lhs3_, rhs3_)); \
    } \
}
162 | 
/*
 * vecscale(y, c, n): y[k] *= c for k in [0, n).
 *
 * y must be 16-byte aligned. Each pass handles 4 doubles, so n is expected
 * to be a multiple of 4.
 */
#define vecscale(y, c, n) \
{ \
    int k_; \
    const __m128d factor_ = _mm_set1_pd(c); \
    for (k_ = 0; k_ < (n); k_ += 4) { \
        const __m128d lo_ = _mm_load_pd((y)+k_  ); \
        const __m128d hi_ = _mm_load_pd((y)+k_+2); \
        _mm_store_pd((y)+k_  , _mm_mul_pd(lo_, factor_)); \
        _mm_store_pd((y)+k_+2, _mm_mul_pd(hi_, factor_)); \
    } \
}
176 | 
/*
 * vecmul(y, x, n): element-wise product, y[k] *= x[k] for k in [0, n).
 *
 * Both pointers must be 16-byte aligned. Each pass handles 8 doubles (all
 * loads happen before the stores), so n is expected to be a multiple of 8.
 */
#define vecmul(y, x, n) \
{ \
    int k_; \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128d xs0_ = _mm_load_pd((x)+k_  ); \
        const __m128d xs1_ = _mm_load_pd((x)+k_+2); \
        const __m128d xs2_ = _mm_load_pd((x)+k_+4); \
        const __m128d xs3_ = _mm_load_pd((x)+k_+6); \
        const __m128d ys0_ = _mm_load_pd((y)+k_  ); \
        const __m128d ys1_ = _mm_load_pd((y)+k_+2); \
        const __m128d ys2_ = _mm_load_pd((y)+k_+4); \
        const __m128d ys3_ = _mm_load_pd((y)+k_+6); \
        _mm_store_pd((y)+k_  , _mm_mul_pd(ys0_, xs0_)); \
        _mm_store_pd((y)+k_+2, _mm_mul_pd(ys1_, xs1_)); \
        _mm_store_pd((y)+k_+4, _mm_mul_pd(ys2_, xs2_)); \
        _mm_store_pd((y)+k_+6, _mm_mul_pd(ys3_, xs3_)); \
    } \
}
199 | 
200 | 
201 | 
/*
 * __horizontal_sum(r, rw): replace the single-precision register r by the
 * sum of its four lanes (broadcast to every lane). rw is a scratch register.
 *
 * Note that this helper works on __m128 (float) registers; none of the
 * double-precision macros visible in this header invoke it.
 */
#if     3 <= __SSE__ || defined(__SSE3__)
/*
    SSE3 variant: two haddps steps fold the four lanes. The work register
    (rw) is left untouched.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Plain SSE variant: add the register to a copy of itself with the lane
    pairs swapped, then to a copy with neighbouring lanes swapped. The work
    register (rw) receives the intermediate copies.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_add_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 0, 3, 2)), rw); \
    rw = r; \
    r = _mm_add_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 3, 0, 1)), rw);

#endif
224 | 
/*
 * vecdot(s, x, y, n): *s = sum over k in [0, n) of x[k] * y[k].
 *
 * Two independent accumulators collect the products of the lower and upper
 * pair of each 4-double pass; they are merged and folded to a scalar at the
 * end. x and y must be 16-byte aligned; n is expected to be a multiple of 4.
 */
#define vecdot(s, x, y, n) \
{ \
    int k_; \
    __m128d acc0_ = _mm_setzero_pd(); \
    __m128d acc1_ = _mm_setzero_pd(); \
    for (k_ = 0; k_ < (n); k_ += 4) { \
        const __m128d xs0_ = _mm_load_pd((x)+k_  ); \
        const __m128d xs1_ = _mm_load_pd((x)+k_+2); \
        const __m128d ys0_ = _mm_load_pd((y)+k_  ); \
        const __m128d ys1_ = _mm_load_pd((y)+k_+2); \
        acc0_ = _mm_add_pd(acc0_, _mm_mul_pd(xs0_, ys0_)); \
        acc1_ = _mm_add_pd(acc1_, _mm_mul_pd(xs1_, ys1_)); \
    } \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    acc1_ = _mm_shuffle_pd(acc0_, acc0_, _MM_SHUFFLE2(1, 1)); \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    _mm_store_sd((s), acc0_); \
}
246 | 
/*
 * vec2norm(s, x, n): *s = sqrt(sum over k in [0, n) of x[k]^2), i.e. the
 * Euclidean norm of x.
 *
 * x must be 16-byte aligned; n is expected to be a multiple of 4 (4 doubles
 * per pass, split over two accumulators).
 */
#define vec2norm(s, x, n) \
{ \
    int k_; \
    __m128d acc0_ = _mm_setzero_pd(); \
    __m128d acc1_ = _mm_setzero_pd(); \
    for (k_ = 0; k_ < (n); k_ += 4) { \
        const __m128d lo_ = _mm_load_pd((x)+k_  ); \
        const __m128d hi_ = _mm_load_pd((x)+k_+2); \
        acc0_ = _mm_add_pd(acc0_, _mm_mul_pd(lo_, lo_)); \
        acc1_ = _mm_add_pd(acc1_, _mm_mul_pd(hi_, hi_)); \
    } \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    acc1_ = _mm_shuffle_pd(acc0_, acc0_, _MM_SHUFFLE2(1, 1)); \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    _mm_store_sd((s), _mm_sqrt_pd(acc0_)); \
}
269 | 
270 | 
/*
 * vec2norminv(s, x, n): *s = 1 / sqrt(sum over k in [0, n) of x[k]^2), i.e.
 * the reciprocal of the Euclidean norm of x.
 *
 * x must be 16-byte aligned; n is expected to be a multiple of 4. No special
 * handling exists for a zero vector: the division is 1.0 / 0.0.
 */
#define vec2norminv(s, x, n) \
{ \
    int k_; \
    __m128d acc0_ = _mm_setzero_pd(); \
    __m128d acc1_ = _mm_setzero_pd(); \
    for (k_ = 0; k_ < (n); k_ += 4) { \
        const __m128d lo_ = _mm_load_pd((x)+k_  ); \
        const __m128d hi_ = _mm_load_pd((x)+k_+2); \
        acc0_ = _mm_add_pd(acc0_, _mm_mul_pd(lo_, lo_)); \
        acc1_ = _mm_add_pd(acc1_, _mm_mul_pd(hi_, hi_)); \
    } \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    acc1_ = _mm_shuffle_pd(acc0_, acc0_, _MM_SHUFFLE2(1, 1)); \
    acc0_ = _mm_add_pd(acc0_, acc1_); \
    acc0_ = _mm_sqrt_pd(acc0_); \
    _mm_store_sd((s), _mm_div_pd(_mm_set1_pd(1.0), acc0_)); \
}
295 | 


--------------------------------------------------------------------------------
/pydca/plmdca/lbfgs/include/arithmetic_sse_float.h:
--------------------------------------------------------------------------------
  1 | /*
  2 |  *      SSE/SSE3 implementation of vector operations (32bit float).
  3 |  *
  4 |  * Copyright (c) 2007-2010 Naoaki Okazaki
  5 |  * All rights reserved.
  6 |  *
  7 |  * Permission is hereby granted, free of charge, to any person obtaining a copy
  8 |  * of this software and associated documentation files (the "Software"), to deal
  9 |  * in the Software without restriction, including without limitation the rights
 10 |  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 11 |  * copies of the Software, and to permit persons to whom the Software is
 12 |  * furnished to do so, subject to the following conditions:
 13 |  *
 14 |  * The above copyright notice and this permission notice shall be included in
 15 |  * all copies or substantial portions of the Software.
 16 |  *
 17 |  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 18 |  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 19 |  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 20 |  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 21 |  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 22 |  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 23 |  * THE SOFTWARE.
 24 |  */
 25 | 
 26 | /* $Id$ */
 27 | 
 28 | #include <stdlib.h>
 29 | #ifndef __APPLE__
 30 | #include <malloc.h>
 31 | #endif
 32 | #include <memory.h>
 33 | 
 34 | #if     1400 <= _MSC_VER
 35 | #include <intrin.h>
 36 | #endif/*_MSC_VER*/
 37 | 
 38 | #if     HAVE_XMMINTRIN_H
 39 | #include <xmmintrin.h>
 40 | #endif/*HAVE_XMMINTRIN_H*/
 41 | 
 42 | #if     LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
 43 | #define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
 44 | #else
 45 | #define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
 46 | #endif/*LBFGS_IEEE_FLOAT*/
 47 | 
 48 | inline static void* vecalloc(size_t size)
 49 | {
 50 | #if     defined(_MSC_VER)
 51 |     void *memblock = _aligned_malloc(size, 16);
 52 | #elif   defined(__APPLE__)  /* OS X always aligns on 16-byte boundaries */
 53 |     void *memblock = malloc(size);
 54 | #else
 55 |     void *memblock = NULL, *p = NULL;
 56 |     if (posix_memalign(&p, 16, size) == 0) {
 57 |         memblock = p;
 58 |     }
 59 | #endif
 60 |     if (memblock != NULL) {
 61 |         memset(memblock, 0, size);
 62 |     }
 63 |     return memblock;
 64 | }
 65 | 
 66 | inline static void vecfree(void *memblock)
 67 | {
 68 |     _aligned_free(memblock);
 69 | }
 70 | 
 71 | #define vecset(x, c, n) \
 72 | { \
 73 |     int i; \
 74 |     __m128 XMM0 = _mm_set_ps1(c); \
 75 |     for (i = 0;i < (n);i += 16) { \
 76 |         _mm_store_ps((x)+i   , XMM0); \
 77 |         _mm_store_ps((x)+i+ 4, XMM0); \
 78 |         _mm_store_ps((x)+i+ 8, XMM0); \
 79 |         _mm_store_ps((x)+i+12, XMM0); \
 80 |     } \
 81 | }
 82 | 
 83 | #define veccpy(y, x, n) \
 84 | { \
 85 |     int i; \
 86 |     for (i = 0;i < (n);i += 16) { \
 87 |         __m128 XMM0 = _mm_load_ps((x)+i   ); \
 88 |         __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
 89 |         __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
 90 |         __m128 XMM3 = _mm_load_ps((x)+i+12); \
 91 |         _mm_store_ps((y)+i   , XMM0); \
 92 |         _mm_store_ps((y)+i+ 4, XMM1); \
 93 |         _mm_store_ps((y)+i+ 8, XMM2); \
 94 |         _mm_store_ps((y)+i+12, XMM3); \
 95 |     } \
 96 | }
 97 | 
 98 | #define vecncpy(y, x, n) \
 99 | { \
100 |     int i; \
101 |     const uint32_t mask = 0x80000000; \
102 |     __m128 XMM4 = _mm_load_ps1((float*)&mask); \
103 |     for (i = 0;i < (n);i += 16) { \
104 |         __m128 XMM0 = _mm_load_ps((x)+i   ); \
105 |         __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
106 |         __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
107 |         __m128 XMM3 = _mm_load_ps((x)+i+12); \
108 |         XMM0 = _mm_xor_ps(XMM0, XMM4); \
109 |         XMM1 = _mm_xor_ps(XMM1, XMM4); \
110 |         XMM2 = _mm_xor_ps(XMM2, XMM4); \
111 |         XMM3 = _mm_xor_ps(XMM3, XMM4); \
112 |         _mm_store_ps((y)+i   , XMM0); \
113 |         _mm_store_ps((y)+i+ 4, XMM1); \
114 |         _mm_store_ps((y)+i+ 8, XMM2); \
115 |         _mm_store_ps((y)+i+12, XMM3); \
116 |     } \
117 | }
118 | 
/*
 * vecadd(y, x, c, n): y[k] += c * x[k] for k in [0, n).
 *
 * Both pointers must be 16-byte aligned. Each pass handles 8 floats, so n is
 * expected to be a multiple of 8.
 */
#define vecadd(y, x, c, n) \
{ \
    int k_; \
    const __m128 scale_ = _mm_set_ps1(c); \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128 xs0_ = _mm_load_ps((x)+k_  ); \
        const __m128 xs1_ = _mm_load_ps((x)+k_+4); \
        const __m128 ys0_ = _mm_load_ps((y)+k_  ); \
        const __m128 ys1_ = _mm_load_ps((y)+k_+4); \
        _mm_store_ps((y)+k_  , _mm_add_ps(ys0_, _mm_mul_ps(xs0_, scale_))); \
        _mm_store_ps((y)+k_+4, _mm_add_ps(ys1_, _mm_mul_ps(xs1_, scale_))); \
    } \
}
136 | 
/*
 * vecdiff(z, x, y, n): z[k] = x[k] - y[k] for k in [0, n).
 *
 * All three pointers must be 16-byte aligned. Each pass handles 16 floats
 * (all loads happen before the stores), so n is expected to be a multiple
 * of 16.
 */
#define vecdiff(z, x, y, n) \
{ \
    int k_; \
    for (k_ = 0; k_ < (n); k_ += 16) { \
        const __m128 lhs0_ = _mm_load_ps((x)+k_   ); \
        const __m128 lhs1_ = _mm_load_ps((x)+k_+ 4); \
        const __m128 lhs2_ = _mm_load_ps((x)+k_+ 8); \
        const __m128 lhs3_ = _mm_load_ps((x)+k_+12); \
        const __m128 rhs0_ = _mm_load_ps((y)+k_   ); \
        const __m128 rhs1_ = _mm_load_ps((y)+k_+ 4); \
        const __m128 rhs2_ = _mm_load_ps((y)+k_+ 8); \
        const __m128 rhs3_ = _mm_load_ps((y)+k_+12); \
        _mm_store_ps((z)+k_   , _mm_sub_ps(lhs0_, rhs0_)); \
        _mm_store_ps((z)+k_+ 4, _mm_sub_ps(lhs1_, rhs1_)); \
        _mm_store_ps((z)+k_+ 8, _mm_sub_ps(lhs2_, rhs2_)); \
        _mm_store_ps((z)+k_+12, _mm_sub_ps(lhs3_, rhs3_)); \
    } \
}
159 | 
/*
 * vecscale(y, c, n): y[k] *= c for k in [0, n).
 *
 * y must be 16-byte aligned. Each pass handles 8 floats, so n is expected to
 * be a multiple of 8.
 */
#define vecscale(y, c, n) \
{ \
    int k_; \
    const __m128 factor_ = _mm_set_ps1(c); \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128 lo_ = _mm_load_ps((y)+k_  ); \
        const __m128 hi_ = _mm_load_ps((y)+k_+4); \
        _mm_store_ps((y)+k_  , _mm_mul_ps(lo_, factor_)); \
        _mm_store_ps((y)+k_+4, _mm_mul_ps(hi_, factor_)); \
    } \
}
173 | 
/*
 * vecmul(y, x, n): element-wise product, y[k] *= x[k] for k in [0, n).
 *
 * Both pointers must be 16-byte aligned. Each pass handles 16 floats (all
 * loads happen before the stores), so n is expected to be a multiple of 16.
 */
#define vecmul(y, x, n) \
{ \
    int k_; \
    for (k_ = 0; k_ < (n); k_ += 16) { \
        const __m128 xs0_ = _mm_load_ps((x)+k_   ); \
        const __m128 xs1_ = _mm_load_ps((x)+k_+ 4); \
        const __m128 xs2_ = _mm_load_ps((x)+k_+ 8); \
        const __m128 xs3_ = _mm_load_ps((x)+k_+12); \
        const __m128 ys0_ = _mm_load_ps((y)+k_   ); \
        const __m128 ys1_ = _mm_load_ps((y)+k_+ 4); \
        const __m128 ys2_ = _mm_load_ps((y)+k_+ 8); \
        const __m128 ys3_ = _mm_load_ps((y)+k_+12); \
        _mm_store_ps((y)+k_   , _mm_mul_ps(ys0_, xs0_)); \
        _mm_store_ps((y)+k_+ 4, _mm_mul_ps(ys1_, xs1_)); \
        _mm_store_ps((y)+k_+ 8, _mm_mul_ps(ys2_, xs2_)); \
        _mm_store_ps((y)+k_+12, _mm_mul_ps(ys3_, xs3_)); \
    } \
}
196 | 
197 | 
198 | 
/*
 * __horizontal_sum(r, rw): replace the register r by the sum of its four
 * float lanes (broadcast to every lane). rw is a scratch register.
 */
#if     3 <= __SSE__ || defined(__SSE3__)
/*
    SSE3 variant: two haddps steps fold the four lanes. The work register
    (rw) is left untouched.
 */
#define __horizontal_sum(r, rw) \
    r = _mm_hadd_ps(r, r); \
    r = _mm_hadd_ps(r, r);

#else
/*
    Plain SSE variant: add the register to a copy of itself with the lane
    pairs swapped, then to a copy with neighbouring lanes swapped. The work
    register (rw) receives the intermediate copies.
 */
#define __horizontal_sum(r, rw) \
    rw = r; \
    r = _mm_add_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(1, 0, 3, 2)), rw); \
    rw = r; \
    r = _mm_add_ps(_mm_shuffle_ps(r, r, _MM_SHUFFLE(2, 3, 0, 1)), rw);

#endif
221 | 
/*
 * vecdot(s, x, y, n): *s = sum over k in [0, n) of x[k] * y[k].
 *
 * Two independent accumulators collect the products of the lower and upper
 * quad of each 8-float pass; they are merged and reduced with
 * __horizontal_sum(). x and y must be 16-byte aligned; n is expected to be a
 * multiple of 8.
 */
#define vecdot(s, x, y, n) \
{ \
    int k_; \
    __m128 acc0_ = _mm_setzero_ps(); \
    __m128 acc1_ = _mm_setzero_ps(); \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128 xs0_ = _mm_load_ps((x)+k_  ); \
        const __m128 xs1_ = _mm_load_ps((x)+k_+4); \
        const __m128 ys0_ = _mm_load_ps((y)+k_  ); \
        const __m128 ys1_ = _mm_load_ps((y)+k_+4); \
        acc0_ = _mm_add_ps(acc0_, _mm_mul_ps(xs0_, ys0_)); \
        acc1_ = _mm_add_ps(acc1_, _mm_mul_ps(xs1_, ys1_)); \
    } \
    acc0_ = _mm_add_ps(acc0_, acc1_); \
    __horizontal_sum(acc0_, acc1_); \
    _mm_store_ss((s), acc0_); \
}
242 | 
/*
 * vec2norm(s, x, n): *s = approximately sqrt(sum over k in [0, n) of x[k]^2).
 *
 * The square root is not computed with a sqrt instruction. Starting from the
 * hardware reciprocal-square-root estimate r of the sum S, one refinement
 * step r' = 1.5*r - 0.5*S*r^3 is applied and the result is multiplied by S
 * (S * 1/sqrt(S) = sqrt(S)). A sum of exactly zero is not special-cased.
 *
 * x must be 16-byte aligned; n is expected to be a multiple of 8.
 */
#define vec2norm(s, x, n) \
{ \
    int k_; \
    __m128 acc0_ = _mm_setzero_ps(); \
    __m128 acc1_ = _mm_setzero_ps(); \
    __m128 sum_, rs_, corr_; \
    for (k_ = 0; k_ < (n); k_ += 8) { \
        const __m128 lo_ = _mm_load_ps((x)+k_  ); \
        const __m128 hi_ = _mm_load_ps((x)+k_+4); \
        acc0_ = _mm_add_ps(acc0_, _mm_mul_ps(lo_, lo_)); \
        acc1_ = _mm_add_ps(acc1_, _mm_mul_ps(hi_, hi_)); \
    } \
    acc0_ = _mm_add_ps(acc0_, acc1_); \
    __horizontal_sum(acc0_, acc1_); \
    sum_ = acc0_; \
    rs_ = _mm_rsqrt_ss(sum_); \
    corr_ = _mm_mul_ss(_mm_mul_ss(rs_, rs_), rs_); \
    corr_ = _mm_mul_ss(corr_, sum_); \
    corr_ = _mm_mul_ss(corr_, _mm_set_ss(-0.5f)); \
    rs_ = _mm_add_ss(_mm_mul_ss(rs_, _mm_set_ss(1.5f)), corr_); \
    _mm_store_ss((s), _mm_mul_ss(rs_, sum_)); \
}
271 | 
/*
 * vec2norminv(s, x, n): *s = approximately 1 / sqrt(sum over k in [0, n) of
 * x[k]^2), i.e. the reciprocal of the Euclidean norm of x.
 *
 * Starting from the hardware reciprocal-square-root estimate r of the sum S,
 * one refinement step r' = 1.5*r - 0.5*S*r^3 is applied. A sum of exactly
 * zero is not special-cased.
 *
 * x must be 16-byte aligned; n is expected to be a multiple of 8.
 */
#define vec2norminv(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    /* Each pass consumes 8 floats (two 4-lane loads), so the index must */ \
    /* advance by 8; a stride of 16 would skip every second group of 8.  */ \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM2 = XMM0; \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    _mm_store_ss((s), XMM3); \
}
299 | 


--------------------------------------------------------------------------------
/pydca/plmdca/msa_numerics.py:
--------------------------------------------------------------------------------
  1 | import numpy as np 
  2 | from numba import jit 
  3 | from numba import prange as parallel_range
  4 | 
  5 | """Computes the direct information (DI) for pseudolikelihood maximization direct
  6 | coupling analysis.
  7 | 
  8 | Author: Mehari B. Zerihun
  9 | """
 10 | 
 11 | 
 12 | @jit(nopython=True, parallel=True)
 13 | def compute_sequences_weight(alignment_data=None, sequence_identity=None):
 14 |     """Computes weight of sequences. The weights are calculated by lumping
 15 |     together sequences whose identity is greater that a particular threshold.
 16 |     For example, if there are m similar sequences, each of them will be assigned
 17 |     a weight of 1/m. Note that the effective number of sequences is the sum of
 18 |     these weights.
 19 | 
 20 |     Parameters
 21 |     ----------
 22 |         alignmnet_data : np.array()
 23 |             Numpy 2d array of the alignment data, after the alignment is put in
 24 |             integer representation
 25 |         sequence_identity : float
 26 |             Value at which beyond this sequences are considered similar. Typical
 27 |             values could be 0.7, 0.8, 0.9 and so on
 28 | 
 29 |     Returns
 30 |     -------
 31 |         seqs_weight : np.array()
 32 |             A 1d numpy array containing computed weights. This array has a size
 33 |             of the number of sequences in the alignment data.
 34 |     """
 35 |     alignment_shape = alignment_data.shape
 36 |     num_seqs = alignment_shape[0]
 37 |     seqs_len = alignment_shape[1]
 38 |     seqs_weight = np.zeros((num_seqs,), dtype=np.float64)
 39 |     #count similar sequences
 40 |     for i in parallel_range(num_seqs):
 41 |         seq_i = alignment_data[i]
 42 |         for j in range(num_seqs):
 43 |             seq_j = alignment_data[j]
 44 |             iid = np.sum(seq_i==seq_j)
 45 |             if np.float64(iid)/np.float64(seqs_len) > sequence_identity:
 46 |                 seqs_weight[i] += 1
 47 |     #compute the weight of each sequence in the alignment
 48 |     for i in range(num_seqs): seqs_weight[i] = 1.0/float(seqs_weight[i])
 49 |     return seqs_weight
 50 | 
 51 | 
 52 | @jit(nopython=True)
 53 | def compute_single_site_freqs(alignment_data=None,
 54 |         num_site_states=None, seqs_weight=None):
 55 |     """Computes single site frequency counts for a particular aligmnet data.
 56 | 
 57 |     Parameters
 58 |     ----------
 59 |         alignment_data : np.array()
 60 |             A 2d numpy array of alignment data represented in integer form.
 61 | 
 62 |         num_site_states : int
 63 |             An integer value fo the number of states a sequence site can have
 64 |             including a gap state. Typical value is 5 for RNAs and 21 for
 65 |             proteins.
 66 | 
 67 |         seqs_weight : np.array()
 68 |             A 1d numpy array of sequences weight
 69 | 
 70 |     Returns
 71 |     -------
 72 |         single_site_freqs : np.array()
 73 |             A 2d numpy array of of data type float64. The shape of this array is
 74 |             (seqs_len, num_site_states) where seqs_len is the length of sequences
 75 |             in the alignment data.
 76 |     """
 77 |     alignment_shape = alignment_data.shape
 78 |     #num_seqs = alignment_shape[0]
 79 |     seqs_len = alignment_shape[1]
 80 |     m_eff = np.sum(seqs_weight)
 81 |     single_site_freqs = np.zeros(shape = (seqs_len, num_site_states),
 82 |         dtype = np.float64)
 83 |     for i in range(seqs_len):
 84 |         for a in range(1, num_site_states + 1):#we need gap states single site freqs too
 85 |             column_i = alignment_data[:,i]
 86 |             freq_ia = np.sum((column_i==a)*seqs_weight)
 87 |             single_site_freqs[i, a-1] = freq_ia/m_eff
 88 |     return single_site_freqs
 89 | 
 90 | 
 91 | @jit(nopython=True)
 92 | def get_reg_single_site_freqs(single_site_freqs = None, seqs_len = None,
 93 |         num_site_states = None, pseudocount = None):
 94 |     """Regularizes single site frequencies.
 95 | 
 96 |     Parameters
 97 |     ----------
 98 |         single_site_freqs : np.array()
 99 |             A 2d numpy array of single site frequencies of shape
100 |             (seqs_len, num_site_states). Note that gap state frequencies are
101 |             included in this data.
102 |         seqs_len : int
103 |             The length of sequences in the alignment data
104 |         num_site_states : int
105 |             Total number of states that a site in a sequence can accommodate. It
106 |             includes gap states.
107 |         pseudocount : float
108 |             This is the value of the relative pseudo count of type float.
109 |             theta = lambda/(meff + lambda), where meff is the effective number of
110 |             sequences and lambda is the real pseudo count.
111 | 
112 |     Returns
113 |     -------
114 |         reg_single_site_freqs : np.array()
115 |             A 2d numpy array of shape (seqs_len, num_site_states) of single site
116 |             frequencies after they are regularized.
117 |     """
118 |     reg_single_site_freqs = single_site_freqs
119 |     theta_by_q = np.float64(pseudocount)/np.float64(num_site_states)
120 |     for i in range(seqs_len):
121 |         for a in range(num_site_states):
122 |             reg_single_site_freqs[i, a] = theta_by_q + \
123 |                 (1.0 - pseudocount)*reg_single_site_freqs[i, a]
124 |     return reg_single_site_freqs
125 | 
126 | 
@jit(nopython=True)
def slice_couplings(couplings = None, site_pair=None, num_site_states=None, seqs_len = None):
    """Constructs the couplings array of one site pair, suitable for computing
    two-site-model fields as well as DI scores.

    Parameters
    ----------
        couplings : np.array
            A 1d array of couplings excluding gap state couplings. It stores
            one block of (q-1)*(q-1) values per site pair, with the pairs
            ordered (0, 1), (0, 2), ..., (0, L-1), (1, 2), ..., (L-2, L-1).
        site_pair : tuple
            Site pair (i, j) such that j > i with 0 <= i < seqs_len
        num_site_states : int
            Number of site states for sequence
        seqs_len : int
            Length of sequences in MSA data

    Returns
    -------
        couplings_ij : np.array
            A 2d array of shape (q, q). The upper-left (q-1, q-1) block holds
            the couplings of the pair; the last row and column (the gap
            state) are zero.
    """
    i, j = site_pair[0], site_pair[1]
    q = num_site_states
    qm1 = q - 1
    # Position of pair (i, j) in the flattened upper-triangle ordering. Both
    # products are of consecutive integers, so the floor divisions are exact.
    pairs_total = seqs_len * (seqs_len - 1) // 2
    pairs_from_i = (seqs_len - i) * ((seqs_len - i) - 1) // 2
    pair_loc = pairs_total - pairs_from_i + j - i - 1
    start_indx = pair_loc * qm1 * qm1
    end_indx = start_indx + qm1 * qm1
    couplings_ij = np.zeros((q, q), dtype = np.float64)
    couplings_tmp = couplings[start_indx:end_indx]
    # The target shape is passed positionally: the keyword for it is not the
    # same across NumPy versions.
    couplings_ij[:qm1, :qm1] = np.reshape(couplings_tmp, (qm1, qm1))
    return couplings_ij
153 | 
154 | 
@jit(nopython=True)
def compute_two_site_model_fields(couplings = None, reg_fi = None,
        seqs_len = None, num_site_states = None):
    """Computes two-site model fields iteratively.

    For every site pair (i, j) with j > i the fields of both sites are
    updated in turn, starting from uniform values 1/q, until the largest
    absolute change of any field drops to the tolerance of 1.0e-4 or below.
    There is no cap on the number of iterations.

    Parameters
    ----------
        couplings : np.array
            Couplings array. It is handed unchanged to slice_couplings(),
            which extracts the (q, q) block of each site pair.

        reg_fi : np.array
            A numpy array of regularized single site frequencies of shape
            (seqs_len, num_site_states)

        seqs_len : int
            Length of sequences in alignment data

        num_site_states : int
            Total number of states a site in a sequence can accommodate,
            including gap state.

    Returns
    -------
        two_site_model_fields : np.array
            A numpy array of shape (P, 2, num_site_states), where P is the number
            of unique site pairs excluding self pairings,
            P = seqs_len * (seqs_len - 1)/2. Index [p][0] holds the fields of
            the first site of pair p and [p][1] those of the second site.
    """
    q = num_site_states
    TOLERANCE = 1.0e-4
    total_pairs = np.int64(seqs_len * (seqs_len - 1)/2)
    two_site_model_fields = np.zeros((total_pairs, 2, q), dtype=np.float64)
    pair_counter = 0
    for i in range(seqs_len - 1):
        freq_i = np.reshape(reg_fi[i], (q, 1))
        for j in range(i + 1, seqs_len):
            freq_j = np.reshape(reg_fi[j], (q, 1))
            exp_couplings = np.exp(slice_couplings(couplings = couplings,
                site_pair = (i, j), num_site_states = q, seqs_len=seqs_len)
            )
            exp_couplings_t = np.transpose(exp_couplings)
            # Uniform starting point for both sites of the pair.
            fields_i_old = np.full((q, 1), 1.0/np.float64(q))
            fields_j_old = np.full((q, 1), 1.0/np.float64(q))
            max_fields_change = 10.0
            while max_fields_change > TOLERANCE:
                # Both updates use the fields of the previous sweep.
                fields_i_new = freq_i / np.dot(exp_couplings, fields_j_old)
                fields_i_new /= np.sum(fields_i_new)
                fields_j_new = freq_j / np.dot(exp_couplings_t, fields_i_old)
                fields_j_new /= np.sum(fields_j_new)

                change_i = np.max(np.absolute(fields_i_new - fields_i_old))
                change_j = np.max(np.absolute(fields_j_new - fields_j_old))
                max_fields_change = np.max(np.array([change_i, change_j]))

                fields_i_old = fields_i_new
                fields_j_old = fields_j_new
            # Store the converged fields of this pair.
            two_site_model_fields[pair_counter][0] = fields_i_new.T
            two_site_model_fields[pair_counter][1] = fields_j_new.T
            pair_counter += 1
    return two_site_model_fields
221 | 
222 | 
@jit(nopython=True)
def compute_direct_info(couplings = None, fields_ij = None, reg_fi = None,
        seqs_len = None, num_site_states = None):
    """Computes the direct information (DI) of every site pair from the
    direct probabilities of the two-site model.

    Parameters
    ----------
        couplings : np.array
            Couplings array. It is handed unchanged to slice_couplings(),
            which extracts the (q, q) block of each site pair with the gap
            state row and column set to zero.

        fields_ij : np.array
            A 3d numpy array of two-site model fields of shape (P, 2, q),
            where P is the number of unique site pairs and q the total number
            of site states. The ordering of site pairs matters: index 0
            refers to pair (0, 1), followed by (0, 2), ..., (0, L-1),
            (1, 2), ..., (L-2, L-1). fields_ij[p][0] holds the fields of the
            first site of pair p and fields_ij[p][1] those of the second.

        reg_fi : np.array
            A 2d numpy array of regularized single site frequencies of shape
            (L, q), where L is the length of the sequences in the alignment
            and q the total number of site states. reg_fi[0] contains the
            frequencies of the first column of the MSA.

        seqs_len : int
            The length of sequences in MSA.

        num_site_states : int
            The total number of residues plus gap.

    Returns
    -------
        unsorted_DI : np.array
            A 1d numpy array of shape (P, ) with the direct information of
            every unique site pair, in the same pair ordering as fields_ij.
            The direct probability Pdir is normalized over all (q, q) states,
            but only the residue block Pdir[:q-1, :q-1] enters the DI sum.
    """
    q = num_site_states
    EPSILON = 1.0e-20
    total_pairs = np.int64(seqs_len * (seqs_len - 1)/2)
    unsorted_DI = np.zeros(total_pairs, dtype=np.float64)
    pair_counter = 0
    for i in range(seqs_len - 1):
        freq_col_i = np.reshape(reg_fi[i], (q, 1))
        for j in range(i + 1, seqs_len):
            freq_col_j = np.reshape(reg_fi[j], (q, 1))
            # Outer product of the two-site model fields of both sites.
            fields_col_i = np.reshape(fields_ij[pair_counter][0], (q, 1))
            fields_col_j = np.reshape(fields_ij[pair_counter][1], (q, 1))
            fields_outer = np.dot(fields_col_i, np.transpose(fields_col_j))

            exp_couplings = np.exp(slice_couplings(couplings = couplings,
                site_pair = (i, j), num_site_states = q, seqs_len = seqs_len)
            )
            # Direct probability of the pair, normalized over all states.
            pdir_ij = exp_couplings * fields_outer
            pdir_ij /= np.sum(pdir_ij)
            # Product of the single site frequencies.
            freq_outer = np.dot(freq_col_i, np.transpose(freq_col_j))
            # Only residue-residue entries contribute to the DI; adding
            # EPSILON creates copies and keeps the logarithm finite.
            freq_residues = freq_outer[:q-1, :q-1] + EPSILON
            pdir_residues = pdir_ij[:q-1, :q-1] + EPSILON
            ratio_residues = pdir_residues/freq_residues
            unsorted_DI[pair_counter] = np.sum(
                pdir_residues * np.log(ratio_residues)
            )
            pair_counter += 1

    return unsorted_DI
312 | 
313 | 


--------------------------------------------------------------------------------
/pydca/plmdca/plmdcaBackend.cpp:
--------------------------------------------------------------------------------
#include "include/plmdca.h"
#include <limits>
  2 | 
  3 | /*Implements the pseudolikelihood maximization direct couplings analysis 
  4 | for protein and RNA sequences.
  5 | 
  6 | Authors: Mehari B. Zerihun, Fabrizio Pucci
  7 | 
  8 | */
  9 | 
 10 | 
 11 | class ObjectiveFunction{
 12 |     /*Objective Function for lbfgs input. 
 13 |         
 14 |     Attributes
 15 |     ----------
 16 |         m_x     : A dynamic array containing fields and couplings.
 17 |         plmdca_inst : PlmDCA 
 18 |         m_verbose : bool 
 19 |         m_max_iterations : unsigned int 
 20 |     */
 21 |    
 22 |     protected:
 23 |         float* m_x;
 24 |         bool m_verbose;
 25 |         unsigned int m_max_iterations;
 26 |         PlmDCA plmdca_inst;
 27 |         
 28 |     public:
 29 |         ObjectiveFunction(unsigned short const biomolecule, unsigned short const num_site_states, 
 30 |             const char* msa_file, unsigned int const seqs_len, float const seqid, float const lambda_h, 
 31 |             float const lambda_J, unsigned int const max_iterations, const unsigned int num_threads, bool verbose):
 32 |             m_x(NULL), 
 33 |             m_verbose(verbose), 
 34 |             m_max_iterations(max_iterations), 
 35 |             plmdca_inst(msa_file, biomolecule, seqs_len, num_site_states, seqid, lambda_h, lambda_J, num_threads)
 36 |         {
 37 |             // ObjectiveFunction constructor body
 38 |         }        
 39 | 
 40 | 
 41 |         float* getFieldsAndCouplings() 
 42 |         {
 43 |             return this->m_x; 
 44 |         }
 45 | 
 46 | 
 47 |         int run(int N)
 48 |         {
 49 |             /*Performs plmDCA computation using LBFGS optimization.
 50 | 
 51 |             Parameters
 52 |             ----------
 53 |                 N       : Total number of fields and couplings. 
 54 | 
 55 |             Returns
 56 |             -------
 57 |                 ret     : Exit status of LBFGS optimization.
 58 |             */
 59 |         
 60 |             float fx;
 61 |             this->m_x = lbfgs_malloc(N);
 62 | 
 63 |             if (this->m_x == NULL) {
 64 |                 printf("ERROR: Failed to allocate a memory block for variables.\n");
 65 |                 return 1;
 66 |             }
 67 |             //initialize parameters
 68 |             lbfgs_parameter_t param;
 69 |             lbfgs_parameter_init(&param);
 70 |             param.epsilon = 1E-3;
 71 |             param.max_iterations = this->m_max_iterations;
 72 |             param.max_linesearch = 5;
 73 |             param.ftol = 1E-4;
 74 |             //param.wolfe = 0.2;
 75 |             param.m = 5 ;
 76 | 
 77 |             this->plmdca_inst.initFieldsAndCouplings(m_x);
 78 |             //Start the L-BFGS optimization; this will invoke the callback functions
 79 |             //evaluate() and progress() when necessary.
 80 |                         
 81 |             int ret = lbfgs(N, m_x, &fx, _evaluate, _progress, this, &param);
 82 |             // return status value ret == -1001 corresponds with convergence for a given precision
 83 |             /* Report the result. */
 84 |             if(this->m_verbose){
 85 |                 if (ret==-1001){
 86 |                     fprintf(stderr, "L-BFGS optimization completed\n");
 87 |                 }else{
 88 |                     fprintf(stderr, "L-BFGS optimization terminated with status code = %d\n", ret);
 89 |                     fprintf(stderr, "fx = %f\n", fx);
 90 |                 }
 91 |             }
 92 |         
 93 |             return ret;
 94 |         }
 95 | 
 96 | 
 97 |     protected:
 98 |         static float _evaluate( void* instance, const float*x, float* g, const int n, const float step)
 99 |         {
100 |             /*Computes the gradient of the regularized negative pseudolikelihood function for 
101 |             protein/RNA alignments. 
102 | 
103 |             Parameters
104 |             ----------
105 |                 instance    : An instance of ObjectiveFunction class. 
106 |                 x           : Array of fields and couplings.
107 |                 g           : Array of gradient of the negative log pseudolikelihood
108 |                     of the conditional probablity for protein/RNA alignments.
109 |                 n           : Number of fields and couplings?
110 |                 step        : The step size for gradient decent.
111 | 
112 |             Returns
113 |             --------
114 |                 fx          : Value of plmDCA objective function
115 | 
116 |             */
117 | 
118 |             return reinterpret_cast<ObjectiveFunction*>(instance)->evaluate(x, g, n, step);
119 |         }
120 | 
121 | 
122 |         float evaluate(const float*x, float* g, const int n, const float step)
123 |         {
124 |             float fx;
125 |             fx = this->plmdca_inst.gradient(x, g);
126 |             return fx;
127 |         }
128 | 
129 | 
130 |         static int _progress(void* instance, const float* x, const float* g, const float fx, 
131 |             const float xnorm, const float gnorm, const float step, int n, int k, int ls)
132 |         {
133 |             return reinterpret_cast<ObjectiveFunction*>(instance)->progress(x, g, fx, xnorm, gnorm, step, n, k, ls);
134 |         }
135 | 
136 | 
137 |         int progress(const float* x, const float* g, const float fx, const float xnorm, const float gnorm,
138 |             const float step, int n, int k, int ls)
139 |         {
140 |             if(this->m_verbose){
141 |                 fprintf(stderr, "Iteration %d:\n", k);
142 |                 fprintf(stderr, "fx = %f, xnorm = %f, gnorm = %f, step = %f\n", fx, xnorm, gnorm, step);
143 |                 fprintf(stderr, "\n");
144 |             }
145 |             return 0;
146 |         }
147 | };
148 |     
149 | 
150 | 
151 | extern "C" float* plmdcaBackend(unsigned short const biomolecule, 
152 |     unsigned short const num_site_states, 
153 |     const char* msa_file, unsigned int const seqs_len, 
154 |     float const seqid, float const lambda_h, 
155 |     float const lambda_J, unsigned int const max_iteration, 
156 |     const unsigned int num_threads, bool verbose )
157 | {  
158 |     /*Interface for the Python implementation of plmDCA. 
159 | 
160 |     Parameters
161 |     ----------
162 |         biomolecule     : Type of biomolecule (protein or RNA).
163 |         num_site_states : Number of states/residues plus gap for MSA data.
164 |         msa_file        : Path to the FASTA formatted MSA file.
165 |         seqs_len        : The length of sequences in MSA data.
166 |         seqid           : Sequence identity threshold.
167 |         lambda_h        : Regularization parameter for fields.
168 |         lambda_J        : Regularization parameter for couplings.
169 |         max_iteration   : Maximum number of gradient decent iterations.
170 |         num_threads     : Number of threads for PlmDCA (when OpenMP is supported).
171 |         verbose         : Print logging message on the terminal.
172 | 
173 |     Returns
174 |     -------
175 |         h_and_J        : Fields and couplings array. This data is fetched into the
176 |             Python interface. 
177 | 
178 |     */
179 |    #if defined(_OPENMP)
180 |     // can use multiple threads
181 |     #else 
182 |         if(num_threads > 1){
183 |             std::cerr << "Cannot set multiple threads when OpenMP is not supported\n";
184 |             throw std::runtime_error("Invalid number of threads");
185 |         }
186 |     #endif
187 | 
188 |     const int total_num_params = seqs_len * num_site_states + seqs_len * (seqs_len - 1) * num_site_states * num_site_states/2 ; 
189 | 
190 |     // Start computation 
191 |     ObjectiveFunction objfun_inst(biomolecule, num_site_states, msa_file, seqs_len, seqid, 
192 |         lambda_h, lambda_J, max_iteration, num_threads, verbose
193 |     );
194 | 
195 |     //const int N = total_num_params;
196 | 
197 |     objfun_inst.run(total_num_params);
198 |     auto h_and_J = objfun_inst.getFieldsAndCouplings();
199 | 
200 |     return h_and_J;
201 | }
202 | 
203 | 
204 | extern "C" void freeFieldsAndCouplings(void * h_and_J)
205 | {  
206 |     /*  Frees memory that has been used to store fields and couplings before 
207 |         they are captured in the Python interface.
208 | 
209 |         Parameters
210 |         ----------
211 |             h_and_J : Pointer to the fields and couplings vector 
212 |         
213 |         Returns
214 |         -------
215 |             void    : No return value
216 | 
217 |     */
218 |    float* h_and_J_casted = static_cast<float*>(h_and_J);  
219 |     if(h_and_J_casted !=nullptr){
220 |         delete [] h_and_J_casted;
221 |         h_and_J_casted = nullptr;
222 |     }
223 | }
224 |     


--------------------------------------------------------------------------------
/pydca/plmdca_main.py:
--------------------------------------------------------------------------------
  1 | from pydca.plmdca import plmdca 
  2 | from pydca.sequence_backmapper.sequence_backmapper import SequenceBackmapper
  3 | from pydca.fasta_reader.fasta_reader import get_alignment_from_fasta_file
  4 | from pydca.dca_utilities import dca_utilities
  5 | from argparse import ArgumentParser
  6 | import logging
  7 | import sys
  8 | import os
  9 | 
 10 | 
 11 | """Top level module for plmDCA. Defines command line interface and 
 12 | configures logging.
 13 | 
 14 | Author: Mehari B. Zerihun
 15 | """
 16 | 
 17 | def configure_logging():
 18 |     """Configures logging. When configured, the logging level is INFO and
 19 |     messages are logged to stream handler. Log level name are colored whenever
 20 |     the terminal supports that. INFO level is Green, WARNING level is Yellow and
 21 |     ERROR level is Red.
 22 |     """
 23 |     from pydca.config_dca.config_log import LOGGING_CONFIG
 24 |     from pydca.config_dca.config_log import ConsoleColor as c_color
 25 |     import logging.config
 26 | 
 27 |     logging.config.dictConfig(LOGGING_CONFIG)
 28 |     logging.addLevelName(logging.INFO, '{}{}{}'.format(
 29 |         c_color.green, logging.getLevelName(logging.INFO), c_color.nocolor))
 30 |     logging.addLevelName(logging.WARNING, '{}{}{}'.format(
 31 |         c_color.yellow, logging.getLevelName(logging.WARNING), c_color.nocolor))
 32 |     logging.addLevelName(logging.ERROR, '{}{}{}'.format(
 33 |         c_color.red, logging.getLevelName(logging.ERROR), c_color.nocolor))
 34 |     return None
 35 | 
 36 | 
class CmdArgs:
    """Names and help texts of the plmDCA command line arguments.

    Attributes without a leading ``--`` are registered as positional arguments
    and those starting with ``--`` as optional arguments by ``run_plm_dca``.
    Each ``*_help`` attribute is the help text shown for its argument.
    """

    # Destination name under which the chosen subcommand is stored.
    subcommand_name = 'subcommand_name'
    # Positional arguments.
    msa_file = 'msa_file'
    msa_file_help = """Multiple sequence alignment (MSA) file in FASTA format.
    """
    biomolecule = 'biomolecule'
    biomolecule_help = """Type of biomolecule. It should be either protein or RNA 
    in lower or upper case letters.
    """
    # Optional arguments.
    refseq_file_optional = '--refseq_file'
    refseq_file_help = """FASTA formatted file containing a reference sequence.
    The reference sequence should not contain gaps or non-standard residues.
    """
    verbose_optional = '--verbose'
    verbose_optional_help = """Show logging information on the terminal.
    """
    apc_optional = '--apc'
    apc_help = """Compute the average product corrected (APC) DCA score.
    """
    subcommand_name_help = """Subcommands destination
    """
    seqid_optional = '--seqid'
    seqid_optional_help = """Cut-off value of sequences similarity above which they
    are lumped together.
    """
    lambda_h_optional = '--lambda_h'
    lambda_h_optional_help = """Value of fields penalizing constant for L2 
    regularization of fields.
    """
    lambda_J_optional = '--lambda_J'
    lambda_J_optional_help = """Value of couplings penalizing constant for L2 
    regularization of couplings.
    """
    max_iterations_optional = '--max_iterations'
    max_iterations_help = """Number of iterations for gradient decent 
    for negative pseudolikelihood minimization.
    """
    num_threads_optional = '--num_threads'
    num_threads_help = "Number of threads from plmDCA computation"
    output_dir_optional = '--output_dir'
    output_dir_help = """Directory path to which output results are written.
    If the directory is not existing, it will be created provided that the user
    has a privilege to do so. If this path is not provided, an output directory
    is created using the base name of the MSA file, with a prefix and/or postfix
    added to it.
    """
    # The remaining options are registered only for the compute_params subcommand.
    ranked_by_optional = '--ranked_by'
    ranked_by_optional_help="""Method in which DCA scores are calculated. There are
    four options: direct information (DI), Frobenius norm (FN) and their average
    product corrected forms (DI_APC, FN_APC).
    """

    linear_dist_optional = '--linear_dist'
    linear_dist_help="""Minimum separation beteween site pairs in sequence. 
    """
    num_site_pairs_optional = '--num_site_pairs'
    num_site_pairs_help = """The maximum number of site pairs whose couplings are 
    to be extracted.
    """
 99 |     
100 | # end of class CmdArgs 
101 | 
# Subcommand names for which execute_from_command_line() prepares the output
# directory and the parameter metadata. Only 'compute_fn', 'compute_di' and
# 'compute_params' have a handling branch there; 'debug' currently has none
# and no parser is registered for it in run_plm_dca().
DCA_COMPUTATION_SUBCOMMANDS = ('compute_fn', 'compute_di', 'compute_params', 'debug')
103 | 
def get_plmdca_inst(biomolecule, msa_file, seqid=None, lambda_h=None, lambda_J=None, 
        max_iterations = None, num_threads=None, verbose=False):
    """Build a PlmDCA object from the given settings and return it.

    The first two arguments are forwarded positionally to ``plmdca.PlmDCA`` in
    the order they are received; all others are forwarded by keyword.

    NOTE(review): ``execute_from_command_line`` in this module calls this
    function with the MSA file path first and the biomolecule type second, so
    the names of the first two parameters appear swapped relative to that
    call. Confirm against ``PlmDCA.__init__`` before renaming or reordering.

    Parameters
    ----------
        biomolecule : str
            First positional argument passed on to PlmDCA.
        msa_file : str
            Second positional argument passed on to PlmDCA.
        seqid : float
            Sequences identity cut-off value.
        lambda_h : float
            Value of fields penalizing constant.
        lambda_J : float
            Value of couplings penalizing constant.
        max_iterations : int
            Number of iterations for gradient descent.
        num_threads : int
            Number of threads forwarded to PlmDCA.
        verbose : bool
            Verbosity flag forwarded to PlmDCA.

    Returns
    -------
        plmdca_inst : PlmDCA
            An instance of PlmDCA class
    """
    keyword_settings = dict(
        seqid=seqid,
        lambda_h=lambda_h,
        lambda_J=lambda_J,
        max_iterations=max_iterations,
        num_threads=num_threads,
        verbose=verbose,
    )
    return plmdca.PlmDCA(biomolecule, msa_file, **keyword_settings)
134 | 
135 | 
def execute_from_command_line(biomolecule, msa_file, the_command = None, 
    refseq_file = None, seqid = None, lambda_h = None, lambda_J = None, 
    max_iterations = None,  apc = False ,verbose = False, output_dir = None,
    num_threads = None, ranked_by = None, linear_dist=None, num_site_pairs=None):
    """Run one plmDCA subcommand and write its results to the output directory.

    A PlmDCA instance is always created. Output is produced only when
    ``the_command`` is one of ``DCA_COMPUTATION_SUBCOMMANDS``; of those,
    ``compute_fn``, ``compute_di`` and ``compute_params`` are handled here.

    Parameters
    ----------
        biomolecule : str
            Type of biomolecule the MSA data represents.
        msa_file : str
            Path to FASTA formatted MSA file.
        the_command : str
            Name of subcommand for plmDCA.
        refseq_file : str
            Path to reference sequence file. When given, a SequenceBackmapper
            is built and passed to the score/parameter computations.
        seqid : float
            Sequences identity cut-off value.
        lambda_h : float
            Value of fields penalizing constant.
        lambda_J : float
            Value of couplings penalizing constant.
        max_iterations : int
            Number of iterations for gradient descent.
        apc : bool
            Select the average product corrected variant of the FN/DI scores.
        verbose : bool
            When true, logging is configured before the computation starts.
        output_dir : str
            Directory where results are saved. When empty,
            ``'PLMDCA_output_' + <MSA file base name>`` is used.
        num_threads : int
            Number of threads forwarded to the PlmDCA instance.
        ranked_by : str
            Ranking method forwarded to ``compute_params``. When None,
            ``'FN_APC'`` is recorded in the couplings file metadata.
        linear_dist : int
            Minimum sequence separation forwarded to ``compute_params``. When
            None, ``4`` is recorded in the couplings file metadata.
        num_site_pairs : int
            Maximum number of site pairs forwarded to ``compute_params``.
    """
    if verbose:
        configure_logging()

    # NOTE(review): the MSA path is passed first and the biomolecule second,
    # although get_plmdca_inst() names its parameters in the opposite order.
    # The positional order is kept exactly as it was.
    plmdca_instance = get_plmdca_inst(msa_file, biomolecule, seqid = seqid,
        lambda_h = lambda_h, lambda_J = lambda_J, max_iterations = max_iterations,
        num_threads = num_threads, verbose = verbose
    )

    if the_command not in DCA_COMPUTATION_SUBCOMMANDS:
        return None

    param_metadata = dca_utilities.plmdca_param_metadata(plmdca_instance)
    if not output_dir:
        msa_stem = os.path.splitext(os.path.basename(msa_file))[0]
        output_dir = 'PLMDCA_output_' + msa_stem
    dca_utilities.create_directories(output_dir)

    # Backmapping is done only when a reference sequence file is provided.
    seqbackmapper = None
    if refseq_file:
        seqbackmapper = SequenceBackmapper(
            msa_file = msa_file,
            refseq_file = refseq_file,
            biomolecule = plmdca_instance.biomolecule
        )

    if the_command == 'compute_fn' or the_command == 'compute_di':
        # Pick the description, the scoring method and the output file prefix.
        if the_command == 'compute_fn':
            if apc:
                score_type = 'PLMDCA Frobenius norm, average product corrected (APC)'
                compute_scores = plmdca_instance.compute_sorted_FN_APC
                file_prefix = 'PLMDCA_apc_fn_scores_'
            else:
                score_type = 'PLMDCA Frobenius norm, non-APC (not average product corrected)'
                compute_scores = plmdca_instance.compute_sorted_FN
                file_prefix = 'PLMDCA_raw_fn_scores_'
        else:
            if apc:
                score_type = 'PLMDCA  DI scores, average product corrected (APC)'
                compute_scores = plmdca_instance.compute_sorted_DI_APC
                file_prefix = 'PLMDCA_apc_di_scores_'
            else:
                score_type = 'PLMDCA DI scores, non-APC (not average product corrected)'
                compute_scores = plmdca_instance.compute_sorted_DI
                file_prefix = 'PLMDCA_raw_di_scores_'

        sorted_scores = compute_scores(seqbackmapper=seqbackmapper)
        scores_file_path = dca_utilities.get_dca_output_file_path(output_dir,
            msa_file, prefix = file_prefix, postfix='.txt'
        )
        dca_utilities.write_sorted_dca_scores(scores_file_path, sorted_scores,
            metadata = param_metadata,
            score_type = score_type
        )
    elif the_command == 'compute_params':
        fields, couplings = plmdca_instance.compute_params(
            seqbackmapper=seqbackmapper,
            ranked_by=ranked_by,
            linear_dist=linear_dist,
            num_site_pairs=num_site_pairs,
        )

        # Fields file: shared metadata plus the number of sites.
        fields_file_path = dca_utilities.get_dca_output_file_path(output_dir,
            msa_file, prefix = 'fields_', postfix='.txt'
        )
        param_metadata.append(
            '#\tTotal number of sites whose fields are extracted: {}'.format(len(fields))
        )
        dca_utilities.write_fields_csv(fields_file_path, fields, metadata=param_metadata)

        # Couplings file: the fields-only line is dropped and replaced by the
        # couplings specific lines.
        couplings_file_path = dca_utilities.get_dca_output_file_path(output_dir,
            msa_file, prefix = 'couplings_', postfix='.txt'
        )
        param_metadata.pop()
        param_metadata.append(
            '#\tTotal number of site pairs whose couplings are extracted: {}'.format(len(couplings))
        )
        if ranked_by is None:
            ranked_by = 'FN_APC'
        param_metadata.append('#\tDCA ranking method used: {}'.format(ranked_by))
        if linear_dist is None:
            linear_dist = 4
        param_metadata.append(
            '#\tMinimum separation beteween site pairs in sequence: |i - j| > {}'.format(linear_dist)
        )
        dca_utilities.write_couplings_csv(couplings_file_path, couplings, metadata=param_metadata)

    return None
257 | 
258 | 
def run_plm_dca():
    """Parse the plmDCA command line and hand over to execute_from_command_line.

    Three subcommands are registered: ``compute_fn``, ``compute_di`` and
    ``compute_params``. When no command line arguments are given at all, the
    top level help message is requested instead.
    """
    def add_shared_arguments(subparser, with_apc):
        # Registers the arguments common to every subcommand. The order is
        # fixed; --apc, when requested, sits between --verbose and --output_dir.
        subparser.add_argument(CmdArgs.biomolecule, help=CmdArgs.biomolecule_help)
        subparser.add_argument(CmdArgs.msa_file, help=CmdArgs.msa_file_help)
        subparser.add_argument(CmdArgs.seqid_optional, help=CmdArgs.seqid_optional_help, type=float)
        subparser.add_argument(CmdArgs.lambda_h_optional, help=CmdArgs.lambda_h_optional_help, type=float)
        subparser.add_argument(CmdArgs.lambda_J_optional, help=CmdArgs.lambda_J_optional_help, type=float)
        subparser.add_argument(CmdArgs.max_iterations_optional, help=CmdArgs.max_iterations_help, type=int)
        subparser.add_argument(CmdArgs.num_threads_optional, help=CmdArgs.num_threads_help, type=int)
        subparser.add_argument(CmdArgs.refseq_file_optional, help=CmdArgs.refseq_file_help)
        subparser.add_argument(CmdArgs.verbose_optional, help=CmdArgs.verbose_optional_help, action='store_true')
        if with_apc:
            subparser.add_argument(CmdArgs.apc_optional, help=CmdArgs.apc_help, action='store_true')
        subparser.add_argument(CmdArgs.output_dir_optional, help=CmdArgs.output_dir_help)

    parser = ArgumentParser()
    subparsers = parser.add_subparsers(dest = CmdArgs.subcommand_name)

    # Subcommand compute_fn
    fn_parser = subparsers.add_parser('compute_fn',
        help='Computes DCA scores summarized by the Frobenius norm of couplings.'
        'Typically usage is plmdca compute_fn <biomolecule> <msa_file> --max_iterations <ni> '
        '--num_threads <nt> --apc --verbose, where <biomolecule> takes rna or protein '
        '(case insensitive), <msa_file> is fasta formatted multiple sequence alignment '
        'file, <ni> is the number of maximum gradient decent iterations, <nt> is the number '
        'of threads (if OpenMP) is supported, --apc performs average product correction of '
        'DCA scores and --verbose triggers logging messages to be displayed on the screen. '
        'Help message can be obtained using: plmdca compute_fn --help'
    )
    add_shared_arguments(fn_parser, with_apc=True)

    # Subcommand compute_di
    di_parser = subparsers.add_parser('compute_di',
        help='Computes DCA scores summarized by direct information. '
        'Typical usage is similar to the command compute_fn. '
        'Help message can be obtained using: plmdca compute_di --help'
    )
    add_shared_arguments(di_parser, with_apc=True)

    # Subcommand compute_params: no --apc, but three extra options.
    params_parser = subparsers.add_parser('compute_params',
        help='Computes the fields and couplings of the conditional probability model for '
        'pseudolikelihood maximimization direct coupling anlysis (plmDA). Typical usage is '
        'plmdca compute_params <biomolecule> <msa_file> --max_iterations <ni> --num_threads '
        '<nt> --ranked_by fn_apc --verbose. This command computes fields and couplings, with '
        'the couplings ranked by Frobenius norm average product corrected DCA score. '
        'Help message can be obtained using: plmdca compute_params --help'
    )
    add_shared_arguments(params_parser, with_apc=False)
    params_parser.add_argument(CmdArgs.ranked_by_optional, help=CmdArgs.ranked_by_optional_help,
        choices= ('FN', 'FN_APC', 'DI', 'DI_APC', 'fn', 'fn_apc', 'di', 'di_apc')
    )
    params_parser.add_argument(CmdArgs.linear_dist_optional, help=CmdArgs.linear_dist_help, type=int)
    params_parser.add_argument(CmdArgs.num_site_pairs_optional, help=CmdArgs.num_site_pairs_help, type=int)

    # With an empty command line, ask for the help message.
    parsed = parser.parse_args(args=None if sys.argv[1:] else ['--help'])
    options = vars(parsed)

    # Options that a subcommand does not define are forwarded as None.
    forwarded_names = (
        'refseq_file', 'seqid', 'lambda_h', 'lambda_J', 'max_iterations',
        'num_threads', 'apc', 'output_dir', 'verbose', 'ranked_by',
        'linear_dist', 'num_site_pairs',
    )
    forwarded = {name: options.get(name) for name in forwarded_names}
    execute_from_command_line(
        options.get('biomolecule'),
        options.get('msa_file'),
        the_command = options.get('subcommand_name'),
        **forwarded
    )
    return None
349 | 
if __name__ == "__main__":
    # Script entry point: hand control to the command line interface.
    run_plm_dca()


--------------------------------------------------------------------------------
/pydca/sequence_backmapper/__init__.py:
--------------------------------------------------------------------------------
1 | """This module backmappes a reference sequence's sites to the protein/RNA family
2 | multiple sequence alignment columns.
3 | """
4 | 


--------------------------------------------------------------------------------
/pydca/sequence_backmapper/scoring_matrix.py:
--------------------------------------------------------------------------------
 1 | import logging
 2 | 
 3 | """This module defines NUC44 scoring matrix for RNA sequence alignment.
 4 | """
 5 | logger = logging.getLogger(__name__)
 6 | 
# Substitution scores for pairs of the four unambiguous RNA nucleotides
# (A, U, G, C): identical pairs score 5, every mismatch scores -4.
# Keys are (residue, residue) tuples and both orderings of a pair are listed.
NUCLEOTIDE_NUCLEOTIDE_SCORING  = {
    ('A', 'A'): 5, ('A', 'U'): -4, ('A', 'G'): -4, ('A', 'C'): -4,
    ('U', 'U'): 5, ('U', 'A'): -4, ('U', 'G'): -4, ('U', 'C'): -4,
    ('G', 'G'): 5, ('G', 'A'): -4, ('G', 'C'): -4, ('G', 'U'): -4,
    ('C', 'C'): 5, ('C', 'A'): -4, ('C', 'G'): -4, ('C', 'U'): -4,
}
13 | 
14 | 
# Substitution scores for pairs of ambiguity codes
# (S, W, R, Y, K, M, B, V, H, D, N). Keys are (code, code) tuples and the
# table lists every ordered pair, one group of rows per first code.
# NOTE(review): this table is defined here but is not part of the NUC44
# name assigned at the bottom of this module -- confirm whether that is
# intentional.
AMBIGUOUS_AMBIGUOUS_SCORING =  {
    ('S', 'S'): -1, ('S', 'W'): -4, ('S', 'R'): -2, ('S', 'Y'): -2,
    ('S', 'K'): -2, ('S', 'M'): -2, ('S', 'B'): -1, ('S', 'V'): -1,
    ('S', 'H'): -3, ('S', 'D'): -3, ('S', 'N'): -1,

    ('W', 'S'): -4, ('W', 'W'): -1, ('W', 'R'): -2, ('W', 'Y'): -2,
    ('W', 'K'): -2, ('W', 'M'): -2, ('W', 'B'): -3, ('W', 'V'): -3,
    ('W', 'H'): -1, ('W', 'D'): -1, ('W', 'N'): -1,

    ('R', 'S'): -2, ('R', 'W'): -2, ('R', 'R'): -1, ('R', 'Y'): -4,
    ('R', 'K'): -2, ('R', 'M'): -2, ('R', 'B'): -3, ('R', 'V'): -1,
    ('R', 'H'): -3, ('R', 'D'): -1, ('R', 'N'): -1,

    ('Y', 'S'): -2, ('Y', 'W'): -2, ('Y', 'R'): -4, ('Y', 'Y'): -1,
    ('Y', 'K'): -2, ('Y', 'M'): -2, ('Y', 'B'): -1, ('Y', 'V'): -3,
    ('Y', 'H'): -1, ('Y', 'D'): -3, ('Y', 'N'): -1,

    ('K', 'S'): -2, ('K', 'W'): -2, ('K', 'R'): -2, ('K', 'Y'): -2,
    ('K', 'K'): -1, ('K', 'M'): -4, ('K', 'B'): -1, ('K', 'V'): -3,
    ('K', 'H'): -3, ('K', 'D'): -1, ('K', 'N'): -1,

    ('M', 'S'): -2, ('M', 'W'): -2, ('M', 'R'): -2, ('M', 'Y'): -2,
    ('M', 'K'): -4, ('M', 'M'): -1, ('M', 'B'): -3, ('M', 'V'): -1,
    ('M', 'H'): -1, ('M', 'D'): -3, ('M', 'N'): -1,

    ('B', 'S'): -1, ('B', 'W'): -3, ('B', 'R'): -3, ('B', 'Y'): -1,
    ('B', 'K'): -1, ('B', 'M'): -3, ('B', 'B'): -1, ('B', 'V'): -2,
    ('B', 'H'): -2, ('B', 'D'): -2, ('B', 'N'): -1,

    ('V', 'S'): -1, ('V', 'W'): -3, ('V', 'R'): -1, ('V', 'Y'): -3,
    ('V', 'K'): -3, ('V', 'M'): -1, ('V', 'B'): -2, ('V', 'V'): -1,
    ('V', 'H'): -2, ('V', 'D'): -2, ('V', 'N'): -1,

    ('H', 'S'): -3, ('H', 'W'): -1, ('H', 'R'): -3, ('H', 'Y'): -1,
    ('H', 'K'): -3, ('H', 'M'): -1, ('H', 'B'): -2, ('H', 'V'): -2,
    ('H', 'H'): -1, ('H', 'D'): -2, ('H', 'N'): -1,

    ('D', 'S'): -3, ('D', 'W'): -1, ('D', 'R'): -1, ('D', 'Y'): -3,
    ('D', 'K'): -1, ('D', 'M'): -3, ('D', 'B'): -2, ('D', 'V'): -2,
    ('D', 'H'): -2, ('D', 'D'): -1, ('D', 'N'): -1,

    ('N', 'S'): -1, ('N', 'W'): -1, ('N', 'R'): -1, ('N', 'Y'): -1,
    ('N', 'K'): -1, ('N', 'M'): -1, ('N', 'B'): -1, ('N', 'V'): -1,
    ('N', 'H'): -1, ('N', 'D'): -1, ('N', 'N'): -1,

}
61 | 
# Substitution scores between an unambiguous nucleotide (A, U, G, C) and an
# ambiguity code (S, W, R, Y, K, M, B, V, H, D, N). The first four groups
# hold (nucleotide, code) keys; the remaining rows hold the mirrored
# (code, nucleotide) keys with the same scores.
# NOTE(review): this table is defined here but is not part of the NUC44
# name assigned at the bottom of this module -- confirm whether that is
# intentional.
NUCLEOTIDE_AMBIGUOUS_SCORING = {
    ('A', 'S'): -4, ('A', 'W'): 1, ('A', 'R'): 1, ('A', 'Y'): -4,
    ('A', 'K'): -4, ('A', 'M'): 1, ('A', 'B'): -4, ('A', 'V'): -1,
    ('A', 'H'): -1, ('A', 'D'): -1, ('A', 'N'): -2,

    ('U', 'S'): -4, ('U', 'W'): 1, ('U', 'R'): -4, ('U', 'Y'): 1,
    ('U', 'K'): 1, ('U', 'M'): -4, ('U', 'B'): -1, ('U', 'V'): -4,
    ('U', 'H'): -1, ('U', 'D'): -1, ('U', 'N'): -2,

    ('G', 'S'): 1, ('G', 'W'): -4, ('G', 'R'): 1, ('G', 'Y'): -4,
    ('G', 'K'): 1, ('G', 'M'): -4, ('G', 'B'): -1, ('G', 'V'): -1,
    ('G', 'H'): -4, ('G', 'D'): -1, ('G', 'N'): -2,

    ('C', 'S'): 1, ('C', 'W'): -4, ('C', 'R'): -4, ('C', 'Y'): 1,
    ('C', 'K'): -4, ('C', 'M'): 1, ('C', 'B'): -1, ('C', 'V'): -1,
    ('C', 'H'): -1, ('C', 'D'): -4, ('C', 'N'): -2,

    ('S', 'A'): -4, ('S', 'U'): -4, ('S', 'G'): 1, ('S', 'C'): 1,
    ('W', 'A'): 1, ('W', 'U'): 1, ('W', 'G'): -4, ('W', 'C'): -4,
    ('R', 'A'): 1, ('R', 'U'): -4, ('R', 'G'): 1, ('R', 'C'): -4,
    ('Y', 'A'): -4, ('Y', 'U'): 1, ('Y', 'G'): -4, ('Y', 'C'): 1,
    ('K', 'A'): -4, ('K', 'U'): 1, ('K', 'G'): 1, ('K', 'C'): -4,
    ('M', 'A'): 1, ('M', 'U'): -4, ('M', 'G'): -4, ('M', 'C'): 1,
    ('B', 'A'): -4, ('B', 'U'): -1, ('B', 'G'): -1, ('B', 'C'): -1,
    ('V', 'A'): -1, ('V', 'U'): -4, ('V', 'G'): -1, ('V', 'C'): -1,
    ('H', 'A'): -1, ('H', 'U'): -1, ('H', 'G'): -4, ('H', 'C'): -1,
    ('D', 'A'): -1, ('D', 'U'): -1, ('D', 'G'): -1, ('D', 'C'): -4,
    ('N', 'A'): -2, ('N', 'U'): -2, ('N', 'G'): -2, ('N', 'C'): -2,

}
92 | 
# Public name of the scoring matrix used for RNA alignment. It is an alias of
# the unambiguous nucleotide table only, so ambiguity codes have no score here.
NUC44 = NUCLEOTIDE_NUCLEOTIDE_SCORING
94 | 


--------------------------------------------------------------------------------
/pydca/sequence_backmapper/sequence_backmapper.py:
--------------------------------------------------------------------------------
  1 | from pydca.fasta_reader import fasta_reader
  2 | from . import scoring_matrix
  3 | from Bio import pairwise2
  4 | from Bio.SubsMat.MatrixInfo import blosum62
  5 | import logging
  6 | import os
  7 | 
"""Performs sequence back-mapping of a reference sequence to the MSA sequences.
The back-mapping is done by searching the best matching sequence to the reference.
The searching is carried out by pairwise local alignment of the reference sequence
with the sequences in the multiple sequence alignment (MSA). Among the best matching
sequences (if there are more than one), the first found is taken. The reference
and best-matching sequences are then pairwise locally aligned to find out the
matching portion of the reference sequence. The residues in this reference
subsequence are then mapped to their counterparts in the MSA columns.
 16 | 
 17 | Author: Mehari B. Zerihun
 18 | """
 19 | 
 20 | logger = logging.getLogger(__name__)
 21 | 
 22 | class SequenceBackmapper:
 23 |     """Defines a sequence backmapper class. Instances of SequenceBackmapper perform
 24 |     mapping of a reference sequence to the best matching sequence in the multiple
 25 |     sequence alignment (MSA) data.
 26 |     """
 27 |     def __init__(self, msa_file = None, alignment_data = None, ref_seq=None,
 28 |             refseq_file= None, biomolecule=None):
 29 |         """Initializes a SequenceBackmapper instance
 30 |         Parameters
 31 |         -----------
 32 |             msa_file : str
 33 |                 Path to multiple sequence alignment file in FASTA format.
 34 |             alignment_data : list
 35 |                 A list of alignned sequence in integer representation.
 36 |             ref_seq : str
 37 |                 A reference sequences in character form.
 38 |             refseq_file: str
 39 |                 Path to FASTA formatted file containing a reference sequence.
 40 |             biomolecule : str
 41 |                 Type of the sequence data (protein or RNA)
 42 | 
 43 |         Returns
 44 |         -------
 45 |             None : None
 46 | 
 47 |         """
 48 |         self.__biomolecule = biomolecule.strip().upper()
 49 |         if msa_file:
 50 |             self.__alignment = fasta_reader.get_alignment_char_form(msa_file,
 51 |                 biomolecule=self.__biomolecule
 52 |             )
 53 |         elif alignment_data:
 54 |             unique_seqs = []
 55 |             for seq in alignment_data:
 56 |                 if seq not in unique_seqs: unique_seqs.append(seq)
 57 |             unique_seqs_char_form = fasta_reader.sequences_to_char_form(
 58 |                 unique_seqs, self.__biomolecule)
 59 |             self.__alignment = unique_seqs_char_form
 60 |         else:
 61 |             logger.error('\n\tPlease provide alignment file or a list of alignments')
 62 |             raise ValueError
 63 | 
 64 |         if refseq_file:
 65 |             self.__ref_sequence = self._reference_sequence(
 66 |                 refseq_file=refseq_file)
 67 |         elif ref_seq:
 68 |             self.__ref_sequence = ref_seq.strip().upper()
 69 |         else:
 70 |             logger.error('\n\tPlease provide a reference sequence or a FASTA'
 71 |                 ' file containing a reference sequence')
 72 |             raise ValueError
 73 |         self._validate_refseq()
 74 | 
 75 |         return None
 76 | 
 77 | 
 78 |     @property
 79 |     def alignment(self):
 80 |         """SequenceBackmapper alignment getter
 81 |         Parameters
 82 |         ----------
 83 |             self : SequenceBackmapper
 84 |                 An instance of SequenceBackmapper class
 85 |         Returns
 86 |         -------
 87 |             self.__alignment : list
 88 |                 The list of MSA in char represenation
 89 |         """
 90 |         return self.__alignment
 91 | 
 92 | 
 93 |     @property
 94 |     def ref_sequence(self):
 95 |         """SequenceBackmapper reference sequence getter.
 96 | 
 97 |         Parameters
 98 |         ----------
 99 |             self : SequenceBackmapper
100 |                 An instance of SequenceBackmapper class
101 | 
102 |         Returns
103 |         -------
104 |             self.__ref_sequence : str
105 |                 The reference sequence of a SequenceBackmapper instance.
106 |         """
107 |         return self.__ref_sequence
108 | 
109 |     def __str__(self):
110 |         """Provides a human readable representation of a SequenceBackmapper
111 |         instance.
112 | 
113 |         Parameters
114 |         ----------
115 |             self : SequenceBackmapper
116 |                 An instance of SequenceBackmapper class
117 | 
118 |         Returns
119 |         -------
120 |             describe : str
121 |                 Description about a SequenceBackmapper instance.
122 |         """
123 |         describe = '<A sequence backmapper object of biomolecule type {}>'
124 |         return describe.format(self.__biomolecule)
125 | 
126 | 
127 |     def _validate_refseq(self):
128 |         """Validate the reference sequence by checking if it contains gaps
129 |         or non-standard residues
130 | 
131 |         Parameters
132 |         ----------
133 |             self : SequenceBackmapper
134 |                 An instance of SequenceBackmapper class
135 | 
136 |         Returns
137 |         -------
138 |             None : None
139 |         """
140 |         standard_res_plus_gap = fasta_reader.RES_TO_INT_ALL[self.__biomolecule].keys()
141 |         gap_symbols = ['-', '.', '~']
142 |         standard_residues = [
143 |             state for state in standard_res_plus_gap if state not in gap_symbols
144 |         ]
145 |         for res in self.__ref_sequence:
146 |             if res not in standard_residues:
147 |                 logger.error('\n\tReference sequence should only contain standard residues')
148 |                 raise ValueError
149 |         return None
150 | 
151 | 
152 |     def _reference_sequence(self, refseq_file):
153 |         """Reads a reference sequence from FASTA file. Note that if there are
154 |         more than one sequences in the file, only the one that is found first
155 |         taken as a reference sequence.
156 | 
157 |         Parameters
158 |         -----------
159 |             refseq_file: str
160 |                 Path to fasta file containing the reference sequence.
161 |                 If there are multiple sequences, the first is taken
162 | 
163 |         Returns
164 |         -------
165 |             ref_sequence : str
166 |                 Reference sequence
167 |         """
168 |         logger.info('\n\tObtaining reference sequence from file:'
169 |             '\n\t\t{}'.format(refseq_file)
170 |         )
171 |         ref_seqs = fasta_reader.get_alignment_char_form(
172 |             refseq_file,
173 |             biomolecule = self.__biomolecule,
174 |         )
175 |         ref_sequence = ref_seqs[0]
176 |         if len(ref_seqs) > 1 :
177 |             logger.warning('\n\tFound multiple reference sequences in file {}.'
178 |                 '\n\tFirst sequence taken as reference'.format(os.path.basename(
179 |                 refseq_file)))
180 |         if not ref_sequence:
181 |             logger.error('\n\tNo reference sequence found')
182 |             raise ValueError
183 |         logger.info('\n\tReference sequence:\n\t{}'.format(ref_sequence))
184 | 
185 |         return ref_sequence.strip().upper()
186 | 
187 | 
188 |     def align_pairs_local(self, ref_seq, other_seq, score_only = False):
189 |         """Performs pairwise alignment give two sequences
190 | 
191 |         Parameters
192 |         ----------
193 |             ref_seq : str
194 |                 Reference sequence
195 |             other_seq : str
196 |                 Sequence to be aligned to reference
197 |             biomolecule : str
198 |                 Sequences type, protein or RNA
199 | 
200 |         Returns
201 |         -------
202 |             alignments : tuple
203 |                 A tuple of pairwise aligned sequences, alignment score and
204 |                 start and end indices of alignment
205 |         """
206 |         if self.__biomolecule == 'RNA':
207 |             scoring_mat = scoring_matrix.NUC44
208 |             GAP_OPEN_PEN = -8
209 |             GAP_EXTEND_PEN = 0
210 |         elif self.__biomolecule == 'PROTEIN':
211 |             scoring_mat = blosum62
212 |             GAP_OPEN_PEN = -10
213 |             GAP_EXTEND_PEN = -1
214 |         else:
215 |             logger.error('\n\tUnknown biomolecule type.'
216 |                 ' Cannot figure out the scoring matrix.')
217 |             raise ValueError
218 | 
219 |         alignments = pairwise2.align.localds(
220 |             ref_seq,
221 |             other_seq,
222 |             scoring_mat,
223 |             GAP_OPEN_PEN,
224 |             GAP_EXTEND_PEN,
225 |             score_only = score_only,
226 |         )
227 | 
228 |         return alignments
229 | 
230 | 
231 |     def find_matching_seqs_from_alignment(self):
232 |         """Finds the best matching sequences to the reference
233 |         sequence in the alignment. If multiple matching sequences
234 |         are found, the first one (according to the order in the MSA)
235 |         is taken
236 | 
237 |         Parameters
238 |         ----------
239 |             self : SequenceBackmapper
240 |                 An instance of SequenceBackmapper class
241 | 
242 |         Returns
243 |         -------
244 |             best_matching_seqs : list
245 |                 A list of best matching sequences to reference
246 |         """
247 | 
248 |         logger.info('\n\tSearching for sequence(s) that match best with the'
249 |             ' reference sequence')
250 |         # if the first sequence (gaps removed) in MSA matches with reference,
251 |         # return this sequence.
252 |         first_seq_in_alignment = self.__alignment[0]
253 |         first_seq_in_alignment_gaps_removed = first_seq_in_alignment.replace('-','')
254 |         if first_seq_in_alignment_gaps_removed == self.__ref_sequence:
255 |             logger.info('\n\tFirst sequence in alignment (gaps removed) matches reference,'
256 |                 '\n\tSkipping regorous search for matching sequence'
257 |             )
258 |             first_seq = list()
259 |             first_seq.append(first_seq_in_alignment)
260 |             return first_seq
261 |         pairwise_scores = []
262 |         for seq_indx, seq in enumerate(self.__alignment):
263 |             seq_gaps_removed = seq.replace('-','')
264 | 
265 |             score = self.align_pairs_local(
266 |                 self.__ref_sequence,
267 |                 seq_gaps_removed,
268 |                 score_only = True,
269 |                 )
270 |             score_at_indx = (seq_indx, score)
271 |             pairwise_scores.append(score_at_indx)
272 | 
273 |         seq_indx, max_score = max(pairwise_scores, key=lambda x: x[1])
274 |         matching_seqs_indx = [
275 |             indx  for indx, score in pairwise_scores if score == max_score
276 |         ]
277 | 
278 |         best_matching_seqs = [
279 |             self.__alignment[indx] for indx in matching_seqs_indx
280 |         ]
281 |         num_matching_seqs = len(best_matching_seqs)
282 |         if num_matching_seqs > 1 :
283 |             logger.warning('\n\tFound {} sequences in MSA that match the reference'
284 |                 '\n\tThe first sequence is taken as matching'.format(num_matching_seqs)
285 |             )
286 |         return best_matching_seqs
287 | 
288 |     @staticmethod
289 |     def align_subsequences(ref_middle_subseq = None,
290 |             template_subseq_in_msa = None, num_res_middle_template = None):
291 |         """Aligns the portion of reference sequence (ref_middle_subseq) by
292 |         scanning through the portion of template sequence (template_subseq_in_msa)
293 |         and inserting the gaps encountered into the portion of the reference.
294 | 
295 |         Parameters
296 |         ----------
297 |             ref_middle_subseq : str
298 |                 The portion of the reference sequence that matched when pairwise
299 |                 aligned with the template sequence in MSA.
300 |             template_subseq_in_msa : str
301 |                 The portion of the template sequence that matched with the
302 |                 reference sequence when pairwise aligned with the reference
303 |                 sequence.
304 |             num_res_middle_template : int
305 |                 The number of residues (excluding gaps) in matched portion of
306 |                 the template sequence.
307 | 
308 |         Returns
309 |         -------
310 |             ''.join(mapped_ref_subseq) : str
311 |                 The mapped form of reference sequence. This mapped form contains
312 |                 the residues and gaps of the matching portion of the reference
313 |                 sequence as well as the newly introduced gap states from the
314 |                 tempate sequence as it appears in MSA data.
315 |         """
316 |         mapped_ref_subseq = []
317 |         res_count = 0
318 |         pos = 0
319 |         gap_symbol = '-'.strip()
320 |         for site in template_subseq_in_msa:
321 |             if res_count == num_res_middle_template: break
322 |             if site != gap_symbol:
323 |                 mapped_ref_subseq.append(ref_middle_subseq[pos])
324 |                 pos += 1
325 |                 res_count += 1
326 |                 #added after Fabrizio's bug report
327 |                 if pos == len(ref_middle_subseq): break
328 |             else:
329 | 
330 |                 if ref_middle_subseq[pos] != gap_symbol:
331 |                     mapped_ref_subseq.append(gap_symbol)
332 |                 else:
333 |                     mapped_ref_subseq.append(ref_middle_subseq[pos])
334 |                     pos += 1
335 |         mapped_ref_subseq.extend(list(ref_middle_subseq[pos:]))
336 |         return ''.strip().join(mapped_ref_subseq)
337 | 
338 | 
339 |     def map_to_reference_sequence(self):
340 |         """Mapps the reference sequence to the template sequence in alignment
341 |         data. The template sequence is the best scoring sequence when pairwise
342 |         locally aligned with the reference sequence. Here are the steps for
343 |         backmapping:
344 | 
345 |         i) Find the best matching (template) sequence from the MSA. This is done
346 |         by using pairwise local alignment with the reference sequence.
347 | 
348 |         ii) Find the aligned portions of the reference and template sequence
349 |         when they are aligned locally. In this step, start  and end indices
350 |         of the matching subsequences, and the number of residues in these two
351 |         subsequences are recorded.
352 | 
353 |         iii) Map the matching portion of the reference sequence to the matching
354 |         portion of the template sequence in its form in the MSA data. In this
355 |         step, the gaps that are in the template sequence but are not in the
356 |         reference are inserted into the reference sequence at the corresponding
357 |         positions.
358 | 
359 |         iv) Map the sites (indices) of the reference sequence. In this step,
360 |         mapping is done using the matching start index position of the template
361 |         sequence as it appears in the MSA. The reference sequence residues are
362 |         counted starting from the residue that has been found at the starting
363 |         position of the matching portion when the reference and template
364 |         sequences were locally aligned.
365 | 
366 |         Parameters
367 |         ----------
368 |             self : SequenceBackmapper
369 |                 An instance of SequenceBackmapper class
370 | 
371 |         Returns
372 |         --------
373 |             mapped_sites : dict
374 |                 A dictionary containing the position of mapped residues as they
375 |                 appear in the reference sequence as keys and their corresponding
376 |                 mapping index in the MSA as values. E.g. {4:9, 5:10, 6:13, ..}
377 |                 mapps site 5 (index 4) in the MSA to index 9 (site 10) int the
378 |                 reference sequence, ... and so on. The non-aligned residues that might
379 |                 appear at the begining or end of the matching subsequences are
380 |                 not mapped.
381 |         """
382 |         logger.info('\n\tBackmapping reference sequence to MSA')
383 |         # find best matching sequences to the ref. sequence from the alignment.
384 |         template_sequences_in_msa = self.find_matching_seqs_from_alignment()
385 |         # take the first matching sequence (there can be multiple matches)
386 |         template_seq_in_msa = template_sequences_in_msa[0]
387 |         logger.info('\n\tTemplate sequence in msa:\n{}'.format(template_seq_in_msa))
388 |         # remove the gaps from the matching sequence so that it can be pair
389 |         # aligned with the reference sequence.
390 |         gap_symbol = '-'.strip()
391 |         null_char = ''.strip()
392 |         template_seq_in_msa_gaps_removed = template_seq_in_msa.replace(
393 |             gap_symbol, null_char
394 |         )
395 |         logger.info('\n\tReference sequence and Template sequence (gaps removed)'
396 |             ' respectively:\n{}\n{}'.format(self.__ref_sequence,
397 |                 template_seq_in_msa_gaps_removed
398 |             ),
399 |         )
400 |         # pairwise locally align the reference and matching sequenece
401 |         ref_and_template_aligned = self.align_pairs_local(self.__ref_sequence,
402 |             template_seq_in_msa_gaps_removed,
403 |         )
404 |         # caputre the aligned form of the ref. and matching sequences
405 |         # as well as the score, and start and end indices
406 |         ref_seq_aligned = ref_and_template_aligned[0][0]
407 |         template_seq_aligned = ref_and_template_aligned[0][1]
408 |         the_score = ref_and_template_aligned[0][2]
409 |         start_indx = ref_and_template_aligned[0][3]
410 |         end_indx = ref_and_template_aligned[0][4]
411 |         # capture the matching portions of the aligned sequences
412 |         ref_middle_subseq = ref_seq_aligned[start_indx:end_indx]
413 |         template_middle_subseq = template_seq_aligned[start_indx:end_indx]
414 |         logger.info('\n\tMatching subsequences of the reference and the template'
415 |             ' respectively:\n{}\n{}'
416 |             '\n\tMatching start and end positions:[{}, {}]'.format(ref_middle_subseq,
417 |             template_middle_subseq, start_indx + 1, end_indx + 1,
418 |             ),
419 |         )
420 |         # capture the number of residues (excluding gaps) in each subsequence
421 |         num_leading_res_template = len(
422 |             template_seq_aligned[:start_indx].replace(gap_symbol, null_char)
423 |         )
424 |         num_leading_res_ref = len(
425 |             ref_seq_aligned[:start_indx].replace(gap_symbol, null_char)
426 |         )
427 |         num_res_middle_template = len(
428 |             template_middle_subseq.replace(gap_symbol, null_char)
429 |         )
430 |         num_res_middle_ref = len(
431 |             ref_middle_subseq.replace(gap_symbol, null_char)
432 |         )
433 | 
434 |         # find start index of the matching seq. in the alignment
435 |         res_count = 0
436 |         for k, site in enumerate(template_seq_in_msa, start=0):
437 | 
438 |             if res_count == num_leading_res_template:
439 |                 start_indx_in_msa = k
440 |                 break
441 |             if site != gap_symbol: res_count += 1
442 | 
443 |         template_subseq_in_msa = template_seq_in_msa[start_indx_in_msa:] # excludes only leading sites
444 |         backmapped_ref_subseq = self.align_subsequences(
445 |             ref_middle_subseq = ref_middle_subseq,
446 |             template_subseq_in_msa = template_subseq_in_msa,
447 |             num_res_middle_template = num_res_middle_template,
448 |         )
449 |         logger.info('\nBackmapped ref subsequence:\n{}'.format(backmapped_ref_subseq))
450 |         mapped_sites = dict() # keys are refseq sites and values are matching seq sites in MSA
451 |         mapped_res_count = 0
452 |         for k , site in enumerate(backmapped_ref_subseq, start = 0):
453 |             #we can only map a maximum of alignment length sites
454 |             if k == len(template_seq_in_msa) - start_indx_in_msa: break
455 |             if site != gap_symbol:
456 |                 mapped_sites[mapped_res_count + num_leading_res_ref] = start_indx_in_msa + k
457 |                 mapped_res_count += 1
458 |         logger.info('\n\tNumber of residues mapped: {}'
459 |             '\n\tNumber of residues in the (original) reference sequence: {}'.format(
460 |                 len(mapped_sites), len(self.__ref_sequence))
461 |         )
462 |         # invert the mapping so that keys are matching seq sites in MSA and values are refseq sites.
463 |         mapped_sites_from_msa_to_ref = {
464 |             value:key for key, value in mapped_sites.items()
465 |         }
466 |         return mapped_sites_from_msa_to_ref
467 | 
468 | 
if __name__ == '__main__':
    # Command line usage, kept for reference (disabled):
    #
    #   from pydca.config_dca.config_log import LOGGING_CONFIG
    #   from argparse import ArgumentParser
    #   import logging
    #   import logging.config
    #   logging.config.dictConfig(LOGGING_CONFIG)
    #   parser = ArgumentParser()
    #   parser.add_argument('msa_file', help = 'FASTA file containing alignment data')
    #   parser.add_argument('refseq_file', help = 'FASTA file containing reference sequence')
    #   parser.add_argument('biomolecule', choices = ['protein', 'PROTEIN', 'rna', 'RNA'])
    #   args = parser.parse_args()
    #   alignment_data = fasta_reader.get_alignment_int_form(args.msa_file,
    #       biomolecule=args.biomolecule)
    #   seq_backmapper = SequenceBackmapper(alignment_data = alignment_data,
    #       refseq_file = args.refseq_file , biomolecule = args.biomolecule)
    #   backmapped_sites = seq_backmapper.map_to_reference_sequence()

    # Small demonstration of the gap insertion step. It calls the static
    # method on the class directly: no SequenceBackmapper instance exists in
    # this block, and the method is named align_subsequences.
    ref_middle_subseq =      'AAAAAAAA---AA' # 10 residues, 3 gaps
    template_subseq_in_msa = '---BBB--BB-BBBB----' # 9 residues
    num_res_middle_template = 9
    backmapped_ref_subseq = SequenceBackmapper.align_subsequences(
        ref_middle_subseq = ref_middle_subseq,
        template_subseq_in_msa = template_subseq_in_msa,
        num_res_middle_template = num_res_middle_template,
    )
    print(backmapped_ref_subseq)
498 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | scipy==1.3.1
2 | biopython==1.74
3 | numpy>=1.13.3, <=1.15.4
4 | llvmlite==0.30.0
5 | numba==0.46.0
6 | matplotlib==3.0.0
7 | requests>=2.22.0
8 | 


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages, Extension
 2 | from setuptools.command.build_ext import build_ext 
 3 | 
 4 | with open("README.md") as fh:
 5 |     long_description = fh.read()
 6 | 
# Runtime dependencies, passed to setup() below as both install_requires and
# tests_require. The same pins are listed in requirements.txt.
requirements = [
    "scipy==1.3.1",
    "biopython==1.74",
    "numpy>=1.13.3, <=1.15.4",
    'llvmlite==0.30.0',
    "numba==0.46.0",
    "matplotlib==3.0.0",
    "requests>=2.22.0",
]

# Compiler and linker flags for the plmDCA C++ extension: OpenMP support,
# the C++11 standard and -O3 optimisation.
plmdca_compile_args = ["-fopenmp", "-std=c++11", "-O3"]  
plmdca_link_args = ["-fopenmp", "-O3"] 
19 | 
20 | 
# C++ extension module providing the plmDCA backend. It is built from the
# bundled L-BFGS source and the two plmDCA source files, with headers taken
# from the two include directories listed below.
plmdca_ext = Extension(
    'pydca.plmdca._plmdcaBackend',
    [   
        'pydca/plmdca/lbfgs/lib/lbfgs.cpp',
        'pydca/plmdca/plmdca_numerics.cpp',
        'pydca/plmdca/plmdcaBackend.cpp', 
    ],
    include_dirs=[
        'pydca/plmdca/include/',
        'pydca/plmdca/lbfgs/include/',
    ],
    extra_compile_args = plmdca_compile_args,
    extra_link_args = plmdca_link_args,
    language = "c++",  
)
36 | 
# Package definition: metadata, the Python packages to ship, the compiled
# plmDCA extension, dependencies and the console entry points.
setup(
    name="pydca",
    version="1.23",
    author="Mehari B. Zerihun",
    author_email="mbzerihun@gmail.com",
    python_requires=">=3.5",
    description="Direct couplings analysis (DCA) for protein and RNA sequences",
    long_description=long_description,
    long_description_content_type="text/markdown",
    url="https://github.com/KIT-MBS/pydca",
    download_url="https://pypi.org/project/pydca/",
    # Ship every package found except tests, extras and examples.
    packages=find_packages(
        exclude=["*.tests","*.tests.*","tests.*", "tests",
            "*.extras", "*.extras.*", "extras.*", "extras",
            "examples", "*.examples", "examples.*", "*.examples.*",
            "install.sh",
        ],
    ),
    ext_modules = [plmdca_ext],
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Programming Language :: C",
        "License :: OSI Approved :: MIT License",
        "Development Status :: 4 - Beta",
        "Operating System :: POSIX :: Linux",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
    install_requires= requirements,
    tests_require = requirements,
    # Command line programs installed with the package, each bound to a
    # function in the named pydca module.
    entry_points={
        "console_scripts":[
            "mfdca = pydca.mfdca_main:run_meanfield_dca",
            "plmdca = pydca.plmdca_main:run_plm_dca",
            "pydca = pydca.main:run_pydca",
        ],
    },
    test_suite="tests",
)


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | """Implements unit tests for pydca package.
2 | """
3 | 


--------------------------------------------------------------------------------
/tests/fasta_reader_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import, division
 2 | import unittest
 3 | import os
 4 | import glob
 5 | from pydca.fasta_reader import fasta_reader
 6 | from .input_files_path import InputFilesPath
 7 | class TestCase(unittest.TestCase):
 8 |     def setUp(self):
 9 |         """
10 |         """
11 |         self.__rna_msa_file = InputFilesPath.rna_msa_file
12 |         self.__protein_msa_file = InputFilesPath.protein_msa_file
13 |         self.__rna = 'rna'
14 |         self.__protein = 'protein'
15 | 
16 | 
17 | 
18 |     def test_get_alignment_from_fasta_file(self):
19 |         rna_seqs = fasta_reader.get_alignment_from_fasta_file(
20 |             self.__rna_msa_file,
21 |         )
22 |         self.assertIsNotNone(rna_seqs)
23 |         protein_seqs = fasta_reader.get_alignment_from_fasta_file(
24 |             self.__protein_msa_file,
25 |         )
26 |         self.assertIsNotNone(protein_seqs)
27 | 
28 |     def test_get_alignment_int_form(self):
29 |         rna_seqs_int_form = fasta_reader.get_alignment_int_form(
30 |             self.__rna_msa_file,
31 |             biomolecule = self.__rna,
32 |         )
33 |         self.assertIsNotNone(rna_seqs_int_form)
34 |         protein_seqs_int_form = fasta_reader.get_alignment_int_form(
35 |             self.__protein_msa_file,
36 |             biomolecule = self.__protein,
37 |         )
38 | 
39 |         self.assertIsNotNone(protein_seqs_int_form)
40 | 
41 | 
# Run the tests of this module when it is executed as the main module.
# NOTE(review): this file uses a relative import (`from .input_files_path ...`),
# which needs a package context; presumably it has to be launched as
# `python -m tests.fasta_reader_test` rather than by script path -- confirm.
if __name__ == '__main__':
    unittest.main()
44 | 


--------------------------------------------------------------------------------
/tests/input_files_path.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | class InputFilesPath:
 4 |     """Defines paths to input files for testing.
 5 |     attributes:
 6 |         rna_msa_file:
 7 |             Absolute path to MSA FASTA file containg RNA sequences.
 8 |         rna_ref_file:
 9 |             Absolute path to FASTA file containing RNA reference sequence
10 |         protein_msa_file:
11 |             Absolute path to MSA FASTA file containing protein sequences
12 |         protein_ref_file:
13 | 
14 |     """
15 |     rna_msa_file = os.path.abspath("tests/tests_input/MSA_RF00059_trimmed_gap_treshold_50.fa")
16 |     rna_ref_file = os.path.abspath("tests/tests_input/ref_seq_RF00059.faa")
17 |     protein_msa_file = os.path.abspath("tests/tests_input/PF02826.faa")
18 |     protein_ref_file = os.path.abspath("tests/tests_input/ref_seq_PF02826.faa")
19 | 


--------------------------------------------------------------------------------
/tests/meanfield_dca_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import absolute_import
 2 | from pydca.meanfield_dca.meanfield_dca import MeanFieldDCA
 3 | from .input_files_path import InputFilesPath
 4 | import unittest
 5 | 
 6 | from Bio import AlignIO
 7 | 
 8 | class MeanFieldDCATestCase(unittest.TestCase):
 9 |     """Test MeanFieldDCA instance behaviour
10 |     """
11 |     def setUp(self):
12 |         #rna test files
13 |         self.__rna_msa_file = InputFilesPath.rna_msa_file
14 |         self.__rna_ref_file = InputFilesPath.rna_ref_file
15 |         #protein test files
16 |         self.__protein_msa_file = InputFilesPath.protein_msa_file
17 |         self.__protein_ref_file = InputFilesPath.protein_ref_file
18 | 
19 | 
20 |         self.__mfdca_instance_protein = MeanFieldDCA(
21 |             self.__protein_msa_file,
22 |             'protein',
23 |         )
24 |         self.__mfdca_instance_rna = MeanFieldDCA(
25 |             self.__rna_msa_file,
26 |             'rna',
27 |         )
28 | 
29 | 
30 |     def test_compute_sorted_DI_rna(self):
31 |         """
32 |         """
33 |         #self.__mfdca_instance_protein.compute_sorted_DI()
34 |         sorted_DI = self.__mfdca_instance_rna.compute_sorted_DI()
35 | 
36 |     def test_compute_sorted_DI_protein(self):
37 |         """
38 |         """
39 |         sorted_DI = self.__mfdca_instance_protein.compute_sorted_DI()
40 | 
41 | 
class MeanFieldDCAInputTestCase(unittest.TestCase):
    """
    Consistency test: MeanFieldDCA must give the same result whether it is
    handed the MSA file path or the alignment already parsed by AlignIO.
    """

    def setUp(self):
        """Store the protein fixture paths."""
        self.__protein_ref_file = InputFilesPath.protein_ref_file
        self.__protein_msa_file = InputFilesPath.protein_msa_file

    def test_input(self):
        """compute_sorted_FN_APC() agrees for file-path and parsed-MSA input."""
        biomolecule = 'protein'

        # Reference result: the MSA is passed as a path to the FASTA file.
        scores_from_path = MeanFieldDCA(
            self.__protein_msa_file, biomolecule
        ).compute_sorted_FN_APC()

        # Same MSA, but parsed up front and passed as an alignment object.
        parsed_msa = AlignIO.read(self.__protein_msa_file, 'fasta')
        scores_from_msa = MeanFieldDCA(
            parsed_msa, biomolecule
        ).compute_sorted_FN_APC()

        self.assertEqual(scores_from_msa, scores_from_path)
62 | 
63 | 
# Run the tests of this module when it is executed as the main module.
# NOTE(review): this file uses a relative import (`from .input_files_path ...`),
# which needs a package context; presumably it has to be launched as
# `python -m tests.meanfield_dca_test` rather than by script path -- confirm.
if __name__ == '__main__':
    unittest.main()
66 | 


--------------------------------------------------------------------------------
/tests/sequence_backmapper_test.py:
--------------------------------------------------------------------------------
 1 | from __future__ import  absolute_import
 2 | import unittest
 3 | from pydca.sequence_backmapper import sequence_backmapper as seq_backmapper
 4 | from pydca.fasta_reader import fasta_reader
 5 | from .input_files_path import  InputFilesPath
 6 | 
 7 | 
 8 | class SequenceBackmapperTestCase(unittest.TestCase):
 9 | 
10 |     def setUp(self):
11 |         """
12 |         """
13 |         self.__rna_msa_file = InputFilesPath.rna_msa_file
14 |         self.__rna_ref_file = InputFilesPath.rna_ref_file
15 |         self.__protein_msa_file = InputFilesPath.protein_msa_file
16 |         self.__protein_ref_file = InputFilesPath.protein_ref_file
17 | 
18 | 
19 |         rna_alignment_int_form = fasta_reader.get_alignment_int_form(
20 |             self.__rna_msa_file, biomolecule = 'rna')
21 | 
22 |         self.__rna_backmapper = seq_backmapper.SequenceBackmapper(
23 |             alignment_data = rna_alignment_int_form,
24 |             refseq_file = self.__rna_ref_file,
25 |             biomolecule = 'rna',
26 |         )
27 | 
28 |         protein_alignment_int_form = fasta_reader.get_alignment_int_form(
29 |             self.__protein_msa_file, biomolecule='protein')
30 | 
31 |         self.__protein_backmapper = seq_backmapper.SequenceBackmapper(
32 |             alignment_data = protein_alignment_int_form,
33 |             refseq_file = self.__protein_ref_file,
34 |             biomolecule = 'protein'
35 |         )
36 | 
37 | 
38 |     def test_map_to_reference_sequence(self):
39 |         mapped_sites_rna = self.__rna_backmapper.map_to_reference_sequence()
40 |         self.assertTrue(len(mapped_sites_rna) > 1)
41 |         mapped_sites_protein = self.__protein_backmapper.map_to_reference_sequence()
42 |         self.assertTrue(len(mapped_sites_protein) > 1)
43 | 
44 | 
# Run the tests of this module when it is executed as the main module.
# NOTE(review): this file uses a relative import (`from .input_files_path ...`),
# which needs a package context; presumably it has to be launched as
# `python -m tests.sequence_backmapper_test` rather than by script path -- confirm.
if __name__ == '__main__':
    unittest.main()
47 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_PF02826.faa:
--------------------------------------------------------------------------------
1 | >referenceseq
2 | FALIMAIARRVVELAEFVKQGQWvsHIDEPQFGSDVHGKTLGMVGMGRIGAAVARRGAGFGMRVCYSNASPKPVLEAELGARRCELDELLSQSDFVCATVPLTTETHHLLGAEEFRRMKPSAIFINIARGSVVDEQALIAALQAGQLRGAGLDVFEEEPVSLDSPLLRMPSVVALPHIG
3 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_RF00059.faa:
--------------------------------------------------------------------------------
1 | >RF00059
2 | ggacucggggugcccuucugcgugaaggcugagaaauacccguaucaccugaucuggauaaugccagcguagggaaguuc
3 | >nonvalid-for-testing
4 | ggaacccggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggggg
5 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_RF00059_test1.faa:
--------------------------------------------------------------------------------
1 | >CP002810.1/1659724-1659819 
2 | CCCCACGACACGGGGUGCGCCCGCCGGGCGCUGAGAUCACACCCGUCGAACCUGACUCAGCUCGUACUGACGGAGGGAUGUCGACCAUGACGCUG
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_RF00059_test2.faa:
--------------------------------------------------------------------------------
1 | >CP002810.1/1659724-1659819 
2 | ACGACACGGGGUGCGCCCGCCGGGCGCUGAGAUCACACCCGUCGAACCUGACUCAGCUCGUACUGACGGAGGGAUGUCGACCAUGACGCUG
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_RF00059_test3.faa:
--------------------------------------------------------------------------------
1 | >CP002810.1/1659724-1659819 
2 | CCCCACGACACGGGGUGCGCCCGCCGGGCGCUGAGAUCACACCCGUCGAACCUGACUCAGCUCGUACUGACGGAGGGAUGUCGACCA
3 | 
4 | 


--------------------------------------------------------------------------------
/tests/tests_input/ref_seq_RF00059_test4.faa:
--------------------------------------------------------------------------------
1 | >CP002810.1/1659724-1659819 
2 | ACGACACGGGGUGCGCCCGCCGGGCGCUGAGAUCACACCCGUCGAACCUGACUCAGCUCGUACUGACGGAGGGAUGUCGACCA
3 | 
4 | 


--------------------------------------------------------------------------------