├── .gitignore
├── .pep8
├── .ycm_extra_conf.py
├── LICENSE
├── README.md
├── azure-pipelines.yml
├── ccmpred
├── __init__.py
├── algorithm
│ ├── __init__.py
│ ├── gradient_descent.py
│ └── lbfgs.py
├── centering.py
├── counts
│ ├── __init__.py
│ └── msacounts.c
├── gaps
│ ├── __init__.py
│ └── cext
│ │ ├── __init__.py
│ │ ├── gaps.c
│ │ └── gaps.h
├── io
│ ├── __init__.py
│ ├── alignment.py
│ ├── contactmatrix.py
│ └── pdb.py
├── locmeth
│ ├── __init__.py
│ ├── mi
│ │ └── __init__.py
│ └── omes
│ │ └── __init__.py
├── logo.py
├── monitor
│ ├── __init__.py
│ └── progress.py
├── objfun
│ ├── __init__.py
│ ├── cd
│ │ ├── __init__.py
│ │ └── cext
│ │ │ ├── __init__.py
│ │ │ ├── cd.c
│ │ │ ├── cd.h
│ │ │ ├── cdutil.c
│ │ │ └── cdutil.h
│ └── pll
│ │ ├── __init__.py
│ │ └── cext
│ │ ├── __init__.py
│ │ ├── pll.c
│ │ └── pll.h
├── parameter_handling.py
├── plotting
│ └── __init__.py
├── pseudocounts.py
├── raw
│ ├── __init__.py
│ ├── ccmraw.py
│ ├── convert_msgpack.py
│ └── convert_raw.py
├── regularization.py
├── sampling
│ ├── __init__.py
│ └── cext
│ │ ├── __init__.py
│ │ ├── treecd.c
│ │ └── treecd.h
├── sanity_check.py
├── scripts
│ ├── __init__.py
│ ├── convert.py
│ ├── plot_ccmpred.py
│ ├── replace_gaps.py
│ ├── run_ccmgen.py
│ └── run_ccmpred.py
├── substitution_matrices.py
├── trees.py
└── weighting
│ ├── __init__.py
│ └── cext
│ ├── __init__.py
│ ├── weighting.c
│ └── weighting.h
├── ci_support
├── 1atzA.braw.gz
├── 1atzA.fas
├── 1atzA_rootname.tree
├── mrf_params.braw.gz
├── phylo.newick
├── random_start_sequence.py
└── run_tests.sh
├── example
├── 1atzA.alignment_statistics.mcmc_pcd_vs_original.png
├── 1atzA.apc.html
├── 1atzA.apc.mat
├── 1atzA.apc.png
├── 1atzA.braw.gz
├── 1atzA.ec.mat
├── 1atzA.fas
├── 1atzA.noapc.mat
├── 1atzA.pcd.apc.png
└── 1atzA.pdb
└── setup.py
/.gitignore:
--------------------------------------------------------------------------------
1 | ### https://raw.github.com/github/gitignore/master/Python.gitignore
2 |
3 | # Byte-compiled / optimized
4 | __pycache__/
5 | *.py[cod]
6 |
7 | # Distribution / packaging
8 | .Python
9 | env/
10 | build/
11 | develop-eggs/
12 | dist/
13 | eggs/
14 | lib/
15 | lib64/
16 | parts/
17 | sdist/
18 | var/
19 | *.egg-info/
20 | .installed.cfg
21 | *.egg
22 |
23 | # PyInstaller
24 | # Usually these files are written by a python script from a template
25 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
26 | *.manifest
27 | *.spec
28 |
29 | # Installer logs
30 | pip-log.txt
31 | pip-delete-this-directory.txt
32 |
33 | # Unit test / coverage reports
34 | htmlcov/
35 | .tox/
36 | .coverage
37 | .cache
38 | nosetests.xml
39 | coverage.xml
40 |
41 | # Translations
42 | *.mo
43 | *.pot
44 |
45 | # Django stuff:
46 | *.log
47 |
48 | #Pycharm
49 | .idea/
50 |
51 | # Sphinx documentation
52 | docs/_build/
53 |
54 | # PyBuilder
55 | target/
56 |
57 |
58 | ### https://raw.github.com/github/gitignore/master/C.gitignore
59 |
60 | # Object files
61 | *.o
62 | *.ko
63 | *.obj
64 | *.elf
65 |
66 | # Libraries
67 | *.lib
68 | *.a
69 | *.la
70 | *.lo
71 |
72 | # Shared objects (inc. Windows DLLs)
73 | *.dll
74 | *.so
75 | *.so.*
76 | *.dylib
77 |
78 | # Executables
79 | *.exe
80 | *.out
81 | *.app
82 | *.i*86
83 | *.x86_64
84 | *.hex
85 |
86 |
87 | ### https://raw.github.com/github/gitignore/master/Global/vim.gitignore
88 |
89 | [._]*.s[a-w][a-z]
90 | [._]s[a-w][a-z]
91 | *.un~
92 | Session.vim
93 | .netrwhist
94 | *~
95 |
--------------------------------------------------------------------------------
/.pep8:
--------------------------------------------------------------------------------
1 | [pep8]
2 | ignore = E501
3 |
--------------------------------------------------------------------------------
/.ycm_extra_conf.py:
--------------------------------------------------------------------------------
1 | import os
2 | import ycm_core
3 |
4 | flags = [
5 | '-Wall',
6 | '-Wextra',
7 | '-Werror',
8 | '-std=c99',
9 | '-x',
10 | 'c',
11 | '-Iccmpred/objfun/cd/cext'
12 | ]
13 |
14 |
15 | compilation_database_folder = ''
16 |
17 | if os.path.exists(compilation_database_folder):
18 | database = ycm_core.CompilationDatabase(compilation_database_folder)
19 | else:
20 | database = None
21 |
22 | SOURCE_EXTENSIONS = ['.cpp', '.cxx', '.cc', '.c', '.m', '.mm']
23 |
24 |
25 | def DirectoryOfThisScript():
26 | return os.path.dirname(os.path.abspath(__file__))
27 |
28 |
29 | def MakeRelativePathsInFlagsAbsolute(flags, working_directory):
30 | if not working_directory:
31 | return list(flags)
32 | new_flags = []
33 | make_next_absolute = False
34 | path_flags = ['-isystem', '-I', '-iquote', '--sysroot=']
35 | for flag in flags:
36 | new_flag = flag
37 |
38 | if make_next_absolute:
39 | make_next_absolute = False
40 | if not flag.startswith('/'):
41 | new_flag = os.path.join(working_directory, flag)
42 |
43 | for path_flag in path_flags:
44 | if flag == path_flag:
45 | make_next_absolute = True
46 | break
47 |
48 | if flag.startswith(path_flag):
49 | path = flag[len(path_flag):]
50 | new_flag = path_flag + os.path.join(working_directory, path)
51 | break
52 |
53 | if new_flag:
54 | new_flags.append(new_flag)
55 | return new_flags
56 |
57 |
58 | def IsHeaderFile(filename):
59 | extension = os.path.splitext(filename)[1]
60 | return extension in ['.h', '.hxx', '.hpp', '.hh']
61 |
62 |
63 | def GetCompilationInfoForFile(filename):
64 | # The compilation_commands.json file generated by CMake does not have entries
65 | # for header files. So we do our best by asking the db for flags for a
66 | # corresponding source file, if any. If one exists, the flags for that file
67 | # should be good enough.
68 | if IsHeaderFile(filename):
69 | basename = os.path.splitext(filename)[0]
70 | for extension in SOURCE_EXTENSIONS:
71 | replacement_file = basename + extension
72 | if os.path.exists(replacement_file):
73 | compilation_info = database.GetCompilationInfoForFile(
74 | replacement_file)
75 | if compilation_info.compiler_flags_:
76 | return compilation_info
77 | return None
78 | return database.GetCompilationInfoForFile(filename)
79 |
80 |
81 | def FlagsForFile(filename, **kwargs):
82 | if database:
83 | # Bear in mind that compilation_info.compiler_flags_ does NOT return a
84 | # python list, but a "list-like" StringVec object
85 | compilation_info = GetCompilationInfoForFile(filename)
86 | if not compilation_info:
87 | return None
88 |
89 | final_flags = MakeRelativePathsInFlagsAbsolute(
90 | compilation_info.compiler_flags_,
91 | compilation_info.compiler_working_dir_)
92 |
93 | else:
94 | relative_to = DirectoryOfThisScript()
95 | final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to)
96 |
97 | return {
98 | 'flags': final_flags,
99 | 'do_cache': True
100 | }
101 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # CCMgen and CCMpredPy
2 | [](https://dev.azure.com/christianroth0419/christianroth/_build/latest?definitionId=4&branchName=master) [](https://doi.org/10.1371/journal.pcbi.1006526)
3 |
4 | This repository provides a Python toolkit for learning second-order Markov Random Field (MRF) models from multiple sequence alignments of protein families and using these models for generating realistic synthetic protein sequences.
5 |
6 | CCMpredPy is a fast implementation of an evolutionary coupling method for learning a Markov Random Field (MRF) Model for a protein family. The parameters of the MRF can either be inferred by pseudo-likelihood maximization or with persistent contrastive divergence.
7 | While state-of-the-art pseudo-likelihood models have consistently been found to work best for the purpose of predicting residue-residue contacts, models learned with persistent contrastive divergence are much more accurate in their fine statistics and are recommended for the use with CCMgen to generate realistic sequence samples.
8 |
9 | CCMgen is a tool for sampling protein-like sequences from a second-order Markov Random Field (MRF) model, such as it can be learned with CCMpredPy. The residues of generated sequences will obey the selection pressures described by the MRF with pairwise statistical couplings between residue positions. Furthermore, CCMgen provides full control over the generation of the synthetic alignment by allowing to specify the evolutionary times and phylogeny along which the sequences are sampled.
10 |
11 | ## Citation
12 | Vorberg S, Seemayer S, Söding J. Synthetic protein alignments by CCMgen quantify noise in residue-residue contact prediction. PLoS computational biology. 2018 Nov 5;14(11):e1006526.
13 |
14 | ## License
15 |
16 | CCMgen and CCMpredPy are released under the [GNU AGPLv3](https://choosealicense.com/licenses/agpl-3.0/) license.
17 |
18 | ## Dependencies
19 |
20 | - CCMgen/CCMpredPy was developed and tested with Python 3.6
21 | - There are some C libraries to speed up crucial parts of the calculations that need to be compiled with a C compiler.
22 | Note: When installing on osx, make sure to use an appropriate gcc compiler and not clang, e.g. by setting `export CC=/usr/local/Cellar/gcc/X.X.X/bin/gcc-X` if gcc was installed via brew.
23 |
24 | The following Python packages are required
25 |
26 | * NumPy
27 | * SciPy
28 | * BioPython
29 | * MsgPack
30 | * six
31 | * plotly
32 | * colorlover
33 |
34 | ## Download
35 |
36 | ### Release Versions
37 | Please check out the [GitHub releases page for CCMgen](https://github.com/soedinglab/CCMgen/releases/tag/v1.0.0-alpha) to download a stable CCMgen/CCMpredPy release. After you're done downloading and extracting, please follow the installation instructions below.
38 |
39 | ### Development Versions from Git
40 |
41 | To clone the latest development version of CCMgen/CCMpredPy, please use the following command line:
42 |
43 | ```bash
44 | git clone https://github.com/soedinglab/ccmgen.git
45 | ```
46 |
47 | ## Installation
48 |
49 | ### From cloned/downloaded repository
50 |
51 | CCMgen/CCMpredPy can be installed from the main directory into your local Python environment via `pip`:
52 |
53 | ```bash
54 | pip install .
55 | ```
56 |
57 | ### Directly from Github Repository
58 |
59 | Alternatively, you can install the latest development version of CCMgen/CCMpredPy with `pip` directly from this repository:
60 |
61 | ```bash
62 | pip install git+https://github.com/soedinglab/ccmgen@master
63 | ```
64 | and keep updated with:
65 |
66 | ```bash
67 | pip install git+https://github.com/soedinglab/ccmgen@master --upgrade
68 | ```
69 | ## Uninstall
70 |
71 | The CCMgen/CCMpredPy toolkit can be uninstalled with:
72 |
73 | ```bash
74 | pip uninstall ccmgen
75 | ```
76 |
77 |
78 |
79 | ## Next Steps
80 | Now you're ready to use CCMgen and CCMpredPy! You can have a look at the [getting started guide](https://github.com/soedinglab/CCMgen/wiki/Getting-Started-with-CCMgen-and-CCMpredPy) to learn how to use both tools.
81 |
--------------------------------------------------------------------------------
/azure-pipelines.yml:
--------------------------------------------------------------------------------
1 | trigger:
2 | - master
3 | pr:
4 | - master
5 |
6 | jobs:
7 | - job:
8 | displayName: ubuntu-16.04
9 | pool:
10 | vmImage: 'ubuntu-16.04'
11 | strategy:
12 | matrix:
13 | Python36_bp_170:
14 | python.version: '3.6'
15 | biopython.version: '1.70'
16 | Python36:
17 | python.version: '3.6'
18 | biopython.version: '*'
19 | Python39:
20 | python.version: '3.9'
21 | biopython.version: '*'
22 |
23 | steps:
24 | - bash: echo "##vso[task.prependpath]$CONDA/bin"
25 | displayName: Add conda to PATH
26 |
27 | - bash: conda create --yes --quiet --name env
28 | displayName: Create Anaconda environment
29 |
30 | - bash: |
31 | source activate env
32 | conda install --yes --quiet --name env -c conda-forge python=$PYTHON_VERSION biopython=$BIOPYTHON_VERSION pip numpy c-compiler openmp
33 | pip install .
34 | displayName: Install CCMgen
35 | - bash: |
36 | source activate env
37 | bash ci_support/run_tests.sh && test -f sequences.msa
38 | displayName: Run tests
39 |
40 | - job:
41 | displayName: macOS 10.14
42 | pool:
43 | vmImage: 'macOS-10.14'
44 | strategy:
45 | matrix:
46 | Python36_bp_170:
47 | python.version: '3.6'
48 | biopython.version: '1.70'
49 | Python36:
50 | python.version: '3.6'
51 | biopython.version: '*'
52 | Python39:
53 | python.version: '3.9'
54 | biopython.version: '*'
55 |
56 | steps:
57 | - bash: echo "##vso[task.prependpath]$CONDA/bin"
58 | displayName: Add conda to PATH
59 |
60 | - bash: conda create --yes --quiet --name env
61 | displayName: Create Anaconda environment
62 |
63 | - bash: |
64 | source activate env
65 | conda install --yes --quiet --name env -c conda-forge python=$PYTHON_VERSION biopython=$BIOPYTHON_VERSION pip numpy c-compiler openmp
66 | pip install .
67 | displayName: Install CCMgen
68 | - bash: |
69 | source activate env
70 | bash ci_support/run_tests.sh && test -f sequences.msa
71 | displayName: Run tests
--------------------------------------------------------------------------------
/ccmpred/algorithm/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/algorithm/__init__.py
--------------------------------------------------------------------------------
/ccmpred/algorithm/gradient_descent.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ccmpred.logo
3 | import ccmpred.monitor.progress as pr
4 |
5 |
6 | class gradientDescent():
7 | """Optimize objective function using gradient descent"""
8 |
9 | def __init__(self, progress, neff, maxit=2000, alpha0=0,
10 | decay=True, decay_start=1e-1, decay_rate=5e-6, decay_type="sig",
11 | fix_v=True, epsilon=1e-8, convergence_prev=5, early_stopping=True,
12 | non_contact_indices=None):
13 |
14 |
15 | self.maxit = maxit
16 | self.alpha0 = alpha0
17 |
18 | #initial learning rate defined wrt to effective number of sequences
19 | if self.alpha0 == 0:
20 | self.alpha0 = 5e-2 / np.sqrt(neff)
21 |
22 | #decay settings
23 | self.decay=decay
24 | self.decay_start = decay_start
25 | self.decay_rate = np.float(decay_rate)
26 | self.decay_type = decay_type
27 | self.it_succesfull_stop_condition=-1
28 |
29 | #single potentials will not be optimized if fix_v=True
30 | self.fix_v=fix_v
31 |
32 | #convergence settings for optimization
33 | self.early_stopping = early_stopping
34 | self.epsilon = epsilon
35 | self.convergence_prev=convergence_prev
36 |
37 | #whether optimization is run with constraints (non-contacts are masked)
38 | self.non_contact_indices = non_contact_indices
39 |
40 | #optimization progress logger
41 | self.progress = progress
42 |
43 |
44 |
45 | def __repr__(self):
46 | rep_str="Gradient descent optimization (alpha0={0})\n".format( np.round(self.alpha0, decimals=8))
47 |
48 | rep_str+="\tconvergence criteria: maxit={0} early_stopping={1} epsilon={2} prev={3}\n".format(
49 | self.maxit, self.early_stopping, self.epsilon, self.convergence_prev)
50 |
51 | if self.decay:
52 | rep_str+="\tdecay: decay_type={0} decay_rate={1} decay_start={2} \n".format(
53 | self.decay_type, np.round(self.decay_rate, decimals=8), self.decay_start
54 | )
55 | else:
56 | rep_str+="no decay\n"
57 |
58 | return rep_str
59 |
60 | def minimize(self, objfun, x):
61 |
62 | subtitle = self.progress.title + self.__repr__().replace("\n", "<br>")
63 | subtitle += objfun.__repr__().replace("\n", "<br>")
64 | self.progress.set_plot_title(subtitle)
65 |
66 | ret = {
67 | "code": 2,
68 | "message": "Reached maximum number of iterations",
69 | "num_iterations": self.maxit
70 | }
71 |
72 | fx = -1
73 | alpha = self.alpha0
74 | persistent=False
75 | for i in range(self.maxit):
76 |
77 | #in case CD has property persistent=True
78 | #turn on persistent CD when learning rate is small enough
79 | if objfun.persistent and alpha < self.alpha0/10:
80 | persistent=True
81 |
82 | fx, gx, greg = objfun.evaluate(x, persistent)
83 | g = gx + greg
84 |
85 | #decompose gradients and parameters
86 | x_single, x_pair = objfun.linear_to_structured(x)
87 | g_single, g_pair = objfun.linear_to_structured(g)
88 | gx_single, gx_pair = objfun.linear_to_structured(gx)
89 | g_reg_single, g_reg_pair = objfun.linear_to_structured(greg)
90 |
91 | #masking: set coupling gradients for all pairs (i,j) with d_ij > contact_thr = 0
92 | if self.non_contact_indices is not None:
93 | g_pair[self.non_contact_indices[0], self.non_contact_indices[1], :, :] = 0
94 |
95 |
96 | #compute norm of coupling parameters
97 | xnorm_pair = np.sqrt(np.sum(x_pair * x_pair)/2)
98 |
99 | if i > self.convergence_prev:
100 | xnorm_prev = self.progress.optimization_log['||w||'][-self.convergence_prev]
101 | xnorm_diff = np.abs((xnorm_prev - xnorm_pair)) / xnorm_prev
102 | else:
103 | xnorm_diff = 1.0
104 |
105 | #start decay at iteration i
106 | if self.decay and xnorm_diff < self.decay_start and self.it_succesfull_stop_condition < 0:
107 | self.it_succesfull_stop_condition = i
108 |
109 | #new step size
110 | if self.it_succesfull_stop_condition > 0:
111 | t = i - self.it_succesfull_stop_condition + 1
112 | if self.decay_type == "lin":
113 | alpha = self.alpha0 / (1 + self.decay_rate * t)
114 | if self.decay_type == "sig":
115 | alpha *= 1.0 / (1 + self.decay_rate * t)
116 | if self.decay_type == "sqrt":
117 | alpha = self.alpha0 / np.sqrt(1 + self.decay_rate * t)
118 | if self.decay_type == "exp":
119 | alpha = self.alpha0 * np.exp(- self.decay_rate * t)
120 |
121 |
122 | #print out progress
123 | log_metrics={}
124 | log_metrics['||w||'] = xnorm_pair
125 | log_metrics['||g||'] = np.sqrt(np.sum(g_pair * g_pair)/2)
126 | log_metrics['||g_w||'] = np.sqrt(np.sum(gx_pair * gx_pair)/2)
127 | log_metrics['||greg_w||'] = np.sqrt(np.sum(g_reg_pair * g_reg_pair)/2)
128 | log_metrics['xnorm_diff'] = xnorm_diff
129 | log_metrics['max_g'] = np.max(np.abs(gx))
130 | log_metrics['alpha'] = alpha
131 | log_metrics['PCD'] = persistent
132 |
133 | if not self.fix_v:
134 | log_metrics['||v||'] = np.sqrt(np.sum(x_single * x_single))
135 | log_metrics['||v+w||'] = np.sqrt(np.sum(x * x))
136 | log_metrics['||g_v||'] = np.sqrt(np.sum(gx_single * gx_single))
137 | log_metrics['||g||'] = np.sqrt(np.sum(gx * gx))
138 | log_metrics['||g_reg_v||'] = np.sqrt(np.sum(g_reg_single * g_reg_single))
139 |
140 | self.progress.log_progress(i + 1, **log_metrics)
141 |
142 |
143 | #stop condition
144 | if self.early_stopping:
145 | if xnorm_diff < self.epsilon:
146 |
147 | ret = {
148 | "code": 0,
149 | "message": "Stopping condition (xnorm diff < {0}) successfull.".format(self.epsilon),
150 | "num_iterations": i
151 | }
152 | return fx, x, ret
153 |
154 | # update parameters
155 | if not self.fix_v:
156 | x_single -= alpha * g_single
157 | x_pair -= alpha * g_pair
158 |
159 | x = objfun.structured_to_linear(x_single, x_pair)
160 |
161 | return fx, x, ret
162 |
163 | def get_parameters(self):
164 | parameters={}
165 |
166 | parameters['convergence'] = {}
167 | parameters['convergence']['maxit'] = self.maxit
168 | parameters['convergence']['early_stopping'] = self.early_stopping
169 | parameters['convergence']['epsilon'] = self.epsilon
170 | parameters['convergence']['convergence_prev'] = self.convergence_prev
171 |
172 | parameters['decay'] = {}
173 | parameters['decay']['alpha0'] = self.alpha0
174 | parameters['decay']['decay'] = self.decay
175 | parameters['decay']['decay_start'] = self.decay_start
176 | parameters['decay']['decay_rate'] = self.decay_rate
177 | parameters['decay']['decay_type'] = self.decay_type
178 |
179 | parameters['fix_v'] = self.fix_v
180 |
181 | return parameters
182 |
--------------------------------------------------------------------------------
/ccmpred/algorithm/lbfgs.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ccmpred.monitor.progress as pr
3 | from scipy.optimize import minimize as min
4 |
5 | class LBFGS(object):
6 | """Optimize objective function using lbfgs"""
7 |
8 | def __init__(self, progress, maxit=100, ftol=1e-4, max_linesearch=20, maxcor=5, non_contact_indices=None):
9 |
10 | self.max_linesearch=max_linesearch
11 | self.ftol = ftol
12 | self.maxit = maxit
13 | self.maxcor = maxcor
14 |
15 | # whether optimization is run with constraints (non-contacts are masked)
16 | self.non_contact_indices = non_contact_indices
17 |
18 | # optimization progress logger
19 | self.progress = progress
20 |
21 | self.g_x = None
22 | self.objfun=None
23 | self.iteration=0
24 |
25 |
26 | def __repr__(self):
27 |
28 | repr_str = "LBFGS optimization (ftol={0}, maxcor={1}, max_ls={2})\n".format(
29 | self.ftol,self.maxcor,self.max_linesearch)
30 | repr_str += "\tconvergence criteria: maxit={0} \n".format(self.maxit)
31 |
32 | return repr_str
33 |
34 | def lbfgs_f(self, x):
35 |
36 | fx, g_x, g_reg = self.objfun.evaluate(x)
37 |
38 | #gradient is computed x 2 in pll.evaluate because of compatibility with conjugate gradient optimization!!
39 | g_x_single, g_x_pair = self.objfun.linear_to_structured(g_x)
40 | g_reg_single, g_reg_pair = self.objfun.linear_to_structured(g_reg)
41 | g = self.objfun.structured_to_linear(g_x_single+g_reg_single, (g_x_pair+g_reg_pair)/2)
42 |
43 | # masking: set coupling gradients for all pairs (i,j) with d_ij > contact_thr = 0
44 | if self.non_contact_indices is not None:
45 | g_single, g_pair = self.objfun.linear_to_structured(g)
46 | g_pair[self.non_contact_indices[0], self.non_contact_indices[1], :, :] = 0
47 | g = self.objfun.structured_to_linear(g_single, g_pair)
48 |
49 | return fx, g
50 |
51 | def print_and_plot(self, x):
52 |
53 | self.iteration += 1
54 |
55 | x_single, x_pair = self.objfun.finalize(x)
56 |
57 | log_metrics={}
58 | log_metrics['||v+w||'] = np.sqrt(np.sum(x_single * x_single) + np.sum(x_pair * x_pair)/2)
59 | log_metrics['||v||'] = np.sqrt(np.sum(x_single * x_single))
60 | log_metrics['||w||'] = np.sqrt(np.sum(x_pair * x_pair)/2)
61 | self.progress.log_progress(self.iteration, **log_metrics)
62 |
63 | def minimize(self, objfun, x):
64 |
65 | self.objfun = objfun
66 |
67 | subtitle = self.progress.title + self.__repr__().replace("\n", "<br>")
68 | subtitle += objfun.__repr__().replace("\n", "<br>")
69 | self.progress.set_plot_title(subtitle)
70 |
71 | res = min(self.lbfgs_f,
72 | x,
73 | method='L-BFGS-B',
74 | jac=True,
75 | options={
76 | 'maxls': self.max_linesearch,
77 | 'gtol': 1e-05,
78 | 'eps': 1e-08,
79 | 'maxiter': self.maxit,
80 | 'ftol': self.ftol,
81 | 'maxfun': 15000,
82 | 'maxcor': self.maxcor,
83 | 'disp': False
84 | },
85 | callback=self.print_and_plot
86 | )
87 |
88 |
89 | ret = {
90 | "code": res.status,
91 | "message": res.message.decode("utf-8"),
92 | "num_iterations": res.nit
93 | }
94 |
95 | return res.fun, res.x, ret
96 |
97 | def get_gradient_x(self):
98 |
99 | return(self.g_x)
100 |
101 | def get_parameters(self):
102 | parameters={}
103 |
104 | parameters['convergence']={}
105 | parameters['convergence']['maxit'] = self.maxit
106 | parameters['convergence']['max_linesearch'] = self.max_linesearch
107 | parameters['convergence']['ftol'] = self.ftol
108 |
109 |
110 | return parameters
--------------------------------------------------------------------------------
/ccmpred/centering.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def center_v(freqs):
4 | single_freqs, _ = freqs
5 |
6 | #single_freqs either normalized with or without gaps --> same result due to subtraction of mean
7 |
8 |
9 | #hack when using no pseudo counts to be able to take log of zero counts
10 | eps = 1e-10
11 | single_freqs[single_freqs < eps] = eps
--------------------------------------------------------------------------------
/ccmpred/counts/msacounts.c:
--------------------------------------------------------------------------------
1 | #include <stdio.h>
2 | #include <stdlib.h>
3 | #include <stdint.h>
4 | #include <string.h>
5 | #include <ctype.h>
6 | #include <omp.h>
7 |
8 | #define N_ALPHA 21
9 |
10 | void msa_count_single(double *counts, uint8_t *msa, double *weights, uint32_t nrow, uint32_t ncol) {
11 | int n, i;
12 | unsigned char a;
13 |
14 | memset(counts, 0, sizeof(double) * ncol * N_ALPHA);
15 |
16 | for(n = 0; n < nrow; n++) {
17 | for(i = 0; i < ncol; i++) {
18 | a = msa[n * ncol + i];
19 | counts[i * N_ALPHA + a] += weights[n];
20 | }
21 | }
22 | }
23 |
24 |
25 | void msa_count_pairs(double *counts, uint8_t *msa, double *weights, uint32_t nrow, uint32_t ncol) {
26 | memset(counts, 0, sizeof(double) * ncol * ncol * N_ALPHA * N_ALPHA);
27 |
28 | #pragma omp parallel
29 | #pragma omp for nowait
30 | for(int ij = 0; ij < ncol * ncol; ij++) {
31 | int i = ij / ncol;
32 | int j = ij % ncol;
33 | for(int n = 0; n < nrow; n++) {
34 |
35 | unsigned char a = msa[n * ncol + i];
36 | unsigned char b = msa[n * ncol + j];
37 | counts[((i * ncol + j) * N_ALPHA + a) * N_ALPHA + b] += weights[n];
38 | }
39 | }
40 | }
41 |
42 | void msa_char_to_index(uint8_t *msa, uint32_t nrow, uint32_t ncol) {
43 |
44 | int amino_indices[29];
45 | int n, i;
46 | unsigned char c;
47 |
48 | // Make hash lookup table for amino acid characters to amino acid numbers
49 | // hash keys are the ASCII codes of the upper-case amino acids, modulo 29.
50 | // hash values are the amino acid numbers.
51 | //
52 | // aa A R N D C Q E G H I L K M F P S T W Y V -
53 | // asc 65 82 78 68 67 81 69 71 72 73 76 75 77 70 80 83 84 87 89 86 45
54 | // mod 7 24 20 10 9 23 11 13 14 15 18 17 19 12 22 25 26 0 2 28 16
55 | for(c = 0; c < 29; c++) {
56 | amino_indices[c] = 20;
57 | }
58 |
59 | amino_indices[ 7] = 0; // A
60 | amino_indices[24] = 1; // R
61 | amino_indices[20] = 2; // N
62 | amino_indices[10] = 3; // D
63 | amino_indices[ 9] = 4; // C
64 | amino_indices[23] = 5; // Q
65 | amino_indices[11] = 6; // E
66 | amino_indices[13] = 7; // G
67 | amino_indices[14] = 8; // H
68 | amino_indices[15] = 9; // I
69 | amino_indices[18] = 10; // L
70 | amino_indices[17] = 11; // K
71 | amino_indices[19] = 12; // M
72 | amino_indices[12] = 13; // F
73 | amino_indices[22] = 14; // P
74 | amino_indices[25] = 15; // S
75 | amino_indices[26] = 16; // T
76 | amino_indices[ 0] = 17; // W
77 | amino_indices[ 2] = 18; // Y
78 | amino_indices[28] = 19; // V
79 | amino_indices[16] = 20; // -
80 |
81 | for(n = 0; n < nrow; n++) {
82 | for(i = 0; i < ncol; i++) {
83 | msa[n * ncol + i] = amino_indices[ toupper(msa[n * ncol + i]) % 29 ];
84 | }
85 | }
86 |
87 | }
88 |
89 |
90 | void msa_index_to_char(uint8_t *msa, uint32_t nrow, uint32_t ncol) {
91 | uint8_t char_indices[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-' };
92 | int n, i;
93 |
94 | for(n = 0; n < nrow; n++) {
95 | for(i = 0; i < ncol; i++) {
96 | msa[n * ncol + i] = char_indices[msa[n * ncol + i]];
97 | }
98 | }
99 | }
100 |
--------------------------------------------------------------------------------
/ccmpred/gaps/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ccmpred.counts
3 |
4 | from ccmpred.gaps.cext import remove_gaps_probs, remove_gaps_consensus
5 |
6 |
7 | def remove_gaps_col_freqs(msa):
8 | counts = ccmpred.counts.single_counts(msa)
9 | counts[:, 20] = 0
10 |
11 | counts /= np.sum(counts, axis=1)[:, np.newaxis]
12 |
13 | return remove_gaps_probs(msa, counts)
14 |
15 |
16 | def backinsert_gapped_positions_aln(msa, gapped_positions):
17 |
18 | for position in gapped_positions:
19 | msa = np.insert(msa, position, [20], axis=1)
20 |
21 | return msa
22 |
23 | def backinsert_gapped_positions_mat(mat, gapped_positions):
24 |
25 | for position in gapped_positions:
26 | mat = np.insert(mat, position, [0], axis=0)
27 | mat = np.insert(mat, position, [0], axis=1)
28 |
29 | return mat
30 |
31 | def backinsert_gapped_positions(x_single, x_pair, gapped_positions):
32 |
33 | for position in gapped_positions:
34 | x_single = np.insert(x_single,position, [0], axis=0)
35 | x_pair = np.insert(x_pair,position, [0], axis=0)
36 | x_pair = np.insert(x_pair,position, [0], axis=1)
37 |
38 | return x_single, x_pair
39 |
40 |
41 | def remove_gapped_sequences(msa, max_gap_seq):
42 |
43 | if max_gap_seq >= 100:
44 | return msa
45 |
46 | msa_gap_count_per_sequence = (msa == 20).sum(1)
47 |
48 | #how many positions per sequence are allowed to contain gaps?
49 | max_gap_percentage_per_sequence = ((max_gap_seq / 100.0) * msa.shape[1])
50 |
51 | high_coverage = np.where(msa_gap_count_per_sequence < max_gap_percentage_per_sequence)
52 |
53 | print("Removed {0} sequences with > {1} percent gaps.".format(
54 | msa.shape[0] - len(high_coverage[0]), max_gap_seq/100.0))
55 |
56 | return np.ascontiguousarray(msa[high_coverage[0], :])
57 |
58 | def remove_gapped_positions(msa, max_gap_percentage):
59 |
60 | if max_gap_percentage >= 100:
61 | return msa, []
62 |
63 | msa_gap_counts = (msa == 20).sum(0)
64 |
65 | max_gap_count = ((max_gap_percentage/100.0) * msa.shape[0])
66 |
67 | ungapped_positions = np.where(msa_gap_counts < max_gap_count)
68 | gapped_positions = np.where(msa_gap_counts >= max_gap_count)
69 |
70 |
71 | print("Removed {0} alignment positions with > {1} percent gaps.".format(
72 | len(gapped_positions[0]), max_gap_percentage/100.0))
73 |
74 | return np.ascontiguousarray(msa[:, ungapped_positions[0]]), gapped_positions[0]
--------------------------------------------------------------------------------
/ccmpred/gaps/cext/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.ctypeslib as npct
3 | import ctypes
4 | import os.path
5 |
6 | import ccmpred.counts
7 |
8 | array_2d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=2, flags='CONTIGUOUS')
9 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')
10 | array_1d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=1, flags='CONTIGUOUS')
11 |
12 | libgaps = npct.load_library('libgaps', os.path.join(os.path.dirname(__file__), '_build'))
13 |
14 | libgaps.remove_gaps_probs.restype = None
15 | libgaps.remove_gaps_probs.argtypes = [
16 | array_2d_float, # *x
17 | array_2d_char, # *msa
18 | ctypes.c_uint32, # nrow
19 | ctypes.c_uint32, # ncol
20 | ]
21 |
22 |
23 | libgaps.remove_gaps_consensus.restype = None
24 | libgaps.remove_gaps_consensus.argtypes = [
25 | array_2d_char, # *msa
26 | array_1d_char, # *consensus
27 | ctypes.c_uint32, # nrow
28 | ctypes.c_uint32, # ncol
29 | ]
30 |
31 |
32 | def compute_consensus(msa, ignore_gaps=True):
33 | counts = ccmpred.counts.single_counts(msa)
34 | if ignore_gaps:
35 | counts = counts[:, :20]
36 |
37 | return np.argmax(counts, axis=1).astype('uint8')
38 |
39 |
40 | def remove_gaps_probs(msa, probs):
41 | assert(probs.shape[0] == msa.shape[1])
42 | libgaps.remove_gaps_probs(np.ascontiguousarray(probs), msa, *msa.shape)
43 | return msa
44 |
45 |
46 | def remove_gaps_consensus(msa, consensus=None):
47 | if not consensus:
48 | consensus = compute_consensus(msa)
49 |
50 | assert(consensus.shape[0] == msa.shape[1])
51 | libgaps.remove_gaps_consensus(msa, consensus, *msa.shape)
52 |
53 | return msa
54 |
--------------------------------------------------------------------------------
/ccmpred/gaps/cext/gaps.c:
--------------------------------------------------------------------------------
1 | #include <stdlib.h>
2 |
3 | #include "gaps.h"
4 |
5 | int pick_random_weighted(flt *probs, int n) {
6 | int a;
7 | double p = (double)rand() / (double)RAND_MAX;
8 | for (a = 0; a < n; a++) {
9 | flt p_curr = probs[a];
10 | if (p < p_curr) {
11 | return a;
12 | }
13 | p -= p_curr;
14 | }
15 | return n - 1;
16 | }
17 |
18 |
19 | /**
20 | * substitute gaps in the sequence according to probability
21 | *
22 | * @param[in] p The MSA probabilities
23 | * @param[inout] msa The MSA to clean
24 | * @param[in] nrow The number of rows
25 | * @param[in] ncol The number of columns
26 | */
27 | void remove_gaps_probs(
28 | const flt *const p,
29 | unsigned char *const msa,
30 | int nrow,
31 | int ncol
32 | ) {
33 | int i, j;
34 | for(i = 0; i < nrow; i++) {
35 | for (j = 0; j < ncol; j++) {
36 | if (msa[i * ncol + j] != GAP) continue;
37 |
38 | msa[i * ncol + j] = pick_random_weighted((flt *)&p[j * N_ALPHA], N_ALPHA);
39 | }
40 | }
41 | }
42 |
43 | /**
44 | * remove gaps according to consensus sequence
45 | *
46 | * @param[inout] msa the MSA to clean (nrow x ncol)
47 | * @param[in] The consensus sequence to use as a replacement (ncol)
48 | * @param[in] nrow The number of rows
49 | * @param[in] ncol The number of columns
50 | */
51 | void remove_gaps_consensus(
52 | unsigned char *const msa,
53 | unsigned char *const consensus,
54 | int nrow,
55 | int ncol
56 | ) {
57 | int i, j;
58 | for(i = 0; i < nrow; i++) {
59 | for(j = 0; j < ncol; j++) {
60 | if(msa[i * ncol + j] != GAP) continue;
61 | msa[i * ncol + j] = consensus[j];
62 | }
63 | }
64 | }
65 |
--------------------------------------------------------------------------------
/ccmpred/gaps/cext/gaps.h:
--------------------------------------------------------------------------------
#ifndef GAP_H
#define GAP_H

/* index of the gap state in the 21-state alphabet
 * (20 amino acids at indices 0..19, gap at index 20) */
#define GAP 20
#define N_ALPHA 21
/* floating point type used for the probability arrays passed from Python */
typedef double flt;

/* replace every GAP cell of the nrow x ncol alignment `msa`, in place,
 * by sampling from the per-column probabilities p (ncol x N_ALPHA) */
void remove_gaps_probs(
    const flt *const p,
    unsigned char *const msa,
    int nrow,
    int ncol
);

/* replace every GAP cell of `msa`, in place, with the column's consensus
 * character (consensus has length ncol) */
void remove_gaps_consensus(
    unsigned char *const msa,
    unsigned char *const consensus,
    int nrow,
    int ncol
);

#endif
--------------------------------------------------------------------------------
/ccmpred/io/__init__.py:
--------------------------------------------------------------------------------
1 | from ccmpred.io.alignment import read_msa, read_msa_biopython, read_msa_psicov, write_msa_psicov, AMINO_ACIDS
2 | from ccmpred.io.contactmatrix import write_matrix
3 | from ccmpred.io.pdb import distance_map
--------------------------------------------------------------------------------
/ccmpred/io/alignment.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ccmpred.counts
3 | import Bio.AlignIO as aio
4 |
5 | AMINO_ACIDS = "ARNDCQEGHILKMFPSTWYV-"
6 |
def read_msa(f, format, return_indices=True, return_identifiers=False):
    """Read an alignment, dispatching to the PSICOV or Biopython reader by `format`."""
    if format != 'psicov':
        return read_msa_biopython(f, format, return_indices, return_identifiers)
    return read_msa_psicov(f, return_indices, return_identifiers)
12 |
def read_msa_biopython(f, format, return_indices=True, return_identifiers=False):
    """
    Read an alignment via Bio.AlignIO.

    :param f: file path or handle
    :param format: any alignment format understood by Bio.AlignIO
    :param return_indices: convert characters to alphabet indices in place
    :param return_identifiers: additionally return the record names
    :return: uint8 matrix (N x L), optionally with the list of identifiers
    """
    records = list(aio.read(f, format))

    rows = [[ord(c) for c in str(record.seq).strip()] for record in records]
    msa = np.array(rows, dtype=np.uint8)

    if return_indices:
        ccmpred.counts.index_msa(msa, in_place=True)

    if return_identifiers:
        return msa, [record.name for record in records]
    return msa
28 |
def read_msa_psicov(f, return_indices=True, return_identifiers=False):
    """
    Read a PSICOV-format alignment (one plain sequence per line).

    :param f: file path or an iterable of lines
    :param return_indices: convert characters to alphabet indices in place
    :param return_identifiers: additionally return generated ids seq0, seq1, ...
    :return: uint8 matrix (N x L), optionally with the list of identifiers
    :raises Exception: if a line contains '>' (looks like FASTA, not PSICOV)
    """
    if isinstance(f, str):
        with open(f, 'r') as handle:
            lines = handle.readlines()
    else:
        lines = f

    for lineno, line in enumerate(lines):
        if ">" in line:
            raise Exception("Line number {0} contains a '>' - please set the correct alignment format!:\n{1}".format(lineno + 1, line))

    msa = np.array([[ord(c) for c in line.strip()] for line in lines], dtype=np.uint8)

    if return_indices:
        ccmpred.counts.index_msa(msa, in_place=True)

    if not return_identifiers:
        return msa
    identifiers = ["seq{0}".format(i) for i in range(msa.shape[0])]
    return msa, identifiers
51 |
52 |
def write_msa(f, msa, ids, format, is_indices=True, descriptions=None):
    """Write an alignment, dispatching to the PSICOV or Biopython writer by `format`."""
    if format == 'psicov':
        write_msa_psicov(f, msa, is_indices=is_indices)
        return
    write_msa_biopython(f, msa, ids, format, is_indices=is_indices, descriptions=descriptions)
59 |
def write_msa_psicov(f, msa, is_indices=True):
    """
    Write an alignment in PSICOV format (one plain sequence per line).

    :param f: writable text handle
    :param msa: alignment matrix; index-encoded unless is_indices is False
    :param is_indices: convert alphabet indices back to characters first
    """
    if is_indices:
        msa = ccmpred.counts.char_msa(msa)

    rows = ["".join(chr(cell) for cell in row) for row in msa]
    f.write("\n".join(rows))
66 |
def write_msa_biopython(f, msa, ids, format, is_indices=True, descriptions=None):
    """
    Write an alignment via Bio.SeqIO in any Biopython-supported format.

    :param f: file path or writable handle
    :param msa: alignment matrix; index-encoded unless is_indices is False
    :param ids: record identifier per sequence
    :param format: Bio.SeqIO output format name
    :param is_indices: convert alphabet indices back to characters first
    :param descriptions: optional per-record description strings
    """
    import Bio.SeqIO
    from Bio.SeqRecord import SeqRecord
    from Bio.Seq import Seq

    if is_indices:
        msa = ccmpred.counts.char_msa(msa)

    if descriptions is None:
        descriptions = [""] * msa.shape[0]

    sequences = ["".join(chr(c) for c in row) for row in msa]

    records = []
    for seq, seq_id, desc in zip(sequences, ids, descriptions):
        records.append(
            SeqRecord(Seq(seq), id=seq_id, description=desc,
                      annotations={"molecule_type": "protein"})
        )

    Bio.SeqIO.write(records, f, format)
87 |
--------------------------------------------------------------------------------
/ccmpred/io/contactmatrix.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import json
3 | import gzip
4 | import os
5 | import sys
6 |
def frobenius_score(x):
    """
    Compute the Frobenius norm of each 20x20 (or 21x21) coupling matrix.

    :param x: pair potentials of dimension [ L x L x 20 x 20 ]
    :return: L x L matrix of norms
    """
    squared = np.square(x)
    return np.sqrt(squared.sum(axis=(2, 3)))
17 |
def apc(cmat):
    """
    Apply the average product correction (APC) of Dunn et al 2004.

    :param cmat: contact matrix (L x L)
    :return: corrected contact matrix
    """
    print("Apply Average Product Correction (APC)")

    col_means = cmat.mean(axis=0)
    overall_mean = cmat.mean()
    correction = np.outer(col_means, col_means) / overall_mean

    return cmat - correction
31 |
def compute_scaling_factor(x_pair, uij, nr_states, squared=True):
    """
    Determine the strength eta of the entropy correction by least squares,
    minimizing sum_i,j sum_a,b (w_ijab^2 - eta * u_ia * u_jb)^2.

    :param x_pair: raw coupling scores (L x L x >=20 x >=20)
    :param uij: entropy terms (L x L x nr_states x nr_states)
    :param nr_states: normalize entropy wrt 20 or 21 characters
    :param squared: optimize against squared couplings rather than norms
    :return: scalar scaling factor eta
    """
    coupling_norms_sq = np.sum(np.square(x_pair[:, :, :20, :20]), axis=(2, 3))

    if squared:
        entropy_sums = np.sum(uij[:, :, :nr_states, :nr_states], axis=(2, 3))
        numerator = np.sum(coupling_norms_sq * entropy_sums)
        # note: denominator deliberately uses the full uij tensor
        return numerator / np.sum(uij * uij)

    # According to Stefan's CCMgen paper: both c_ij and e_ij are L x L
    c_ij = np.sqrt(coupling_norms_sq)
    e_ij = np.sqrt(np.sum(uij[:, :, :nr_states, :nr_states], axis=(2, 3)))

    numerator = np.sum(c_ij * e_ij)
    return numerator / np.sum(uij[:, :, :nr_states, :nr_states])
67 |
def compute_local_correction(
        single_freq, x_pair, Neff, lambda_w, squared=True,
        entropy=False, nr_states=20, log=np.log2):
    """
    Apply a local entropy (or count) correction to the coupling scores.

    :param single_freq: single-site frequencies (L x >=nr_states)
    :param x_pair: pair potentials (L x L x 20 x 20 or larger)
    :param Neff: effective number of sequences
    :param lambda_w: pair regularization coefficient
    :param squared: correct squared scores instead of Frobenius norms
    :param entropy: use p*log(p) terms instead of p*(1-p) count terms
    :param nr_states: number of states used for the correction (20 or 21)
    :param log: logarithm used for the entropy terms
    :return: (scaling factor, corrected L x L score matrix)
    """
    print("Apply entropy correction (using {0} states and {1})".format(nr_states, log.__name__))

    freq = single_freq[:, :nr_states]
    if entropy:
        ui = freq * log(freq)
    else:
        # correct for fractional counts
        ui = np.sqrt(Neff) * (1.0 / lambda_w) * freq * (1 - freq)
    uij = np.transpose(np.multiply.outer(ui, ui), (0, 2, 1, 3))

    ### compute optimal scaling factor
    eta = compute_scaling_factor(x_pair, uij, nr_states, squared=squared)

    if squared:
        mat = np.sum(x_pair * x_pair, axis=(2, 3))
        correction = eta * np.sum(uij, axis=(2, 3))
    else:
        mat = frobenius_score(x_pair)
        correction = eta * np.sqrt(np.sum(uij, axis=(2, 3)))

    return eta, mat - correction
95 |
96 |
def write_matrix(matfile, mat, meta):
    """
    Write a score matrix with np.savetxt, followed by a '#>META> <json>' line.

    :param matfile: output path; gzip-compressed when it ends with '.gz'
    :param mat: matrix to write
    :param meta: JSON-serializable metadata dict
    """
    meta_line = "#>META> " + json.dumps(meta) + "\n"

    if matfile.endswith(".gz"):
        # bug fix: the file must be opened in TEXT mode ('wt') - both
        # np.savetxt and the meta line write str, which raises TypeError
        # on a binary ('wb') gzip handle under Python 3
        with gzip.open(matfile, 'wt') as f:
            np.savetxt(f, mat)
            f.write(meta_line)
    else:
        np.savetxt(matfile, mat)
        # re-open in append mode to add the metadata line after the matrix
        with open(matfile, 'a') as f:
            f.write(meta_line)
109 |
def read_matrix(matfile):
    """
    Read a matrix file and its trailing '#>META>' JSON metadata line.

    :param matfile: path to matrix file (optionally gzip-compressed, '.gz')
    :return: (matrix, meta dict; empty dict when no META line is present)
    :raises IOError: if the file does not exist
    """

    if not os.path.exists(matfile):
        raise IOError("Matrix File " + str(matfile) + "cannot be found. ")


    ### Read contact map (matfile can also be compressed file)
    mat = np.genfromtxt(matfile, comments="#")

    ### Read meta data from mat file
    # bug fix: a '.gz' file must be opened with gzip in text mode -
    # plain open() returned undecodable compressed bytes
    meta = {}
    open_fn = gzip.open if matfile.endswith(".gz") else open
    with open_fn(matfile, 'rt') as f:
        for line in f:
            if '#>META>' in line:
                meta = json.loads(line.split("> ")[1])

    if len(meta) == 0:
        print(str(matfile) + " does not contain META info. (Line must start with #META!)")

    return mat, meta
135 |
def find_dict_key(key, dictionary):
    """
    Depth-first search for `key` in a nested structure of dicts and lists.

    :param key: key to look up
    :param dictionary: dict whose values may be dicts or lists of dicts/lists
    :return: the first value found for `key`, or None if absent
        (note: a stored value of None is indistinguishable from "not found")
    """
    for k, v in dictionary.items():
        if k == key:
            return v
        if isinstance(v, dict):
            found = find_dict_key(key, v)
            if found is not None:
                return found
        elif isinstance(v, list):
            for item in v:
                if isinstance(item, (list, dict)):
                    found = find_dict_key(key, item)
                    if found is not None:
                        return found

    return None
153 |
--------------------------------------------------------------------------------
/ccmpred/io/pdb.py:
--------------------------------------------------------------------------------
1 | from Bio.PDB import PDBParser
2 | import numpy as np
3 |
def read_pdb(pdbfile):
    '''
    Parse a PDB file into a Bio.PDB structure object.

    :param pdbfile: path to pdb file
    :return: Bio.PDB structure
    '''
    return PDBParser().get_structure('pdb', pdbfile)
16 |
def calc_residue_dist(residue_one, residue_two, distance_definition="Cb"):
    '''
    Compute the Euclidean distance between two residues.

    For distance_definition == "Cb", the distance between the C-beta atoms
    is used (falling back to C-alpha when C-beta is missing, e.g. Glycine).
    Otherwise the minimum distance over all side-chain (and C-alpha/C-beta)
    atom pairs is returned, excluding the backbone atoms N, O and C.

    :param residue_one: BIO.PDB residue object 1
    :param residue_two: BIO.PDB residue object 2
    :param distance_definition: "Cb" or anything else for minimal atom distance
    :return: float euclidean distance between the residues
    '''
    if distance_definition == "Cb":
        atom_one = residue_one["CB"] if residue_one.has_id("CB") else residue_one["CA"]
        atom_two = residue_two["CB"] if residue_two.has_id("CB") else residue_two["CA"]

        delta = atom_one.coord - atom_two.coord
        return np.sqrt(np.sum(delta * delta))

    backbone = ('N', 'O', 'C')
    distances = []
    for atom_1 in residue_one:
        if atom_1.name in backbone:
            continue
        for atom_2 in residue_two:
            if atom_2.name in backbone:
                continue
            delta = atom_1.coord - atom_2.coord
            distances.append(np.sqrt(np.sum(delta * delta)))

    return np.min(distances)
50 |
def distance_map(pdb_file, L=None, distance_definition="Cb"):
    '''
    Compute the distances between Cbeta (Calpha for Glycine) atoms of all
    residue pairs of the first chain of the first model in the PDB file.

    :param pdb_file: PDB file (first chain of first model will be used)
    :param L: protein length; inferred from the last residue id when None
    :param distance_definition: passed through to calc_residue_dist
    :return: LxL numpy array with distances (NaN for missing residues)
    '''
    structure = read_pdb(pdb_file)
    model = structure[0]
    chain = model.get_list()[0]
    residues = chain.get_list()

    # due to missing residues in the pdb file (or additionally solved??)
    # protein length L can differ from len(residues)
    if L is None:
        L = residues[-1].id[1]

    # np.nan instead of np.NaN: the NaN alias was removed in NumPy 2.0
    distance_map = np.full((L, L), np.nan)

    n = min(L, len(residues))
    for i in range(n):
        for j in range(n):
            residue_one = residues[i]
            residue_two = residues[j]
            # index by the residue ids so gaps in the chain stay NaN
            distance_map[residue_one.id[1] - 1, residue_two.id[1] - 1] = \
                calc_residue_dist(residue_one, residue_two, distance_definition)

    return distance_map
--------------------------------------------------------------------------------
/ccmpred/locmeth/__init__.py:
--------------------------------------------------------------------------------
1 | from ccmpred.locmeth.mi import compute_mi, compute_mi_pseudocounts
2 | from ccmpred.locmeth.omes import compute_omes, compute_omes_freq
--------------------------------------------------------------------------------
/ccmpred/locmeth/mi/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import scipy.stats
3 |
def compute_mi(counts, normalized=False):
    """
    Compute pairwise mutual information from amino acid counts.

    :param counts: tuple of (single counts, pairwise counts)
    :param normalized: According to Martin et al 2005
        (Using information theory to search for co-evolving residues in
        proteins), MI is normalized by the joint entropy
    :return: symmetric L x L mutual information matrix (zero diagonal)
    """
    single_counts, pair_counts = counts

    L = pair_counts.shape[0]
    triu = np.triu_indices(L, k=1)  # pairs i < j, diagonal excluded

    # per-column Shannon entropy (scipy normalizes the counts internally)
    col_entropy = scipy.stats.entropy(single_counts.transpose(), base=2)

    # joint Shannon entropy on the flattened pair-count distributions
    n_states_sq = pair_counts.shape[2] * pair_counts.shape[3]
    flat_pairs = pair_counts.reshape(L, L, n_states_sq)
    joint_entropy = np.zeros((L, L))
    joint_entropy[triu] = scipy.stats.entropy(flat_pairs[triu].transpose(), base=2)

    # MI(i,j) = H(i) + H(j) - H(i,j)
    mi = np.zeros((L, L))
    mi[triu] = [col_entropy[i] + col_entropy[j] - joint_entropy[i, j]
                for i, j in zip(*triu)]

    # According to Martin et al 2005
    if normalized:
        mi[triu] /= joint_entropy[triu]

    # symmetrize
    mi += mi.transpose()

    return mi
41 |
def compute_mi_pseudocounts(freqs):
    """
    Compute pairwise mutual information from (pseudocount-regularized)
    frequencies over the 20 amino acid states.

    :param freqs: tuple of (single frequencies, pairwise frequencies)
    :return: symmetric L x L mutual information matrix (zero diagonal)
    """
    single_freqs, pair_freqs = freqs

    L = pair_freqs.shape[0]
    triu = np.triu_indices(L, k=1)  # pairs i < j, diagonal excluded

    p_ij = pair_freqs[triu][:, :20, :20]
    p_i = single_freqs[triu[0]][:, :20, np.newaxis]
    p_j = single_freqs[triu[1]][:, np.newaxis, :20]

    # MI(i,j) = sum_ab p_ij(a,b) * log2( p_ij(a,b) / (p_i(a) * p_j(b)) )
    contributions = p_ij * np.log2(p_ij / (p_i * p_j))

    mi = np.zeros((L, L))
    mi[triu] = contributions.sum(axis=(1, 2))

    # symmetrize
    mi += mi.transpose()

    return mi
66 |
--------------------------------------------------------------------------------
/ccmpred/locmeth/omes/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def compute_omes(counts, fodoraldrich=False):
    """
    Compute the OMES (chi-squared) covariation score from counts.

    Chi squared statistic:
        X^2 = sum_{i=1}^N [(O_i - E_i)^2 / E_i ]
    with O_i the observed pairwise amino acid counts and E_i = Np_i the
    expected counts under the independence null hypothesis.

    According to Kass & Horovitz, 2002 (default):
        omes(i,j) = sum_ab [count_ij(a,b) - count_i(a)*count_j(b)/N_ij]^2
                           / (count_i(a)*count_j(b)/N_ij)

    According to Fodor & Aldrich, 2004 (fodoraldrich=True):
        omes(i,j) = sum_ab [count_ij(a,b) - count_i(a)*count_j(b)/N_ij]^2
                           / N_ij

    :param counts: tuple of (single counts, pairwise counts)
    :param fodoraldrich: use the Fodor & Aldrich denominator
    :return: L x L score matrix
    """
    single_counts, pair_counts = counts
    Nij = pair_counts.sum(3).sum(2)  # == Neff

    L = single_counts.shape[0]

    # expected pair counts under independence
    Nexp = np.outer(single_counts[:, :20], single_counts[:, :20]).reshape((L, L, 20, 20))

    # bug fix: out-of-place division - the previous in-place `Nexp /= ...`
    # raised a casting TypeError whenever the counts were integer arrays
    Nexp = Nexp / Nij[:, :, np.newaxis, np.newaxis]
    diff = pair_counts[:, :, :20, :20] - Nexp

    if fodoraldrich:
        omes = (diff * diff) / Nij[:, :, np.newaxis, np.newaxis]  # Fodor & Aldrich: we divide by Nij(neff)
    else:
        omes = (diff * diff) / Nexp  # Kass & Horovitz: we divide by Nexp

    return omes.sum(3).sum(2)
65 |
66 |
67 |
def compute_omes_freq(counts, freqs, fodoraldrich=False, ignore_zero_counts=True):
    """
    Compute the OMES (chi-squared) covariation score, with expected counts
    derived from (pseudocount-regularized) single-site frequencies.

    :param counts: tuple of (single counts, pairwise counts)
    :param freqs: tuple of (single frequencies, pairwise frequencies)
    :param fodoraldrich: divide by N_ij (Fodor & Aldrich 2004) instead of
        the expected counts (Kass & Horovitz 2002)
    :param ignore_zero_counts: only accumulate terms where the observed
        pair count is non-zero
    :return: L x L score matrix
    """
    single_freqs, pair_freqs = freqs
    single_counts, pair_counts = counts

    Nij = pair_counts.sum(3).sum(2)  # == Neff
    L = single_freqs.shape[0]
    nij_bc = Nij[:, :, np.newaxis, np.newaxis]

    # expected pair counts under independence: p_i(a) * p_j(b) * N_ij
    Nexp = (single_freqs[:, np.newaxis, :20, np.newaxis]
            * single_freqs[np.newaxis, :, np.newaxis, :20]) * nij_bc
    diff = pair_counts[:, :, :20, :20] - Nexp

    if fodoraldrich:
        omes_full = (diff * diff) / nij_bc  # Fodor & Aldrich: we divide by Nij(neff)
    else:
        omes_full = (diff * diff) / Nexp  # Kass & Horovitz: we divide by Nexp

    # compute statistics only for non-zero pair counts
    if ignore_zero_counts:
        omes = np.zeros((L, L, 20, 20))
        nonzero = np.nonzero(pair_counts[:, :, :20, :20])
        omes[nonzero] = omes_full[nonzero]
    else:
        omes = omes_full

    return omes.sum(3).sum(2)
110 |
--------------------------------------------------------------------------------
/ccmpred/logo.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import ccmpred
3 | import sys
4 |
5 | is_tty = (sys.stdin.isatty()) and (sys.stdout.isatty())
6 |
# banner templates keyed by (tool name, is_tty); {0} is filled with the
# version string. The True variants carry ANSI color escapes for terminals.
LOGOS = {}
LOGOS['ccmpred', True] = """
 \x1b[32m┏━╸┏━╸┏┳┓\x1b[34m┏━┓┏━┓┏━╸╺┳┓\x1b[32m┏━┓╻ ╻\x1b[0m version {0}
 \x1b[32m┃ ┃ ┃┃┃\x1b[34m┣━┛┣┳┛┣╸ ┃┃\x1b[32m┣━┛┗┳┛\x1b[0m Vorberg, Seemayer and Soeding (2018)
 \x1b[32m┗━╸┗━╸╹ ╹\x1b[34m╹ ╹┗╸┗━╸╺┻┛\x1b[32m╹ ╹ \x1b[0m https://github.com/soedinglab/ccmgen
"""

LOGOS['ccmpred', False] = """
 ┏━╸┏━╸┏┳┓┏━┓┏━┓┏━╸╺┳┓ ┏━┓╻ ╻ version {0}
 ┃ ┃ ┃┃┃┣━┛┣┳┛┣╸ ┃┃ ┣━┛┗┳┛ Vorberg, Seemayer and Soeding (2018)
 ┗━╸┗━╸╹ ╹╹ ╹┗╸┗━╸╺┻┛ ╹ ╹ https://github.com/soedinglab/ccmgen
"""


LOGOS['ccmgen', True] = """
 \x1b[32m┏━╸┏━╸┏┳┓\x1b[34m┏━╸┏━╸┏┓╻\x1b[0m version {0}
 \x1b[32m┃ ┃ ┃┃┃\x1b[34m┃╺┓┣╸ ┃┗┫\x1b[0m Vorberg, Seemayer and Soeding (2018)
 \x1b[32m┗━╸┗━╸╹ ╹\x1b[34m┗━┛┗━╸╹ ╹\x1b[0m https://github.com/soedinglab/ccmgen
"""

LOGOS['ccmgen', False] = """
 ┏━╸┏━╸┏┳┓┏━╸┏━╸┏┓╻ version {0}
 ┃ ┃ ┃┃┃┃╺┓┣╸ ┃┗┫ Vorberg, Seemayer and Soeding (2018)
 ┗━╸┗━╸╹ ╹┗━┛┗━╸╹ ╹ https://github.com/soedinglab/ccmgen
"""
32 |
33 |
34 |
def logo(what_for="ccmpred", color=is_tty):
    """Print the banner for the given tool, colored when attached to a TTY."""
    banner = LOGOS[what_for, color]
    print(banner.format(ccmpred.__version__))
39 |
--------------------------------------------------------------------------------
/ccmpred/monitor/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/monitor/__init__.py
--------------------------------------------------------------------------------
/ccmpred/monitor/progress.py:
--------------------------------------------------------------------------------
1 | import ccmpred.logo
2 | import plotly.graph_objs as go
3 | import os
4 | import sys
5 | from plotly.offline import plot as plotly_plot
6 |
7 |
class Progress():
    """
    Track per-iteration optimization metrics, print them as a table and
    optionally render them as an interactive plotly line chart.
    """

    def __init__(self):
        # metric name -> list of values, one entry per logged iteration
        self.optimization_log = {}
        self.plotfile = None
        self.title = ""

    def print_header(self):
        """Print the column header row (dimmed when attached to a TTY)."""
        columns = sorted(self.optimization_log.keys())
        headerline = "{0:>{1}s}".format('iter', 8)
        headerline += " ".join("{0:>{1}s}".format(name, 14) for name in columns)

        if ccmpred.logo.is_tty:
            print("\x1b[2;37m{0}\x1b[0m".format(headerline))
        else:
            print(headerline)

    def set_plot_title(self, title):
        self.title = title

    def set_plot_file(self, file):
        self.plotfile = file

    def init_log(self, **kwargs):
        """Create an empty value list per metric name and print the header."""
        for name in kwargs.keys():
            self.optimization_log[name] = []

        self.print_header()

    def log_progress(self, n_iter, **kwargs):
        """Record one iteration's metrics, print them and refresh the plot."""
        if len(self.optimization_log) == 0:
            self.init_log(**kwargs)

        # repeat the header every 100 iterations for readability
        if (n_iter != 0) and (n_iter % 100 == 0):
            self.print_header()

        row = "{0:>{1}}".format(n_iter, '8g')
        for name in sorted(kwargs):
            value = kwargs[name]
            self.optimization_log[name].append(value)
            row += "{0:>{1}}".format(value, '15g')
        print(row)

        if self.plotfile is not None:
            self.plot_progress()

        sys.stdout.flush()

    def plot_progress(self):
        """Write the metric traces to `self.plotfile` as an offline plotly HTML."""
        if self.plotfile is None:
            return

        protein = os.path.basename(self.plotfile).split(".")[0]
        title = "Optimization Log for {0} ".format(protein) + self.title

        traces = [
            go.Scatter(
                x=list(range(1, len(values) + 1)),
                y=values,
                mode='lines',
                visible="legendonly",
                name=name
            )
            for name, values in self.optimization_log.items()
        ]

        layout = go.Layout(
            title=title,
            xaxis1=dict(
                title="iteration",
                exponentformat="e",
                showexponent='all'
            ),
            yaxis1=dict(
                title="metric",
                exponentformat="e",
                showexponent='all'
            ),
            font=dict(size=18),
            titlefont=dict(size=14)
        )

        plotly_plot({"data": traces, "layout": layout}, filename=self.plotfile, auto_open=False)
105 |
--------------------------------------------------------------------------------
/ccmpred/objfun/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/objfun/__init__.py
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import ccmpred.raw
3 | import ccmpred.gaps
4 | import ccmpred.counts
5 | import ccmpred.objfun
6 | import ccmpred.objfun.cd.cext
7 | import ccmpred.parameter_handling
8 | from ccmpred.pseudocounts import PseudoCounts
9 | import ccmpred.sampling
10 |
class ContrastiveDivergence():
    """
    Objective function for (persistent) contrastive divergence training of a
    Markov random field on a multiple sequence alignment.

    Gradients are estimated by comparing amino-acid counts of the input
    alignment against counts computed from sequences that were Gibbs-sampled
    under the current model parameters.
    """

    def __init__(self, msa, weights, regularization, pseudocounts, x_single, x_pair,
                 gibbs_steps=1, nr_seq_sample=500, persistent=False):


        self.msa = msa
        self.nrow, self.ncol = self.msa.shape
        self.weights = weights
        self.neff = np.sum(weights)
        self.regularization = regularization

        self.pseudocount_type = pseudocounts.pseudocount_type
        self.pseudocount_n_single = pseudocounts.pseudocount_n_single
        self.pseudocount_n_pair = pseudocounts.pseudocount_n_pair

        # converters between the structured (x_single, x_pair) representation
        # and the flat parameter vector used by the optimizer
        self.structured_to_linear = lambda x_single, x_pair: \
            ccmpred.parameter_handling.structured_to_linear(
                x_single, x_pair, nogapstate=True, padding=False)
        self.linear_to_structured = lambda x: \
            ccmpred.parameter_handling.linear_to_structured(
                x, self.ncol, nogapstate=True, add_gap_state=False, padding=False)


        self.x_single = x_single
        self.x_pair = x_pair
        self.x = self.structured_to_linear(self.x_single, self.x_pair)

        # parameter counts: 20 single states per column, 21x21 pair states
        self.nsingle = self.ncol * 20
        self.npair = self.ncol * self.ncol * 21 * 21
        self.nvar = self.nsingle + self.npair

        # get constant alignment counts - INCLUDING PSEUDO COUNTS
        # important for small alignments
        self.freqs_single, self.freqs_pair = pseudocounts.freqs
        self.msa_counts_single = self.freqs_single * self.neff
        self.msa_counts_pair = self.freqs_pair * self.neff

        # reset gap counts
        self.msa_counts_single[:, 20] = 0
        self.msa_counts_pair[:, :, :, 20] = 0
        self.msa_counts_pair[:, :, 20, :] = 0

        # non_gapped counts
        self.Ni = self.msa_counts_single.sum(1)
        self.Nij = self.msa_counts_pair.sum(3).sum(2)

        ### Setting for (Persistent) Contrastive Divergence

        #perform this many steps of Gibbs sampling per sequence
        # 1 Gibbs step == sample every sequence position once
        self.gibbs_steps = np.max([gibbs_steps, 1])

        #define how many markov chains are run in parallel
        # => how many sequences are sampled at each iteration
        # at least 500 sequences or 10% of sequences in MSA
        self.nr_seq_sample = np.max([int(self.nrow/10), nr_seq_sample])

        #prepare the persistent MSA (Markov chains are NOT reset after each iteration)
        self.persistent=persistent
        #ensure that msa has at least NR_SEQ_SAMPLE sequences
        seq_id = list(range(self.nrow)) * int(np.ceil(self.nr_seq_sample / float(self.nrow)))
        self.msa_persistent = self.msa[seq_id]
        self.weights_persistent = self.weights[seq_id]

    def __repr__(self):
        """Human-readable summary of the CD settings."""
        repr_str = ""

        if self.persistent:
            repr_str += "persistent "

        repr_str += "contrastive divergence: "

        repr_str += "\nnr of sampled sequences={0} ({1}xN and {2}xNeff and {3}xL) Gibbs steps={4} ".format(
            self.nr_seq_sample,
            np.round(self.nr_seq_sample / float(self.nrow), decimals=3),
            np.round(self.nr_seq_sample / self.neff, decimals=3),
            np.round(self.nr_seq_sample / float(self.ncol), decimals=3),
            self.gibbs_steps
        )

        return repr_str

    def init_sample_alignment(self, persistent=False):
        """
        in case of CD:
            Randomly choose NR_SEQ_SAMPLE sequences from the ORIGINAL alignment
        in case of persistent CD:
            Randomly choose NR_SEQ_SAMPLE sequences from the alignment containing previously sampled sequences
        use the sequence weights computed from the original alignment
        (recomputing sequence weights in each iteration is too expensive)

        :return: (msa, weights) of the selected sequences
        """

        if persistent:
            # in case of PERSISTENT CD, continue the Markov chain:
            #randomly select NR_SEQ_SAMPLE sequences from persistent MSA
            self.sample_seq_id = np.random.choice(self.msa_persistent.shape[0], self.nr_seq_sample, replace=False)
            msa = self.msa_persistent[self.sample_seq_id]
            weights = self.weights_persistent[self.sample_seq_id]
        else:
            # in case of plain CD, reinitialize the Markov chains from original sequences:
            # randomly select NR_SEQ_SAMPLE sequences from original MSA
            self.sample_seq_id = np.random.choice(self.nrow, self.nr_seq_sample, replace=True)
            msa = self.msa[self.sample_seq_id]
            weights = self.weights[self.sample_seq_id]

        return msa, weights

    def finalize(self, x):
        """Convert the flat parameter vector back to (x_single, x_pair), re-adding the gap state."""
        return ccmpred.parameter_handling.linear_to_structured(
            x, self.ncol, clip=False, nogapstate=True, add_gap_state=True, padding=False
        )

    def evaluate(self, x, persistent=False):
        """
        Estimate the CD gradient for parameters x.

        Returns (-1, gradient, regularization gradient); the objective value
        itself is not computed (hence the -1 placeholder).
        """

        #setup sequences for sampling
        self.msa_sampled, self.msa_sampled_weights = self.init_sample_alignment(persistent)

        #Gibbs Sampling of sequences (each position of each sequence will be sampled this often: GIBBS_STEPS)
        self.msa_sampled = ccmpred.sampling.gibbs_sample_sequences(x, self.msa_sampled, self.gibbs_steps)

        if persistent:
            self.msa_persistent[self.sample_seq_id] = self.msa_sampled

        # compute amino acid frequencies from sampled alignment
        # add pseudocounts for stability
        pseudocounts = PseudoCounts(self.msa_sampled, self.msa_sampled_weights)
        pseudocounts.calculate_frequencies(
            self.pseudocount_type,
            self.pseudocount_n_single,
            self.pseudocount_n_pair,
            remove_gaps=False)

        #compute frequencies excluding gap counts
        sampled_freq_single = pseudocounts.degap(pseudocounts.freqs[0], True)
        sampled_freq_pair = pseudocounts.degap(pseudocounts.freqs[1], True)


        #compute counts and scale them accordingly to size of original MSA
        sample_counts_single = sampled_freq_single * self.Ni[:, np.newaxis]
        sample_counts_pair = sampled_freq_pair * self.Nij[:, :, np.newaxis, np.newaxis]

        #actually compute the gradients
        g_single = sample_counts_single - self.msa_counts_single
        g_pair = sample_counts_pair - self.msa_counts_pair

        #sanity check
        if(np.abs(np.sum(sample_counts_single[1,:20]) - np.sum(self.msa_counts_single[1,:20])) > 1e-5):
            print("Warning: sample aa counts ({0}) do not equal input msa aa counts ({1})!".format(
                np.sum(sample_counts_single[1,:20]), np.sum(self.msa_counts_single[1,:20]))
            )

        # set gradients for gap states to 0
        g_single[:, 20] = 0
        g_pair[:, :, :, 20] = 0
        g_pair[:, :, 20, :] = 0

        # set diagonal elements to 0
        for i in range(self.ncol):
            g_pair[i, i, :, :] = 0

        #compute regularization
        x_single, x_pair = self.linear_to_structured(x) #x_single has dim L x 20
        _, g_single_reg, g_pair_reg = self.regularization(x_single, x_pair) #g_single_reg has dim L x 20

        #gradient for x_single only L x 20
        g = self.structured_to_linear(g_single[:, :20], g_pair)
        g_reg = self.structured_to_linear(g_single_reg[:, :20], g_pair_reg)

        return -1, g, g_reg

    def get_parameters(self):
        """Return the CD hyperparameters as a plain dict (for metadata output)."""
        parameters = {}
        parameters['gibbs_steps'] = int(self.gibbs_steps)
        parameters['nr_seq_sample'] = int(self.nr_seq_sample)


        return parameters
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/cext/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.ctypeslib as npct
3 | import ctypes
4 | import os.path
5 |
# numpy array views accepted by the C extension: contiguous float64 parameter
# vectors and contiguous uint8 alignment matrices
array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS')
array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')

# compiled contrastive-divergence kernels, built into cext/_build (see cd.c)
libcd = npct.load_library('libcd', os.path.join(os.path.dirname(__file__), '_build'))

libcd.sample_position_in_sequences.restype = None
libcd.sample_position_in_sequences.argtypes = [
    array_2d_char, # *msa
    array_1d_float, # *x
    ctypes.c_uint64, # nrow
    ctypes.c_uint32, # ncol
]

libcd.gibbs_sample_sequences.restype = None
libcd.gibbs_sample_sequences.argtypes = [
    array_2d_char, # *msa
    array_1d_float, # *x
    ctypes.c_uint32, # steps
    ctypes.c_uint64, # nrow
    ctypes.c_uint32, # ncol
]

libcd.gibbs_sample_sequences_nogaps.restype = None
libcd.gibbs_sample_sequences_nogaps.argtypes = [
    array_2d_char, # *msa
    array_1d_float, # *x
    ctypes.c_uint32, # steps
    ctypes.c_uint64, # nrow
    ctypes.c_uint32, # ncol
]
36 |
def sample_position_in_sequences(msa, x):
    """Resample one randomly chosen non-gap position per sequence, in place.

    Thin wrapper around the C routine; `msa` is modified and also returned.
    """
    nrow, ncol = msa.shape
    libcd.sample_position_in_sequences(msa, x, nrow, ncol)
    return msa
40 |
def gibbs_sample_sequences(msa, x, steps):
    """Run `steps` Gibbs sweeps over all sequences, skipping gap positions.

    Thin wrapper around the C routine; `msa` is modified and also returned.
    """
    nrow, ncol = msa.shape
    libcd.gibbs_sample_sequences(msa, x, steps, nrow, ncol)
    return msa
44 |
def gibbs_sample_sequences_nogaps(msa, x, steps):
    """Run `steps` Gibbs sweeps over all sequences, resampling gap positions too.

    Thin wrapper around the C routine; `msa` is modified and also returned.
    """
    # BUG FIX: this wrapper previously called libcd.gibbs_sample_sequences,
    # so the *_nogaps C routine (whose argtypes are configured above) was
    # never actually invoked and gap positions were silently skipped.
    libcd.gibbs_sample_sequences_nogaps(msa, x, steps, *msa.shape)
    return msa
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/cext/cd.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 |
10 | #include "cd.h"
11 | #include "cdutil.h"
12 |
13 |
/**
 * Compute conditional probabilities
 * $P(X_i = a | X^n_0, ... X^n_L \setminus X^n_i, v, w)$
 *
 * The gap state is assigned probability 0 and excluded from the
 * normalization, so only the 20 amino acid states are ever sampled.
 *
 * @param[in] i Index of the column to compute probabilities for
 * @param[out] cond_probs Returns a 20-field array of conditional probabilities
 * @param[in] x The current potentials
 * @param[in] last_seq The current sequence to condition on
 * @param[in] ncol The number of columns in the MSA
 */
void compute_conditional_probs(
    const int i,
    flt *const cond_probs,
    const flt *const x,
    const unsigned char *const last_seq,
    const int ncol
) {
    int a, j;
    // offset of the pair potentials in x; consumed by the E2 macro
    int nsingle = ncol * (N_ALPHA - 1);

    // start from the single potentials v_i(a)
    for (a = 0; a < N_ALPHA - 1; a++) {
        cond_probs[a] = E1(i,a);
    }

    // add the pair potentials w_ij(a, x_j) over all other columns j
    for (a = 0; a < N_ALPHA - 1; a++) {
        for (j = 0; j < ncol; j++) {
            cond_probs[a] += E2(i, a, j, last_seq[j]);
        }

        // don't add up the case i = j
        cond_probs[a] -= E2(i, a, i, last_seq[i]);
    }

    cond_probs[GAP] = F0;

    // softmax over the 20 amino acid states (gap excluded)
    flt denom = F0;
    for (a = 0; a < N_ALPHA - 1; a++) {
        cond_probs[a] = fexp(cond_probs[a]);
        denom += cond_probs[a];
    }

    for (a = 0; a < N_ALPHA - 1; a++) {
        cond_probs[a] /= denom;
    }
}
59 |
/**
 * Resample one randomly chosen non-gap position in every sequence of the MSA.
 *
 * For each row, a column index is drawn uniformly until a non-gap position
 * is hit; that position is then redrawn from its conditional distribution.
 *
 * NOTE(review): if a row consists entirely of gaps, the do/while below never
 * terminates -- confirm callers guarantee at least one non-gap per sequence.
 *
 * @param[inout] seq The MSA to work on
 * @param[in] x The current potentials
 * @param[in] n_samples The number of samples to generate (also the number of rows in the MSA)
 * @param[in] ncol The number of columns in the MSA
 */
void sample_position_in_sequences(
    unsigned char *seq,
    const flt *const x,
    const unsigned long n_samples,
    const int ncol
) {
    seed_rng();

    #pragma omp parallel
    {
        int i;
        unsigned long k;
        // per-thread scratch buffer for the conditional distribution
        flt *pcondcurr = fl_malloc(N_ALPHA);

        #pragma omp for
        for (k = 0; k < n_samples; k++) {

            // rejection-sample a column index until a non-gap position is found
            do {
                i = pick_random_uniform(ncol - 1);
            } while(seq[k * ncol + i] == GAP);

            compute_conditional_probs(i, pcondcurr, x, &seq[k * ncol], ncol);
            seq[k * ncol + i] = pick_random_weighted(pcondcurr, N_ALPHA - 1);

        }
        fl_free(pcondcurr);
    }
}
96 |
97 |
/**
 * Run `steps` full Gibbs sweeps over every sequence of the MSA, in place.
 *
 * Sequences are distributed across OpenMP threads; within each sequence,
 * every non-gap position is resampled once per sweep in a freshly shuffled
 * order. Gap positions are left untouched.
 *
 * @param[inout] seq The MSA to resample (n_samples rows x ncol columns)
 * @param[in] x The current potentials
 * @param[in] steps Number of Gibbs sweeps per sequence
 * @param[in] n_samples Number of sequences (rows) in the MSA
 * @param[in] ncol Number of columns in the MSA
 */
void gibbs_sample_sequences(
    unsigned char *seq,
    const flt *const x,
    const int steps,
    const unsigned long n_samples,
    const int ncol
){

    seed_rng();
    // disable dynamic adjustment of the thread team size
    omp_set_dynamic(0);

    #pragma omp parallel
    {
        int i;
        unsigned long k;
        // per-thread scratch buffer for the conditional distribution
        flt *pcondcurr = fl_malloc(N_ALPHA);

        // per-thread permutation buffer holding column indices 0..ncol-1
        unsigned int sequence_position_vector[ncol];
        for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p;

        #pragma omp for private(k)
        for (k = 0; k < n_samples; k++) {

            for (int s=0; s < steps; s++){
                // visit the columns in a new random order on every sweep
                shuffle(sequence_position_vector, ncol);

                for (i=0; i < ncol; i++){
                    // only resample positions that are not gaps
                    if (seq[k * ncol + sequence_position_vector[i]] != GAP){
                        compute_conditional_probs(sequence_position_vector[i], pcondcurr, x, &seq[k * ncol], ncol);
                        seq[k * ncol + sequence_position_vector[i]] = pick_random_weighted(pcondcurr, N_ALPHA - 1);
                    }

                }
            }
        }
        fl_free(pcondcurr);
    }

}
143 |
144 |
/**
 * Run `steps` full Gibbs sweeps over every sequence, resampling ALL positions
 * (including gaps) in place.
 *
 * NOTE(review): unlike gibbs_sample_sequences, the `omp for` here parallelizes
 * over `steps` rather than over sequences, so multiple threads can resample
 * the same rows of `seq` concurrently (a data race), and the sweeps are not
 * guaranteed to run sequentially. Confirm whether this should instead mirror
 * the gapped variant and distribute the k-loop across threads.
 *
 * @param[inout] seq The MSA to resample (n_samples rows x ncol columns)
 * @param[in] x The current potentials
 * @param[in] steps Number of Gibbs sweeps per sequence
 * @param[in] n_samples Number of sequences (rows) in the MSA
 * @param[in] ncol Number of columns in the MSA
 */
void gibbs_sample_sequences_nogaps(
    unsigned char *seq,
    const flt *const x,
    const int steps,
    const unsigned long n_samples,
    const int ncol
){

    seed_rng();

    #pragma omp parallel
    {
        int i;
        unsigned long k;
        // per-thread scratch buffer for the conditional distribution
        flt *pcondcurr = fl_malloc(N_ALPHA);

        // per-thread permutation buffer holding column indices 0..ncol-1
        unsigned int sequence_position_vector[ncol];
        for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p;


        #pragma omp for
        for (int s=0; s < steps; s++){
            for (k = 0; k < n_samples; k++) {
                // visit the columns in a new random order on every sweep
                shuffle(sequence_position_vector, ncol);

                for (i=0; i < ncol; i++){
                    compute_conditional_probs(sequence_position_vector[i], pcondcurr, x, &seq[k * ncol], ncol);
                    seq[k * ncol + sequence_position_vector[i]] = pick_random_weighted(pcondcurr, N_ALPHA - 1);
                }
            }
        }
        fl_free(pcondcurr);
    }

}
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/cext/cd.h:
--------------------------------------------------------------------------------
#ifndef CD_H
#define CD_H

/* alphabet: 20 amino acids plus one gap state */
#define N_ALPHA 21
/* index of the gap state within the alphabet */
#define GAP 20

/* floating point type used throughout the CD extension */
typedef double flt;
#define F0 0.0
#define F1 1.0
#define F2 2.0
#define fexp exp
#define flog log


/* flat index of single potential v_i(a); only N_ALPHA-1 states are stored
   per column (no explicit gap potential) */
#define X1_INDEX(i,a) (i) * (N_ALPHA - 1) + (a)
/* flat index of pair potential w_ij(a,b); requires a variable `ncol` in
   the expanding scope */
#define X2_INDEX(i,a,j,b) (((i) * N_ALPHA + (a)) * ncol + (j)) * N_ALPHA + (b)

/* accessors into gradient (g), parameter (x) and auxiliary (h) vectors;
   the pair-potential entries start at offset `nsingle`, which must also
   be in scope at the point of expansion */
#define G1(i,a) g[X1_INDEX(i,a)]
#define G2(i,a,j,b) g[nsingle + X2_INDEX(i,a,j,b)]
#define E1(i,a) x[X1_INDEX(i,a)]
#define E2(i,a,j,b) x[nsingle + X2_INDEX(i,a,j,b)]
#define H1(i,a) h[X1_INDEX(i,a)]
#define H2(i,a,j,b) h[nsingle + X2_INDEX(i,a,j,b)]


/* residue of sequence n at column i; requires `msa` and `ncol` in scope */
#define MSA(n,i) msa[MSA_INDEX(n,i)]

#define MSA_INDEX(n,i) (n) * ncol + (i)

/* implemented in cd.c */
void compute_conditional_probs(
    const int i,
    flt *const cond_probs,
    const flt *const x,
    const unsigned char *const last_seq,
    const int ncol
);

#endif
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/cext/cdutil.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 |
6 | #include "cd.h"
7 | #include "cdutil.h"
8 |
9 |
/* Seed the C library PRNG from the current time and process id.
 *
 * The previous implementation seeded with t.tv_usec * t.tv_sec * pid: the
 * signed multiplication can overflow (undefined behavior) and the seed
 * collapses to 0 whenever tv_usec happens to be 0. Combine the entropy
 * sources with unsigned shifts/xors instead. */
void seed_rng() {
    struct timeval t;
    gettimeofday(&t, NULL);

    unsigned int seed = (unsigned int)t.tv_sec;
    seed ^= (unsigned int)t.tv_usec << 11;
    /* include the pid so concurrently started processes diverge */
    seed ^= (unsigned int)getpid() << 22;
    srand(seed);
}
17 |
18 |
/* Arrange the N elements of ARRAY in random order (Fisher-Yates).
   Only effective if N is much smaller than RAND_MAX;
   if this may not be the case, use a better random
   number generator. */
void shuffle(unsigned int *array, size_t n)
{
    if (n > 1)
    {
        size_t i;
        for (i = 0; i < n - 1; i++)
        {
            /* pick j uniformly from [i, n-1] */
            size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
            /* FIX: swap temporary matches the element type; the previous
               `int t` silently converted unsigned values through int */
            unsigned int t = array[j];
            array[j] = array[i];
            array[i] = t;
        }
    }
}
37 |
38 |
/* Draw a uniformly distributed integer in [0, max] (both ends inclusive),
 * using rejection sampling to avoid modulo bias. */
int pick_random_uniform(int max) {
    int div = RAND_MAX / (max + 1);
    int retval;

    // reject draws that fall into the oversized top bucket
    do {
        retval = rand() / div;
    } while (retval > max);

    return retval;
}
49 |
// Draw an index in [0, n) according to the weights in probs.
// Assumes the first n entries of probs sum to (at most) 1; any rounding
// leftover collapses onto the last index.
//
// A B C
// 0.1 0.2 0.7
// |----|--------|------------------------------------------|
// | 0.1 0.3 1
//
//p<0.1 --> A
//0.1 < p < 0.3 --> p - 0.1 < 0.2 --> B
//p>=0.3 --> p - 0.1 - 0.2 < 0.7 --> C
int pick_random_weighted(flt *probs, int n) {
    int a;
    double p = (double)rand() / (double)RAND_MAX;
    for (a = 0; a < n; a++) {
        flt p_curr = probs[a];
        if (p < p_curr) {
            return a;
        }
        p -= p_curr;
    }
    // fall through due to floating point rounding: return the last index
    return n - 1;
}
70 |
/* Allocate an uninitialized array of n flt values. */
flt* fl_malloc(int n) {
    return (flt *)malloc(sizeof(flt) * n);
}

/* Release an array obtained from fl_malloc. */
void fl_free(flt *dest) {
    free(dest);
}



/* Copy n flt values from src to dest (buffers must not overlap). */
void fl_memcpy(flt *dest, flt *src, int n) {
    memcpy(dest, src, sizeof(flt) * n);
}

/* Scale the first n entries of dst in place by the factor f. */
void vecimulc(flt *dst, flt f, int n) {
    int i;
    for(i = 0; i < n; i++) {
        dst[i] *= f;
    }
}
91 |
--------------------------------------------------------------------------------
/ccmpred/objfun/cd/cext/cdutil.h:
--------------------------------------------------------------------------------
#ifndef CDUTIL_H
#define CDUTIL_H

/* Seed the C library PRNG from time and process id. */
void seed_rng(void);

/* Fisher-Yates shuffle of the first n elements of array. */
void shuffle(unsigned int *array, size_t n);
/* Uniform integer in [0, max], inclusive. */
int pick_random_uniform(int max);
/* Weighted draw of an index in [0, n) according to probs. */
int pick_random_weighted(flt *probs, int n);

/* malloc/free/memcpy helpers for arrays of flt */
flt* fl_malloc(int n);
void fl_free(flt *dest);
void fl_memcpy(flt *dest, flt *src, int n);

/* In-place scalar multiply of the first n entries of dst. */
void vecimulc(flt *dst, flt f, int n);

#endif
17 |
--------------------------------------------------------------------------------
/ccmpred/objfun/pll/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import ccmpred.raw
4 | import ccmpred.regularization
5 | import ccmpred.objfun
6 | import ccmpred.objfun.pll.cext
7 | import ccmpred.counts
8 | import ccmpred.parameter_handling
9 |
class PseudoLikelihood():
    """Negative pseudo-log-likelihood objective for an MRF over an alignment.

    Wraps the C extension (ccmpred.objfun.pll.cext) that evaluates the
    function value and gradient, adds the regularizer, and manages the
    conversion between structured (x_single, x_pair) parameters and the
    flat, padded vector layout expected by the C code.
    """

    def __init__(self, msa, weights, regularization, pseudocounts, x_single, x_pair):
        # msa: (nrow, ncol) integer-encoded alignment
        #   (index 20 is treated as the gap state below -- TODO confirm encoding)

        self.msa = msa
        self.nrow, self.ncol = msa.shape
        self.weights = weights
        # effective number of sequences = sum of per-sequence weights
        self.neff = np.sum(weights)
        self.regularization = regularization

        # converters between structured parameters and the flat padded
        # vector layout used by the C extension
        self.structured_to_linear = lambda x_single, x_pair: \
            ccmpred.parameter_handling.structured_to_linear(
                x_single, x_pair, nogapstate=False, padding=True)
        self.linear_to_structured = lambda x: \
            ccmpred.parameter_handling.linear_to_structured(
                x, self.ncol, nogapstate=False, add_gap_state=False, padding=True)

        self.x_single = x_single
        self.x_pair = x_pair
        self.x = self.structured_to_linear(self.x_single, self.x_pair)

        #use msa counts with pseudo counts - numerically more stable?? but gradient does not fit ll fct!!
        #self.freqs_single, self.freqs_pair = ccm.pseudocounts.freqs
        #msa_counts_single, msa_counts_pair = neff * freqs_single, neff * freqs_pair
        #use msa counts without pseudo counts
        msa_counts_single, msa_counts_pair = pseudocounts.counts

        # gap counts do not contribute to the likelihood gradient
        msa_counts_single[:, 20] = 0
        msa_counts_pair[:, :, 20, :] = 0
        msa_counts_pair[:, :, :, 20] = 0

        # a position is never paired with itself
        for i in range(self.ncol):
            msa_counts_pair[i, i, :, :] = 0

        #non_gapped counts
        # self.Ni = msa_counts_single.sum(1)
        # self.Nij = msa_counts_pair.sum(3).sum(2)

        #no pseudo counts in gradient calculation
        #pairwise gradient is two-fold
        # NOTE(review): this call relies on structured_to_linear's default
        # keyword arguments, unlike self.structured_to_linear above --
        # confirm the defaults match nogapstate=False, padding=True
        self.g_init = ccmpred.parameter_handling.structured_to_linear(
            msa_counts_single, 2 * msa_counts_pair)

        # flat vector layout used by the C code: 21 single-potential entries
        # per column padded up to a multiple of 32, followed by ncol*ncol
        # pair blocks of 21*32 entries each
        self.nsingle = self.ncol * 21
        self.nsingle_padded = self.nsingle + 32 - (self.nsingle % 32)
        self.nvar = self.nsingle_padded + self.ncol * self.ncol * 21 * 32

        # memory allocation for intermediate variables
        #gradient for single and pair potentials
        self.g = np.empty((self.nsingle_padded + self.ncol * self.ncol * 21 * 32,), dtype=np.dtype('float64'))
        #gradient for only pair potentials
        self.g2 = np.empty((self.ncol * self.ncol * 21 * 32,), dtype=np.dtype('float64'))


    def finalize(self, x):
        """Convert the flat padded parameter vector back to (x_single, x_pair)."""
        return ccmpred.parameter_handling.linear_to_structured(
            x, self.ncol, clip=True, nogapstate=False, add_gap_state=False, padding=True)

    def evaluate(self, x):
        """Evaluate the objective at the flat parameter vector x.

        Returns (fx, g, g_reg): function value including the regularizer,
        gradient of the likelihood term, and gradient of the regularizer.
        """

        # the pair potentials w_ijab are symmetric and counted for both (i,j)
        # orderings in fx, so the pair regularizer gradient below is doubled
        # to match

        #pointer to g == self.g
        fx, g = ccmpred.objfun.pll.cext.evaluate(x, self.g, self.g2, self.weights, self.msa)
        # subtract the (weighted) empirical counts computed in __init__
        g -= self.g_init

        x_single, x_pair = self.linear_to_structured(x)

        #compute regularizer
        fx_reg, g_single_reg, g_pair_reg = self.regularization(x_single, x_pair)
        g_pair_reg *= 2 #gradient is multiplied by 2 because of issue mentioned above
        g_reg = self.structured_to_linear(g_single_reg, g_pair_reg)
        fx += fx_reg

        return fx, g, g_reg

    def get_parameters(self):
        """Return the settings of this objective as a serializable dict."""
        return {'padding' : True,
                'pseudocounts': False}

    def __repr__(self):
        return "PLL "
94 |
95 |
--------------------------------------------------------------------------------
/ccmpred/objfun/pll/cext/__init__.py:
--------------------------------------------------------------------------------
import numpy as np
import numpy.ctypeslib as npct
import ctypes
import os.path

# ctypes argument specs: contiguous numpy buffers handed to the C extension
array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS')
array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')

# compiled shared library is expected in this package's _build directory
libpll = npct.load_library('libpll', os.path.join(os.path.dirname(__file__), '_build'))

# double evaluate_pll(double *x, double *g, double *g2, double *weights,
#                     uint8 *msa, uint32 ncol, uint32 nrow)
libpll.evaluate_pll.restype = ctypes.c_double
libpll.evaluate_pll.argtypes = [
    array_1d_float, # *x
    array_1d_float, # *g
    array_1d_float, # *g2
    array_1d_float, # *weights
    array_2d_char, # *msa
    ctypes.c_uint32, # ncol
    ctypes.c_uint32, # nrow
]
22 |
def evaluate(x, g, g2, weights, msa):
    """Evaluate the pseudo-log-likelihood via the C extension.

    The gradient buffers g and g2 are filled in place by the C code;
    returns (fx, g).
    """
    nrow, ncol = msa.shape
    return libpll.evaluate_pll(x, g, g2, weights, msa, ncol, nrow), g
27 |
--------------------------------------------------------------------------------
/ccmpred/objfun/pll/cext/pll.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include
4 | #include
5 | #include
6 | #include
7 | #include
8 |
9 | #include "pll.h"
10 |
11 | double evaluate_pll(
12 | const double *x,
13 | double *g,
14 | double *g2,
15 | double *weights,
16 | unsigned char *msa,
17 | const uint32_t ncol,
18 | const uint32_t nrow
19 | ) {
20 | uint32_t nsingle = ncol * N_ALPHA;
21 | uint32_t nsingle_padded = nsingle + N_ALPHA_PAD - (nsingle % N_ALPHA_PAD);
22 | uint64_t nvar_padded = nsingle_padded + ncol * ncol * N_ALPHA * N_ALPHA_PAD;
23 |
24 | const double *x1 = x;
25 | const double *x2 = &x[nsingle_padded];
26 |
27 | double *g1 = g;
28 | double *g2l = &g[nsingle_padded];
29 |
30 | // set fx and gradient to 0 initially
31 | double fx = 0.0;
32 |
33 | //gradient for single and pair potentials
34 | memset(g, 0, sizeof(double) * nvar_padded);
35 | //gradient only for pair potentials
36 | memset(g2, 0, sizeof(double) * (nvar_padded - nsingle_padded));
37 |
38 | double *precomp_norm = malloc(sizeof(double) * N_ALPHA * nrow * ncol);
39 |
40 | //#pragma omp parallel for reduction(+:fx)
41 | //iterate over ALL pairs (not only i log Z_nj
52 | for(int a = 0; a < N_ALPHA - 1; a++) {
53 | precomp[a] = V(a, j);
54 |
55 | for(uint32_t i = 0; i < ncol; i++) {
56 | unsigned char xni = X(n, i);
57 |
58 | //ignore gaps
59 | if (xni < N_ALPHA - 1) {
60 | precomp[a] += W(a, j, xni, i);
61 | }
62 | }
63 |
64 | precomp_sum += exp(precomp[a]);
65 | }
66 | precomp[N_ALPHA - 1] = 0; // set precomp(gap) to zero
67 | precomp_sum = log(precomp_sum);
68 |
69 |
70 | // compute exp(V_j(a) + sum(i < L) w_{ji}(a, x_ni)) / Z_nj
71 | // needed for gradient computation
72 | // --> exp(precomp) / exp(log(Z))
73 | // --> exp(precomp - log(Z))
74 | //ignore gaps!
75 | for(int a = 0; a < N_ALPHA - 1; a++) {
76 | precomp_norm[(n * N_ALPHA + a) * ncol + j] = exp(precomp[a] - precomp_sum);
77 | }
78 | precomp_norm[(n * N_ALPHA + N_ALPHA - 1) * ncol + j] = 0;
79 |
80 |
81 |
82 | unsigned char xnj = X(n,j);
83 |
84 | // actually add up the function value if x_nj is not a gap
85 | // * -1.0 because we are using negative log likelihood
86 | // weight(n) * (precomp( x_nj ) - log Z_nj)
87 | // weight(n) * ( V_j(x_nj) + sum(i < L) w_{ji}(x_nj, x_ni) - log Z_nj)
88 |
89 | if(xnj < N_ALPHA - 1) {
90 | fx += weight * (precomp_sum - precomp[xnj]);
91 | }
92 |
93 | } // nj
94 |
95 |
96 | //compute gradients for single emissions
97 | #pragma omp parallel for
98 | for(uint32_t nj = 0; nj < nrow * ncol; nj++) {
99 | uint32_t n = nj / ncol;
100 | uint32_t j = nj % ncol;
101 | unsigned char xnj = X(n,j);
102 | double weight = weights[n];
103 |
104 | //if xnj is not a gap: add second part of gradient
105 | if(xnj < N_ALPHA - 1) {
106 |
107 | for(uint32_t a = 0; a < N_ALPHA - 1; a++) {
108 | #pragma omp atomic
109 | G1(a, j) += weight * precomp_norm[(n * N_ALPHA + a) * ncol + j];
110 | }
111 | } else {
112 | //otherwise set precomp_norm to zero so that no count will be added to G2
113 | for(uint32_t a = 0; a < N_ALPHA; a++) {
114 | precomp_norm[(n * N_ALPHA + a) * ncol + j] = 0;
115 | }
116 | }
117 |
118 | } // nj
119 |
120 | //compute gradients for pair emissions
121 | #pragma omp parallel for
122 | //iterate over WHOLE matrix (not only iresidue j: {1}
score: {2}".format(
66 | plot_matrix.residue_i.tolist()[i],
67 | plot_matrix.residue_j.tolist()[i],
68 | np.round(plot_matrix.confidence.tolist()[i], decimals=3))
69 | for i in range(len(plot_matrix.residue_i.tolist()))]
70 |
71 | hover_text += ["residue i: {0}
residue j: {1}
score: {2}".format(
72 | plot_matrix.residue_j.tolist()[i],
73 | plot_matrix.residue_i.tolist()[i],
74 | np.round(plot_matrix.confidence.tolist()[i], decimals=3))
75 | for i in range(len(plot_matrix.residue_i.tolist()))]
76 |
77 | # add predicted contact map
78 | data.append(
79 | go.Heatmap(
80 | x=plot_matrix.residue_i.tolist() + plot_matrix.residue_j.tolist(),
81 | y=plot_matrix.residue_j.tolist() + plot_matrix.residue_i.tolist(),
82 | z=plot_matrix.confidence.tolist() + plot_matrix.confidence.tolist(),
83 | name='predicted',
84 | hoverinfo="text",
85 | text=hover_text,
86 | colorscale='Greys',
87 | reversescale=True,
88 | colorbar=dict(
89 | x = 1,
90 | y = 0.4,
91 | yanchor = 'bottom',
92 | len = 0.4,
93 | title = "Score"
94 | )
95 | )
96 | )
97 |
98 |
99 | # if distances and class are available
100 | if 'contact' in plot_matrix and 'distance' in plot_matrix:
101 |
102 | # colorscale from red (small distance) to blue(large distance)
103 | zmax = np.max(plot_matrix.distance)
104 | percent_at_contact_thr = 8 / zmax
105 | distance_colorscale = [[0, 'rgb(128, 0, 0)'],
106 | [percent_at_contact_thr, 'rgb(255, 255, 255)'],
107 | [1, 'rgb(22, 96, 167)']]
108 |
109 |
110 | hover_text = ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
111 | plot_matrix.residue_i.tolist()[i],
112 | plot_matrix.residue_j.tolist()[i],
113 | np.round(plot_matrix.confidence.tolist()[i], decimals=3),
114 | np.round(plot_matrix.distance.tolist()[i], decimals=3))
115 | for i in range(len(plot_matrix.residue_i.tolist()))]
116 |
117 | hover_text += ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
118 | plot_matrix.residue_j.tolist()[i],
119 | plot_matrix.residue_i.tolist()[i],
120 | np.round(plot_matrix.confidence.tolist()[i], decimals=3),
121 | np.round(plot_matrix.distance.tolist()[i], decimals=3))
122 | for i in range(len(plot_matrix.residue_i.tolist()))]
123 |
124 | # define triangle on opposite site of Predictions
125 | data.append(
126 | go.Heatmap(
127 | x=plot_matrix.residue_j.tolist(),
128 | y=plot_matrix.residue_i.tolist(),
129 | z=plot_matrix.distance.tolist(),
130 | name='observed',
131 | hoverinfo="text",
132 | text=hover_text,
133 | zmin=0,
134 | zmax=zmax,
135 | colorscale=distance_colorscale,
136 | colorbar=dict(
137 | x=1,
138 | y=0,
139 | yanchor='bottom',
140 | len=0.4,
141 | title="Distance [A]")
142 | )
143 | )
144 |
145 |
146 | # define true and false positives among the L/5 highest scores
147 | sub_L5_true = plot_matrix.query('distance > 0').head(int(L / 5)).query('contact > 0')
148 | sub_L5_false = plot_matrix.query('distance > 0').head(int(L / 5)).query('contact < 1')
149 |
150 | tp_text = ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
151 | sub_L5_true.residue_i.tolist()[i],
152 | sub_L5_true.residue_j.tolist()[i],
153 | np.round(sub_L5_true.confidence.tolist()[i], decimals=3),
154 | np.round(sub_L5_true.distance.tolist()[i], decimals=3))
155 | for i in range(len(sub_L5_true.residue_i.tolist()))]
156 |
157 | tp_text += ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
158 | sub_L5_true.residue_j.tolist()[i],
159 | sub_L5_true.residue_i.tolist()[i],
160 | np.round(sub_L5_true.confidence.tolist()[i], decimals=3),
161 | np.round(sub_L5_true.distance.tolist()[i], decimals=3))
162 | for i in range(len(sub_L5_true.residue_i.tolist()))]
163 |
164 | if len(sub_L5_true) > 0:
165 | # Mark TP and FP in the plot with little crosses
166 | data.append(
167 | go.Scatter(
168 | x=sub_L5_true['residue_i'].tolist() + sub_L5_true['residue_j'].tolist(),
169 | y=sub_L5_true['residue_j'].tolist() + sub_L5_true['residue_i'].tolist(),
170 | mode='markers',
171 | text=tp_text,
172 | hoverinfo="text",
173 | marker=dict(
174 | symbol=134,
175 | color="green",
176 | line=dict(width=2),
177 | size=12
178 | ),
179 | name="TP (L/5)"
180 | )
181 | )
182 |
183 | fp_text = ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
184 | sub_L5_false.residue_i.tolist()[i],
185 | sub_L5_false.residue_j.tolist()[i],
186 | np.round(sub_L5_false.confidence.tolist()[i], decimals=3),
187 | np.round(sub_L5_false.distance.tolist()[i], decimals=3))
188 | for i in range(len(sub_L5_false.residue_i.tolist()))]
189 |
190 | fp_text += ["residue i: {0}
residue j: {1}
score: {2}
distance: {3}".format(
191 | sub_L5_false.residue_j.tolist()[i],
192 | sub_L5_false.residue_i.tolist()[i],
193 | np.round(sub_L5_false.confidence.tolist()[i], decimals=3),
194 | np.round(sub_L5_false.distance.tolist()[i], decimals=3))
195 | for i in range(len(sub_L5_false.residue_i.tolist()))]
196 |
197 | if len(sub_L5_false) > 0:
198 | data.append(
199 | go.Scatter(
200 | x=sub_L5_false['residue_i'].tolist() + sub_L5_false['residue_j'].tolist(),
201 | y=sub_L5_false['residue_j'].tolist() + sub_L5_false['residue_i'].tolist(),
202 | mode='markers',
203 | text=fp_text,
204 | hoverinfo="text",
205 | marker=dict(
206 | symbol=134,
207 | color="red",
208 | line=dict(width=2),
209 | size=12
210 | ),
211 | name="FP (L/5)"
212 |
213 | )
214 | )
215 |
216 |
217 |
218 |
219 | # add diagonal and diagonals marking sequence separation
220 | data.append(go.Scatter(
221 | x=[0, L], y=[0, L],
222 | mode='lines',
223 | line=dict(color=('rgb(0, 0, 0)'), width=4),
224 | hoverinfo=None,
225 | showlegend=False)
226 | )
227 | data.append(
228 | go.Scatter(
229 | x=[0, L - seqsep + 1], y=[seqsep - 1, L],
230 | mode='lines',
231 | line=dict(color=('rgb(0, 0, 0)'), width=2),
232 | showlegend=False)
233 | )
234 | data.append(
235 | go.Scatter(
236 | x=[seqsep - 1, L], y=[0, L - seqsep + 1],
237 | mode='lines',
238 | line=dict(color=('rgb(0, 0, 0)'), width=2),
239 | showlegend=False)
240 | )
241 |
242 |
243 | fig = tools.make_subplots(rows=2, cols=1, shared_xaxes=True, print_grid=False)
244 |
245 |
246 | if gaps_percentage_plot is not None:
247 | for trace in gaps_percentage_plot['data']:
248 | fig.append_trace(trace, 1, 1)
249 |
250 | for trace in data:
251 | fig.append_trace(trace, 2, 1)
252 |
253 | fig['layout']['title'] = title
254 | fig['layout']['width'] = 1000
255 | fig['layout']['height'] = 1000
256 | fig['layout']['legend'] = {'x': 1, 'y': 1} # places legend to the right of plot
257 | fig['layout']['hovermode'] = "closest"
258 |
259 | fig['layout']['xaxis1']['title'] = 'i'
260 | fig['layout']['xaxis1']['range'] = [0.5, L + 0.5]
261 | fig['layout']['xaxis1']['domain'] = [0.0, 1.0]
262 | fig['layout']['xaxis1']['zeroline'] = False
263 |
264 | fig['layout']['yaxis2']['title'] = 'j'
265 | fig['layout']['yaxis2']['range'] = [0.5, L + 0.5]
266 | fig['layout']['yaxis2']['domain'] = [0.0, 1.0]
267 | fig['layout']['yaxis2']['scaleanchor'] = "x"
268 | fig['layout']['yaxis2']['scaleratio'] = 1.0
269 | fig['layout']['yaxis2']['zeroline'] = False
270 |
271 | fig['layout']['font']['size'] = 18
272 |
273 | #percentage gaps and entropy plot
274 | if gaps_percentage_plot is not None:
275 | fig['layout']['yaxis2']['domain'] = [0.0, 0.9]
276 | #fig['layout']['xaxis1']['domain'] = [0.0, 0.9]
277 | fig['layout']['yaxis1']['domain'] = [0.9, 1.0]
278 |
279 |
280 |
281 | if plot_file:
282 | plotly_plot(fig, filename=plot_file, auto_open=False, show_link=False)
283 | else:
284 | return fig
285 |
def plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        plot_out):
    """Scatter-plot alignment statistics of a model sample vs the original alignment.

    Three side-by-side panels compare observed (natural) against sampled
    (model) statistics: single-site amino acid frequencies, pairwise
    frequencies (upper triangle i<j only), and covariances
    f_ij(a,b) - f_i(a)*f_j(b). Each panel shows the identity line and is
    annotated with the Pearson correlation coefficient. The figure is
    written to `plot_out` as an HTML file.

    single_freq_*: arrays of shape (L, 20+) -- only columns 0..19 are used.
    pairwise_freq_*: arrays of shape (L, L, 20+, 20+).
    """

    L = single_freq_observed.shape[0]
    indices_upper_triangle_i, indices_upper_triangle_j = np.triu_indices(L, k=1)

    x_single = single_freq_observed.flatten().tolist()
    y_single = single_freq_sampled.flatten().tolist()
    pair_freq_observed = pairwise_freq_observed[
        indices_upper_triangle_i,
        indices_upper_triangle_j, :, :].flatten().tolist()
    pair_freq_sampled = pairwise_freq_sampled[
        indices_upper_triangle_i,
        indices_upper_triangle_j, :, :].flatten().tolist()
    cov_observed = [pairwise_freq_observed[i, j, a, b] - (single_freq_observed[i, a] * single_freq_observed[j, b])
                    for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]
    cov_sampled = [pairwise_freq_sampled[i, j, a, b] - (single_freq_sampled[i, a] * single_freq_sampled[j, b])
                   for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]


    ## first trace: single amino acid frequencies
    # NOTE: the "<br>" markup in the hover strings had been lost in a bad
    # merge (broken multi-line string literals); restored here.
    trace_single_frequencies = go.Scattergl(
        x=x_single,
        y=y_single,
        mode='markers',
        name='single frequencies',
        text=["position: {0}<br>amino acid: {1}".format(
            i + 1, io.AMINO_ACIDS[a]) for i in range(L) for a in range(20)],
        marker=dict(color='black'),
        opacity=0.1,
        showlegend=False
    )
    pearson_corr_single = np.corrcoef(x_single, y_single)[0, 1]

    ## second trace: pairwise amino acid frequencies
    pair_freq_annotation = ["position: {0}-{1}<br>amino acid: {2}-{3}".format(
        i + 1,
        j + 1,
        io.AMINO_ACIDS[a],
        io.AMINO_ACIDS[b]) for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)]
    trace_pairwise_frequencies = go.Scattergl(
        x=pair_freq_observed,
        y=pair_freq_sampled,
        mode='markers',
        name='pairwise frequencies',
        text=pair_freq_annotation,
        marker=dict(color='black'),
        opacity=0.1,
        showlegend=False
    )
    pearson_corr_pair = np.corrcoef(pair_freq_observed, pair_freq_sampled)[0, 1]

    ## third trace: covariances (same i,j,a,b ordering as the pair annotation)
    trace_cov = go.Scattergl(
        x=cov_observed,
        y=cov_sampled,
        mode='markers',
        name='covariances',
        text=pair_freq_annotation,
        marker=dict(color='black'),
        opacity=0.1,
        showlegend=False
    )
    pearson_corr_cov = np.corrcoef(cov_observed, cov_sampled)[0, 1]


    # identity-line endpoints spanning the data range of each panel
    diag_single = [np.min(x_single + y_single), np.max(x_single + y_single)]
    diag_pair = [np.min(pair_freq_observed + pair_freq_sampled), np.max(pair_freq_observed + pair_freq_sampled)]
    diag_cov = [np.min(cov_observed + cov_sampled), np.max(cov_observed + cov_sampled)]


    diagonal_single = go.Scattergl(
        x=diag_single,
        y=diag_single,
        mode="lines",
        showlegend=False,
        marker=dict(color='rgb(153, 204, 255)')
    )

    diagonal_pair = go.Scattergl(
        x=diag_pair,
        y=diag_pair,
        mode="lines",
        showlegend=False,
        marker=dict(color='rgb(153, 204, 255)')
    )

    diagonal_cov = go.Scattergl(
        x=diag_cov,
        y=diag_cov,
        mode="lines",
        showlegend=False,
        marker=dict(color='rgb(153, 204, 255)')
    )


    ## define subplots
    fig = tools.make_subplots(
        rows=1,
        cols=3,
        subplot_titles=["single site amino acid frequencies", "pairwise amino acid frequencies", "covariances"],
        horizontal_spacing=0.05,
        print_grid=False
    )

    ## add traces as subplots
    fig.append_trace(trace_single_frequencies, 1, 1)
    fig.append_trace(diagonal_single, 1, 1)
    fig.append_trace(trace_pairwise_frequencies, 1, 2)
    fig.append_trace(diagonal_pair, 1, 2)
    fig.append_trace(trace_cov, 1, 3)
    fig.append_trace(diagonal_cov, 1, 3)


    # increase size of subplot titles
    fig.layout.annotations[0].font.size = 20
    fig.layout.annotations[1].font.size = 20
    fig.layout.annotations[2].font.size = 20


    # add text to plot: Pearson correlation coefficient of each panel
    annotation_single = go.layout.Annotation(
        dict(
            x=0.13,
            y=0.04,
            xanchor="left",
            xref='paper',
            yref='paper',
            text='Pearson r = ' + str(np.round(pearson_corr_single, decimals=3)),
            bgcolor="white",
            showarrow=False
        )
    )

    annotation_pair = go.layout.Annotation(
        dict(
            x=0.48,
            y=0.04,
            xanchor="left",
            xref='paper',
            yref='paper',
            text='Pearson r = ' + str(np.round(pearson_corr_pair, decimals=3)),
            bgcolor="white",
            showarrow=False
        )
    )

    annotation_cov = go.layout.Annotation(
        dict(
            x=0.85,
            y=0.04,
            xanchor="left",
            xref='paper',
            yref='paper',
            text='Pearson r = ' + str(np.round(pearson_corr_cov, decimals=3)),
            bgcolor="white",
            showarrow=False
        )
    )

    fig.layout.annotations += (annotation_single, annotation_pair, annotation_cov)

    # define layout
    fig['layout'].update(
        font=dict(size=20),
        hovermode='closest',
        width=1500,
        height=500,
        margin=dict(t=40)
    )


    # specify axis layout details; lock each panel to a 1:1 aspect ratio
    fig['layout']['yaxis1'].update(
        title="statistics from MCMC sample",
        exponentformat="e",
        showexponent='all',
        scaleanchor="x1",
        scaleratio=1
    )
    fig['layout']['yaxis2'].update(
        exponentformat="e",
        showexponent='all',
        scaleanchor="x2",
        scaleratio=1
    )
    fig['layout']['yaxis3'].update(
        exponentformat="e",
        showexponent='all',
        scaleanchor="x3",
        scaleratio=1
    )
    fig['layout']['xaxis1'].update(
        exponentformat="e",
        showexponent='all',
        scaleanchor="y1",
        scaleratio=1,
        showspikes=True
    )
    fig['layout']['xaxis2'].update(
        title="statistics from natural sequences",
        exponentformat="e",
        showexponent='all',
        scaleanchor="y2",
        scaleratio=1
    )
    fig['layout']['xaxis3'].update(
        exponentformat="e",
        showexponent='all',
        scaleanchor="y3",
        scaleratio=1
    )

    # frequencies live in [0, 1]; the covariance panel keeps its data range
    fig['layout']['xaxis1']['range'] = [0, 1]
    fig['layout']['xaxis2']['range'] = [0, 1]
    fig['layout']['yaxis1']['range'] = [0, 1]
    fig['layout']['yaxis2']['range'] = [0, 1]


    plotly_plot(fig, filename=plot_out, auto_open=False, show_link=False, image_filename=plot_out.replace("html", ""))
512 |
def plot_alignment(aa_counts_single, title, plot_file, freq=True):
    """Render per-position amino acid counts as a stacked bar chart (HTML).

    Parameters
    ----------
    aa_counts_single : ndarray (L, >=20) of weighted amino acid counts per column
    title : plot title
    plot_file : output HTML file path
    freq : if True, plot relative frequencies (counts / Neff) instead of raw counts

    The input array is left unmodified.
    """

    # Neff is taken from the first column's total count -- assumes every
    # column sums to the same effective sequence count (TODO confirm)
    Neff = np.sum(aa_counts_single[0, :])
    L = aa_counts_single.shape[0]

    # BUG FIX: normalize a copy -- the previous in-place `aa_counts_single /= Neff`
    # silently destroyed the caller's count matrix when freq=True.
    bar_heights = np.array(aa_counts_single, dtype='float64')
    if freq:
        bar_heights /= Neff

    # one stacked bar segment per amino acid at every alignment position
    data = [
        go.Bar(
            x=list(range(1, L + 1)),
            y=bar_heights[:, aa].tolist(),
            showlegend=True,
            name=io.AMINO_ACIDS[aa]
        )
        for aa in range(20)
    ]

    layout = go.Layout(
        barmode='stack',
        title=title,
        xaxis=dict(title="Alignment Position"),
        yaxis=dict(
            title="Amino Acid Distribution",
            exponentformat='e',
            # FIX: plotly only accepts 'all'/'first'/'last'/'none'; 'All' is invalid
            showexponent='all'
        ),
        font=dict(size=18)
    )

    plot = {'data': data, 'layout': layout}

    plotly_plot(plot, filename=plot_file, auto_open=False, link_text='')
552 |
--------------------------------------------------------------------------------
/ccmpred/pseudocounts.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import ccmpred.counts
4 | import ccmpred.substitution_matrices
5 |
6 |
class PseudoCounts(object):
    """Add pseudocounts to prevent vanishing amino acid frequencies"""

    def __init__(self, msa, weights):
        # msa: N x L alignment of integer-coded amino acids
        # weights: per-sequence weights, or None (every sequence counts as 1)

        self.msa = msa
        self.N, self.L = self.msa.shape
        self.weights=weights
        # effective number of sequences: sum of weights, or plain N if unweighted
        self.neff = np.sum(weights) if self.weights is not None else self.N

        #with weights
        # (single_counts, pair_counts) computed once from the weighted alignment
        self.counts = ccmpred.counts.both_counts(self.msa, self.weights)
        self.freqs = None

        # parameters recorded by the last calculate_frequencies() call
        self.pseudocount_n_single = None
        self.pseudocount_n_pair = None
        self.pseudocount_type = None
        self.remove_gaps = None
        self.pseudocount_ratio_single = None
        self.pseudocount_ratio_pair = None

        #will be computed from Freq with pseudo-counts and Neff
        self.Ni = None
        self.Nij = None


    def calculate_Ni(self, freqs_single=None):
        # Per-column non-gapped (weighted) sequence counts, stored in self.Ni.

        if freqs_single is not None:
            #counts may include pseudo-counts
            single_counts = freqs_single * self.neff
        else:
            # NOTE(review): this branch aliases self.counts, so the gap reset
            # below zeroes the stored single counts *in place* -- confirm
            # that mutation is intended
            single_counts, pair_counts = self.counts

        # reset gap counts
        single_counts[:, 20] = 0

        Ni = single_counts.sum(1)

        self.Ni = Ni

    def calculate_Nij(self, freqs_pair=None):
        # Per-column-pair non-gapped (weighted) sequence counts, stored in self.Nij.

        if freqs_pair is not None:
            #counts may include pseudo-counts
            pair_counts = freqs_pair * self.neff
        else:
            # NOTE(review): aliases self.counts, so the gap reset below
            # mutates the stored pair counts in place -- confirm intended
            single_counts, pair_counts = self.counts

        # reset gap counts
        pair_counts[:, :, :, 20] = 0
        pair_counts[:, :, 20, :] = 0

        # non_gapped counts
        Nij = pair_counts.sum(3).sum(2)

        self.Nij = Nij

    def calculate_global_aa_freq(self):
        # Average amino acid frequencies over all columns, gaps removed.
        # Returns a length-20 vector.

        single_counts, _ = self.counts

        #normalized with gaps
        single_freq = single_counts / self.neff

        #single freq counts normalized without gaps
        single_freq = self.degap(single_freq, True)


        return np.mean(single_freq[:, :20], axis=0)[np.newaxis, :][0]

    def calculate_frequencies(self, pseudocount_type, pseudocount_n_single=1, pseudocount_n_pair=None, remove_gaps=False):
        """Compute single and pair frequencies with admixed pseudocounts.

        pseudocount_type names one of the *_pseudocounts methods of this
        class (e.g. "uniform_pseudocounts") and is resolved via getattr.
        Results are stored in self.freqs; self.Ni / self.Nij are updated
        from the pseudocounted frequencies.
        """

        # NOTE(review): self.pseudocount_n_pair keeps the raw argument, so it
        # remains None when the pair count defaults to the single count below
        self.pseudocount_n_single = pseudocount_n_single
        self.pseudocount_n_pair = pseudocount_n_pair
        self.pseudocount_type = pseudocount_type
        self.remove_gaps = remove_gaps

        single_counts, pair_counts = self.counts

        if pseudocount_n_pair is None:
            pseudocount_n_pair = pseudocount_n_single


        # admixture coefficient tau = n_pc / (neff + n_pc)
        self.pseudocount_ratio_single = pseudocount_n_single / (self.neff + pseudocount_n_single)
        self.pseudocount_ratio_pair = pseudocount_n_pair / (self.neff + pseudocount_n_pair)

        #frequencies are normalized WITH gaps
        single_freq = single_counts / self.neff
        pair_freq = pair_counts / self.neff

        if (remove_gaps):
            single_freq = self.degap(single_freq,True)
            pair_freq = self.degap(pair_freq, True)

        # resolve the pseudocount model by name
        pcounts = getattr(self, pseudocount_type)(single_freq)

        # admix pseudocounts; the pair term mixes the covariance part
        # (pair_freq - outer product) with the pseudocounted outer product
        single_freq_pc = (1 - self.pseudocount_ratio_single) * single_freq + self.pseudocount_ratio_single * pcounts
        pair_freq_pc = ((1 - self.pseudocount_ratio_pair) ** 2) * \
                       (pair_freq - single_freq[:, np.newaxis, :, np.newaxis] * single_freq[np.newaxis, :, np.newaxis, :]) + \
                       (single_freq_pc[:, np.newaxis, :, np.newaxis] * single_freq_pc[np.newaxis, :, np.newaxis, :])

        self.freqs = single_freq_pc, pair_freq_pc

        #compute weighted non-gapped sequence counts
        self.calculate_Ni(single_freq_pc)
        self.calculate_Nij(pair_freq_pc)

    @staticmethod
    def degap(freq, keep_dims=False):
        """Renormalize frequencies over the 20 amino acids, excluding gaps.

        Works on single (L, 21) or pair (L, L, 21, 21) frequency arrays.
        With keep_dims=True the gap dimension is kept (zero-filled) so the
        output has the same shape as the input.
        """
        if len(freq.shape) == 2 :
            # single case: divide by the non-gap mass per column
            out = freq[:, :20] / (1 - freq[:, 20])[:, np.newaxis]
        else:
            # pair case: renormalize each 20x20 block; epsilon guards against /0
            freq_sum = freq[:,:,:20, :20].sum(3).sum(2)[:, :, np.newaxis, np.newaxis]
            out = freq[:, :, :20, :20] / (freq_sum + 1e-10)

        if keep_dims:
            if len(freq.shape) == 2 :
                out2 = np.zeros((freq.shape[0], 21))
                out2[:, :20] = out
            else:
                out2 = np.zeros((freq.shape[0], freq.shape[1], 21, 21))
                out2[:, :, :20, :20] = out
            out = out2

        return out

    def uniform_pseudocounts(self, single_freq):
        # uniform distribution over the full alphabet (including the gap state)
        uniform_pc = np.zeros_like(single_freq)
        uniform_pc.fill(1. / single_freq.shape[1])
        return uniform_pc

    def constant_pseudocounts(self, single_freq):
        # position-independent background: mean frequency over all columns
        return np.mean(single_freq, axis=0)[np.newaxis, :]

    def substitution_matrix_pseudocounts(self, single_freq, substitution_matrix=ccmpred.substitution_matrices.BLOSUM62):
        r"""
        Substitution matrix pseudocounts

        $\tilde{q}(x_i = a) = \sum_{b=1}^{20} p(a | b) q_0(x_i = b)$
        """
        single_freq_degap = self.degap(single_freq)

        # $p(b) = \sum{a=1}^{20} p(a, b)$
        pb = np.sum(substitution_matrix, axis=0)

        # p(a | b) = p(a, b) / p(b)
        cond_prob = substitution_matrix / pb[np.newaxis, :]

        freqs_pc = np.zeros_like(single_freq)
        freqs_pc[:, :20] = np.sum(cond_prob[np.newaxis, :, :] * single_freq_degap[:, np.newaxis, :], axis=2)

        return freqs_pc

    def no_pseudocounts(self, single_freq):
        # identity: use the raw frequencies unchanged
        return single_freq
164 |
--------------------------------------------------------------------------------
/ccmpred/raw/__init__.py:
--------------------------------------------------------------------------------
1 | from ccmpred.raw.ccmraw import parse_oldraw, parse_msgpack, parse, write_msgpack, write_oldraw, CCMRaw
2 |
--------------------------------------------------------------------------------
/ccmpred/raw/ccmraw.py:
--------------------------------------------------------------------------------
1 | import msgpack
2 | import functools
3 | import numpy as np
4 | import re
5 | import json
6 | import gzip
7 | from six import string_types, StringIO
8 |
9 |
10 | META_PREFIX = "#>META> "
11 |
12 |
class CCMRaw(object):
    """Storage class for CCMpred raw prediction"""
    def __init__(self, ncol, x_single, x_pair, meta):
        self.ncol = ncol          # number of alignment columns
        self.x_single = x_single  # single emission potentials, shape (ncol, 20)
        self.x_pair = x_pair      # pair emission potentials, shape (ncol, ncol, 21, 21)
        self.meta = meta          # optional metadata dict, or None

    def __repr__(self):
        # BUG FIX: the format string was empty ("".format(...)), so repr()
        # always returned "" -- now shows the column count
        return "<CCMRaw ncol={0}>".format(self.ncol)
23 |
24 |
def stream_or_file(mode='r'):
    """Decorator for making a function accept either a filename or file-like object as a first argument

    When called with a string, the file is opened (transparently supporting
    gzip for names ending in ".gz"), passed to the wrapped function, and
    closed afterwards. Any other argument is passed through unchanged.
    """

    def inner(fn):
        @functools.wraps(fn)
        def streamify(f, *args, **kwargs):
            if isinstance(f, string_types):

                open_fn = gzip.open if f.endswith(".gz") else open

                # BUG FIX: 'with' guarantees the handle is closed even on
                # error; the original try/finally referenced fh in finally,
                # raising NameError if open_fn itself failed
                with open_fn(f, mode) as fh:
                    return fn(fh, *args, **kwargs)
            else:
                return fn(f, *args, **kwargs)

        return streamify

    return inner
48 |
49 |
# Registry of parse functions tried in order by parse() below.
_PARSERS = []


def parser(fn):
    # Decorator: register fn as a candidate parser and return it unchanged.
    _PARSERS.append(fn)
    return fn
56 |
57 |
@parser
@stream_or_file('rb')
def parse_msgpack(f):
    """Parse a msgpack CCMpred prediction from a filename or file object"""
    # NOTE(review): the encoding= keyword was removed in msgpack-python 1.0
    # (raw=False is the modern spelling) -- confirm the pinned version
    unpacked = msgpack.unpackb(f.read(), encoding="utf-8")

    assert(unpacked['format'] == 'ccm-1')

    ncol = unpacked['ncol']
    x_single = np.array(unpacked['x_single']).reshape((ncol, 20))
    x_pair = np.zeros((ncol, ncol, 21, 21))

    meta = unpacked.get('meta')

    # fill both triangles of the pair tensor from the stored i<j entries
    for entry in unpacked['x_pair'].values():
        i, j = entry['i'], entry['j']
        block = np.array(entry['x']).reshape((21, 21))
        x_pair[i, j, :, :] = block
        x_pair[j, i, :, :] = block.T

    return CCMRaw(ncol, x_single, x_pair, meta)
80 |
81 |
@parser
@stream_or_file('r')
def parse_oldraw(f):
    """Read raw emission potentials from rawfile

    Format: the single potentials as a whitespace matrix, then for each
    column pair a '# i j' header followed by its matrix, and optionally a
    JSON metadata line prefixed with META_PREFIX.
    """

    buf = StringIO()
    # BUG FIX: raw string -- "\s"/"\d" in a plain literal are invalid
    # escape sequences (DeprecationWarning, SyntaxError in future Pythons)
    re_identifier = re.compile(r"^#\s*(\d+)\s+(\d+)\s*$")

    x_single = None
    x_pair = None
    i, j = None, None
    meta = None
    for line_idx, line in enumerate(f):
        if line.startswith(META_PREFIX):
            meta = json.loads(line[len(META_PREFIX):].strip())

        elif line.startswith("#"):

            # a header line terminates the buffered matrix: flush it
            buf.seek(0)

            if x_single is not None:
                x_pair[i, j, :, :] = np.loadtxt(buf)
                x_pair[j, i, :, :] = x_pair[i, j, :, :].T

            else:
                # first matrix in the file is the single potentials
                x_single = np.loadtxt(buf)

                ncol = x_single.shape[0]
                x_pair = np.zeros((ncol, ncol, 21, 21))

            buf = StringIO()

            m = re_identifier.match(line)
            if m:
                i, j = int(m.group(1)), int(m.group(2))

            else:
                raise Exception("Line {0} starts with # but doesn't match regex!".format(line_idx + 1))

        else:
            buf.write(line)

    # flush the final pair matrix (no trailing header to trigger it)
    if x_single is not None and buf.tell():
        buf.seek(0)
        x_pair[i, j, :, :] = np.loadtxt(buf)
        x_pair[j, i, :, :] = x_pair[i, j, :, :].T

    return CCMRaw(ncol, x_single, x_pair, meta)
130 |
131 |
def parse(f):
    """Try each registered parser in turn and return the first successful result.

    Parameters
    ----------
    f : str or file-like
        Filename or stream containing a CCMpred prediction.

    Returns
    -------
    CCMRaw or None
        The parsed prediction, or None if no parser could handle the input.
    """
    for parser_fn in _PARSERS:
        # rewind between attempts so every parser sees the stream from the start
        if hasattr(f, 'seek'):
            f.seek(0)

        try:
            r = parser_fn(f)
        except Exception:
            # this parser cannot handle the format -- try the next one
            continue

        # BUG FIX: the original did 'continue' on success and then hit an
        # unconditional 'return r' inside the loop, returning None right
        # after the first *failed* parser instead of trying the rest
        if r is not None:
            return r

    return None
146 |
147 |
@stream_or_file('wb')
def write_msgpack(f, data):
    """Serialize a CCMRaw prediction to the msgpack 'ccm-1' format."""

    # flatten the single potentials into one long list
    flat_single = data.x_single.reshape(data.ncol * 20).tolist()

    # one entry per column pair (i < j), keyed "i/j"
    pair_entries = {
        "{0}/{1}".format(i, j): {
            "i": i,
            "j": j,
            "x": data.x_pair[i, j, :, :].reshape(21 * 21).tolist()
        }
        for i in range(data.ncol)
        for j in range(i + 1, data.ncol)
    }

    payload = {
        "format": "ccm-1",
        "ncol": data.ncol,
        "x_single": flat_single,
        "x_pair": pair_entries
    }

    if data.meta:
        payload['meta'] = data.meta

    f.write(msgpack.packb(payload))
172 |
173 |
@stream_or_file('wb')
def write_oldraw(f, data):
    """Write a CCMRaw prediction in the legacy flat-file format.

    Layout: the single potentials as one tab-separated matrix, then for
    each column pair i<j a '# i j' header line followed by its matrix,
    and finally an optional JSON metadata line prefixed with META_PREFIX.
    """
    # single potentials first (np.savetxt accepts binary file handles)
    np.savetxt(f, data.x_single, delimiter="\t")

    for i in range(data.ncol):
        for j in range(i + 1, data.ncol):
            # pair-block header: '# i j'
            f.write("# {0} {1}\n".format(i, j).encode("utf-8"))
            np.savetxt(f, data.x_pair[i, j], delimiter="\t")

    if data.meta:
        # metadata is written last, as a single JSON line
        f.write(META_PREFIX.encode("utf-8") + json.dumps(data.meta).encode("utf-8") + b"\n")
185 |
if __name__ == '__main__':
    # Ad-hoc smoke test: load a sample prediction and dump its contents.
    # data = parse_oldraw("data/test.raw")
    data = parse_msgpack("data/test.braw")

    print("data:")
    print(data)

    print("data.x_single.shape:")
    print(data.x_single.shape)

    print("data.x_single:")
    print(data.x_single)

    print("data.x_pair.shape:")
    print(data.x_pair.shape)

    print("data.x_pair[3, 4]:")
    print(data.x_pair[3, 4])
204 |
--------------------------------------------------------------------------------
/ccmpred/raw/convert_msgpack.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Convert a msgpack potential file to flatfile format"""
3 |
4 | import ccmraw as cr
5 |
6 |
def main():
    """Parse the command line and convert a msgpack raw file to the flat format."""
    import argparse

    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("in_msgpack", help="Input raw file in new msgpack format")
    arg_parser.add_argument("out_flat", help="Output raw file in old flatfile format")

    args = arg_parser.parse_args()

    # read msgpack, re-emit as legacy flat file
    cr.write_oldraw(args.out_flat, cr.parse_msgpack(args.in_msgpack))


if __name__ == '__main__':
    main()
20 |
--------------------------------------------------------------------------------
/ccmpred/raw/convert_raw.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | """Convert a raw potential file to msgpack format"""
3 |
4 | import ccmraw as cr
5 |
6 |
def main():
    """Parse the command line and convert a legacy flat raw file to msgpack."""
    import argparse

    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("in_raw", help="Input raw file in old raw format")
    arg_parser.add_argument("out_msgpack", help="Output raw file in new msgpack format")

    args = arg_parser.parse_args()

    # read legacy flat file, re-emit as msgpack
    cr.write_msgpack(args.out_msgpack, cr.parse_oldraw(args.in_raw))


if __name__ == '__main__':
    main()
20 |
--------------------------------------------------------------------------------
/ccmpred/regularization.py:
--------------------------------------------------------------------------------
1 | # coding: utf-8
2 | import numpy as np
3 |
4 |
class L2(object):
    """L2 regularization on single and pair emission potentials"""

    def __init__(self, lambda_single, lambda_pair_factor, scaling, center_x_single):
        self.lambda_single = lambda_single
        self.lambda_pair = lambda_pair_factor * scaling
        self.lambda_pair_factor = lambda_pair_factor
        self.center_x_single = center_x_single


    def __call__(self, x_single, x_pair):
        """Return (penalty, gradient_single, gradient_pair) for the given potentials."""
        # deviation of the single potentials from their prior center
        deviation = x_single - self.center_x_single[:, :x_single.shape[1]]

        # log likelihood penalty (pair potentials are symmetric, w_ijab == w_jiba,
        # hence the factor 1/2 on the pair term):
        #   lambda_single * sum_i sum_a (v_ia - center_x_single)^2
        #   lambda_pair / 2 * sum_i sum_j sum_a sum_b (w_ijab)^2
        penalty = self.lambda_single * np.sum(deviation * deviation)
        penalty += 0.5 * self.lambda_pair * np.sum(x_pair * x_pair)

        # corresponding gradients:
        #   2 * lambda_single * (v_ia - center_x_single)
        #   lambda_pair * w_ijab
        grad_single = 2 * self.lambda_single * deviation
        grad_pair = self.lambda_pair * x_pair

        return penalty, grad_single, grad_pair

    def __repr__(self):
        return "L₂ regularization (λsingle={0} λpairfactor={1} λpair={2})".format(self.lambda_single, self.lambda_pair_factor, self.lambda_pair)
36 |
37 |
--------------------------------------------------------------------------------
/ccmpred/sampling/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | import ccmpred.objfun.cd.cext
3 | import ccmpred.weighting
4 | import ccmpred.trees
5 | import ccmpred.sampling.cext
6 | import numpy as np
7 | import sys
8 | from ccmpred.io.alignment import AMINO_ACIDS
9 | from ccmpred.weighting.cext import count_ids, calculate_weights_simple
10 | import ccmpred.counts
11 | from ccmpred.pseudocounts import PseudoCounts
12 |
def gibbs_sample_sequences(x, msa_sampled, gibbs_steps):
    # Thin wrapper: delegate Gibbs sampling of msa_sampled under the model
    # potentials x to the C extension (note the swapped argument order).
    return ccmpred.objfun.cd.cext.gibbs_sample_sequences(msa_sampled, x, gibbs_steps)
15 |
def all_parents(tree):
    """Return a dict mapping every clade in `tree` to its parent clade.

    Clades are visited level by level; each clade's direct children are
    keyed to that clade.
    """
    return {
        child: clade
        for clade in tree.find_clades(order='level')
        for child in clade
    }
22 |
def mutate_along_phylogeny(tree, seq0, mutation_rate, x):
    """Evolve the ancestral sequence seq0 along `tree` and return the leaf sequences.

    Each non-root clade receives a copy of its parent's sequence mutated
    int(branch_length * mutation_rate * ncol) times via the C extension.
    """

    ncol = len(seq0)

    #assign ancestor sequence to root
    tree.clade.seq = seq0

    #get all parents
    parents = all_parents(tree)

    #iterate breadth first over tree and mutate sequences
    for clade in tree.find_clades(order="level"):
        if clade.name != "root":
            #print("parent name: {0} parent seq: {1}".format( parents[clade], parents[clade].seq))
            # number of substitutions is proportional to branch length
            nmut = int(clade.branch_length * mutation_rate * ncol)
            clade.seq =ccmpred.sampling.cext.mutate_sequence(parents[clade].seq, x, nmut, ncol)
            #print("clade name: {0} clade seq: {1}".format(clade.name, clade.seq))
            #print("---")

    #get sequences of all leave nodes
    msa_sampled = np.array([clade.seq for clade in tree.get_terminals()])

    return msa_sampled
46 |
def generate_mcmc_sample(x, ncol, msa, size=10000, burn_in=500, sample_type="original"):
    """Generate an MCMC sample of sequences from the model with potentials x.

    Parameters
    ----------
    x :
        Model potentials, passed to the Gibbs sampler (C extension).
    ncol : int
        Sequence length (number of alignment columns).
    msa : 2-dim uint8 array or None
        Original alignment; required for sample_type "aln" and "random-gapped".
    size : int
        Minimum number of sequences to sample in total.
    burn_in : int
        Number of Gibbs steps used to equilibrate each batch of start sequences.
    sample_type : str
        How start sequences are chosen: "aln" (random subset of msa),
        "random" (uniformly random sequences), or "random-gapped" (random
        sequences carrying the gap pattern of random msa sequences).

    Returns
    -------
    (samples, neff) : sampled alignment (uint8 array) and its Neff
    """

    print("Start sampling {0} sequences according to model starting with {1} sequences using burn-in={2}.".format(
        size, sample_type, burn_in))
    sys.stdout.flush()

    if msa is not None:
        N = msa.shape[0]
    else:
        N = 1000

    # sample at max 1000 sequences per iteration
    sample_size_per_it = np.min([N, 1000])

    # repeat sampling until at least `size` sequences are obtained
    repeat = int(np.ceil(size / sample_size_per_it))
    samples = np.empty([repeat * sample_size_per_it, ncol], dtype="uint8")
    for i in range(repeat):

        if sample_type == "aln":

            # random selection of sequences from original MSA
            # BUG FIX: indices must be drawn from the N sequences, not from
            # the ncol columns (the "random-gapped" branch already uses N)
            sample_seq_id = np.random.choice(N, sample_size_per_it, replace=False)
            msa_sampled = msa[sample_seq_id]

        elif sample_type == "random":

            # generate random (gap-free) sequences of length L
            msa_sampled = np.ascontiguousarray(
                [np.random.choice(20, ncol, replace=True) for _ in range(sample_size_per_it)], dtype="uint8")

        elif sample_type == "random-gapped":

            # generate random sequences of length L
            msa_sampled = np.ascontiguousarray(
                [np.random.choice(20, ncol, replace=True) for _ in range(sample_size_per_it)], dtype="uint8")

            # find gaps in randomly selected original sequences
            sample_seq_id = np.random.choice(N, sample_size_per_it, replace=False)
            msa_sampled_orig = msa[sample_seq_id]
            gap_indices = np.where(msa_sampled_orig == AMINO_ACIDS.index('-'))

            # assign gap states to random sequences
            msa_sampled[gap_indices] = AMINO_ACIDS.index('-')

        else:
            # previously an unknown sample_type crashed later with NameError
            raise ValueError("Unknown sample_type: {0}".format(sample_type))

        # burn-in phase to move away from the initial sequences
        msa_sampled = ccmpred.sampling.gibbs_sample_sequences(x, msa_sampled, gibbs_steps=burn_in)

        # add newly sampled sequences
        samples[i * sample_size_per_it: (i + 1) * sample_size_per_it] = msa_sampled
        print("sampled alignment has {0} sequences...".format((i + 1) * sample_size_per_it))
        sys.stdout.flush()

    # NOTE(review): Neff is computed on the *last batch* only, not on the
    # full `samples` array that is returned -- confirm whether intended
    neff = ccmpred.weighting.get_HHsuite_neff(msa_sampled)

    print("Sampled alignment has Neff {0:.6g}".format(neff))

    return samples, neff
107 |
def sample_with_mutation_rate(tree, nseq, seq0, x, mutation_rate):
    """Sample an alignment by evolving seq0 along `tree` at a fixed mutation rate.

    Parameters
    ----------
    tree: Tree object
        tree topology with precomputed branch_lengths and a type attribute
        ("binary" or "star")
    nseq: int
        number of sequences to return
    seq0: 2dim array
        ancestral sequence; the first row is used as the root sequence
    x:
        model potentials passed to the C mutation routine
    mutation_rate: float
        scales branch lengths into substitution counts per sequence

    Returns
    -------
    (msa_sampled, neff): sampled alignment and its effective sequence number
    """

    branch_lengths = tree.branch_lengths

    #how many substitutions per sequence will be performed
    # (the first two branch_lengths entries are skipped)
    nmut = [0]*(len(branch_lengths)-2)
    for i, bl in enumerate(branch_lengths[2:]):
        nmut[i] = bl * mutation_rate * seq0.shape[1]
    print("avg number of amino acid substitutions (parent -> child): {0}".format(
        np.round(np.mean(nmut), decimals=0)))


    # get the average number of amino acid substitution from root --> leave
    if tree.type == "binary" or tree.type == "star":
        number_splits = 1
        if tree.type == "binary":
            number_splits = np.log2(nseq)
        depth_per_clade = 1.0 /np.ceil(number_splits)
        print("avg number of amino acid substitutions (root -> leave): {0}".format(
            np.round(1 / depth_per_clade * np.mean(nmut), decimals=0)))


    # sample sequences according to tree topology
    msa_sampled = mutate_along_phylogeny(tree.tree, seq0[0], mutation_rate, x)

    # randomly choose nseq sequences from sampled msa
    if msa_sampled.shape[0] > nseq:
        msa_sampled = msa_sampled[sorted(np.random.choice(msa_sampled.shape[0], size=nseq, replace=False))]

    # compute neff of sampled sequences
    neff = ccmpred.weighting.get_HHsuite_neff(msa_sampled)

    print("\nAlignment with {0} sequences was sampled with mutation rate {1} and has Neff {2:.6g}".format(
        nseq, mutation_rate, neff))

    return msa_sampled, neff
158 |
def sample_to_neff_increasingly(tree, nseq, target_neff, ncol, x, gibbs_steps, root_seq=None):
    """Repeatedly sample alignments, adjusting the mutation rate until Neff matches.

    The mutation rate starts at 1.0 and after each attempt is perturbed by a
    random amount in [0, 1) -- upwards when Neff is too small, downwards when
    too large -- until the sampled alignment's Neff is within 1% of
    target_neff. Returns (msa_sampled, neff).
    """

    branch_lengths = tree.branch_lengths

    print("\nSample alignment of {0} protein sequences with target Neff~{1:.6g}...\n".format(
        nseq, target_neff))

    # keep increasing MR until we are within 1% of target neff
    mutation_rate = 1.0
    neff = -np.inf
    msa_sampled = np.empty((nseq, ncol), dtype="uint8")
    while np.abs(target_neff - neff) > 1e-2 * target_neff:

        if root_seq is None:
            # sample a new start sequence
            seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, gibbs_steps)
            print("Ancestor sequence (polyA --> {0} gibbs steps --> seq0) :\n{1}".format(gibbs_steps, "".join(
                [AMINO_ACIDS[c] for c in seq0[0]])))
        else:
            # start from the specified sequence
            seq0 = root_seq

        # how many substitutions per sequence will be performed
        nmut = [0] * (len(branch_lengths) - 2)
        for i, bl in enumerate(branch_lengths[2:]):
            nmut[i] = bl * mutation_rate * ncol
        print("avg number of amino acid substitutions (parent -> child): {0}".format(
            np.round(np.mean(nmut), decimals=0)))

        # get the average number of amino acid substitution from root --> leave
        if tree.type == "binary" or tree.type == "star":
            number_splits = 1
            if tree.type == "binary":
                number_splits = np.log2(nseq)
            depth_per_clade = 1.0 / np.ceil(number_splits)
            print("avg number of amino acid substitutions (root -> leave): {0}".format(
                np.round(1 / depth_per_clade * np.mean(nmut), decimals=0)))

        # sample sequences according to tree topology
        msa_sampled = mutate_along_phylogeny(tree.tree, seq0[0], mutation_rate, x)

        # randomly choose nseq sequences from sampled msa
        if msa_sampled.shape[0] > nseq:
            msa_sampled = msa_sampled[sorted(np.random.choice(msa_sampled.shape[0], size=nseq, replace=False))]

        # compute neff of sampled sequences
        neff = ccmpred.weighting.get_HHsuite_neff(msa_sampled)
        print("Alignment with {0} sequences was sampled with mutation rate {1:.3g} and has Neff {2:.5g} (ΔNeff [%] = {3:.5g})\n".format(
            nseq, mutation_rate, neff, (target_neff - neff)/target_neff*100))
        sys.stdout.flush()

        # increase mutation rate
        if target_neff > neff:
            mutation_rate += np.random.random()

        # decrease mutation rate
        if target_neff < neff:
            mutation_rate -= np.random.random()

        # reset mutation rate if it left the sensible range
        if mutation_rate < 0 or mutation_rate > 100:
            mutation_rate = 1

    return msa_sampled, neff
223 |
--------------------------------------------------------------------------------
/ccmpred/sampling/cext/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.ctypeslib as npct
3 | import ctypes
4 | import os.path
5 |
# ctypes ndpointer specifications for the arrays crossing the C boundary
array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS')
array_1d_uint8 = npct.ndpointer(dtype=np.dtype('uint8'), ndim=1, flags='CONTIGUOUS')
array_1d_uint32 = npct.ndpointer(dtype=np.dtype('uint32'), ndim=1, flags='CONTIGUOUS')
array_1d_uint64 = npct.ndpointer(dtype=np.dtype('uint64'), ndim=1, flags='CONTIGUOUS')
array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')

# load the compiled tree-CD extension from the _build directory next to this module
libtreecd = npct.load_library('libtreecd', os.path.join(os.path.dirname(__file__), '_build'))

# signatures mirror treecd.h
libtreecd.mutate_along_tree.restype = None
libtreecd.mutate_along_tree.argtypes = [
    array_1d_uint64, # uint64_t *n_children,
    array_1d_float, # flt *branch_lengths,
    array_1d_float, # flt *x,
    ctypes.c_uint64, # uint64_t nvert,
    array_2d_char, # uint8_t *seqs,
    ctypes.c_uint32, # uint32_t ncol,
    ctypes.c_double # flt mutation_rate
]

libtreecd.mutate_sequence.restype = None
libtreecd.mutate_sequence.argtypes = [
    array_1d_uint8, # uint8_t *seq,
    array_1d_float, # flt *x,
    ctypes.c_uint16, # uint16_t nmut,
    ctypes.c_uint32 # uint32_t ncol,
]
32 |
def mutate_sequence(parent_seq, x, nmut, ncol):
    # Copy first so the parent sequence stays untouched -- the C routine
    # mutates the buffer in place; returns the mutated child sequence.
    seq = parent_seq.copy()
    libtreecd.mutate_sequence(seq, x, nmut, ncol)

    return seq
38 |
def mutate_along_tree(msa_sampled, n_children, branch_lengths, x, nvert, seq0, mutation_rate):
    # msa_sampled doubles as the output buffer: zero it, seed the first rows
    # with the ancestral sequence(s), then let the C code overwrite it with
    # the leaf sequences in place.
    msa_sampled[:, :] = 0
    msa_sampled[:seq0.shape[0], :] = seq0
    libtreecd.mutate_along_tree(n_children, branch_lengths, x, nvert, msa_sampled, seq0.shape[1], mutation_rate)

    return msa_sampled
45 |
--------------------------------------------------------------------------------
/ccmpred/sampling/cext/treecd.c:
--------------------------------------------------------------------------------
#include <stdlib.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <math.h>
6 |
7 | #include "treecd.h"
8 | #include "cd.h"
9 | #include "cdutil.h"
10 |
/**
 * Mutate a sequence seq nmut times according to potentials in x
 *
 * Each substitution picks a random non-gap column and redraws its residue
 * from the model's conditional distribution given the rest of the sequence.
 *
 * @param[inout] seq The sequence to work on
 * @param[in] x The single and pairwise emission potentials for computing conditional probabilities
 * @param[in] nmut The number of substitutions to perform
 * @param[in] ncol The length of the sequence
 */
void mutate_sequence(uint8_t *seq, flt *x, uint16_t nmut, int ncol) {

    flt* pcond = fl_malloc(N_ALPHA);
    int i;

    for(int m = 0; m < nmut; m++) {

        //ignore gap positions for sampling
        // NOTE(review): assumes pick_random_uniform(n) returns an index in
        // [0, n] so every column can be drawn -- confirm against cdutil.c
        do {
            i = pick_random_uniform(ncol - 1);
        } while(seq[i] == GAP);

        // conditional distribution over residues at column i given the rest
        compute_conditional_probs(i, pcond, x, seq, ncol);

        seq[i] = pick_random_weighted(pcond, N_ALPHA - 1);
        // sample gaps as well (need to adjust E2 and X1 in cd.h but single potentials only have dim 20:
        // compute_conditional_probs_gaps(i, pcond, x, seq, ncol);
        // seq[i] = pick_random_weighted(pcond, N_ALPHA);
    }

    fl_free(pcond);
}
41 |
/**
 * Mutate a sequence seq nmut times according to potentials in x
 *
 * Unlike mutate_sequence(), each of the nmut rounds performs one full Gibbs
 * sweep: every column is resampled once, in a freshly shuffled order.
 *
 * @param[inout] seq The sequence to work on
 * @param[in] x The single and pairwise emission potentials for computing conditional probabilities
 * @param[in] nmut The number of substitutions to perform
 * @param[in] ncol The length of the sequence
 */
void mutate_sequence_gibbs(uint8_t *seq, flt *x, uint16_t nmut, int ncol) {

    flt* pcond = fl_malloc(N_ALPHA);

    //int array with elements 1..L
    unsigned int sequence_position_vector[ncol];
    for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p;

    for(int m = 0; m < nmut; m++) {

        // visit columns in a new random order each sweep
        shuffle(sequence_position_vector, ncol);

        for (int i=0; i < ncol; i++){
            compute_conditional_probs(sequence_position_vector[i], pcond, x, seq, ncol);
            seq[sequence_position_vector[i]] = pick_random_weighted(pcond, N_ALPHA - 1);
        }
    }

    fl_free(pcond);
}
70 |
71 |
72 |
/* Swap two generic pointers (used below to flip the in/out BFS buffers). */
void swap(void **a, void **b) {
    void *temp = *a;
    *a = *b;
    *b = temp;
}
78 |
/**
 * Mutate an ancestral sequence along a tree
 *
 * The tree is processed level by level (BFS) with double-buffered arrays
 * (*_in / *_out): leaves are carried through unchanged, inner vertices are
 * expanded into their children, and each new child sequence is mutated
 * proportionally to its branch length.
 *
 * @param[in] n_children At index i, stores the number of child vertices for vertex i
 * @param[in] branch_lengths At index i, stores the length of the branch leading to vertex i
 * @param[in] x The single and pairwise emission potentials for computing conditional probabilities
 * @param[in] nvert The total number of vertices in the tree
 * @param[inout] seqs The ancestral sequence at the beginning of the array. After this method returns, stores all leaf sequences.
 * @param[in] ncol The length of individual sequences
 * @param[in] mutation_rate Coefficient to tune the number of substitutions to make per evolutionary time unit
 */
void mutate_along_tree(
    uint64_t *n_children,
    flt *branch_lengths,
    flt *x,
    uint64_t nvert,
    uint8_t *seqs,
    uint32_t ncol,
    flt mutation_rate
) {

    seed_rng();

    // Preprocessing: Count number of leaves and compute index of first children
    uint64_t *first_child_index = (uint64_t *)malloc(sizeof(uint64_t) * nvert);
    uint64_t fci = 1;
    uint64_t nleaves = 0;

    for(uint64_t i = 0; i < nvert; i++) {
        if(n_children[i] == 0) { nleaves++; }
        first_child_index[i] = fci;
        fci += n_children[i];
    }

    // All level buffers are sized nleaves: a BFS level (leaves carried
    // forward included) never holds more than nleaves vertices.

    // nc: number of children for vertex at index i of current BFS level
    uint64_t *nc_in = (uint64_t *)malloc(sizeof(uint64_t) * nleaves);
    uint64_t *nc_out = (uint64_t *)malloc(sizeof(uint64_t) * nleaves);

    // ni: index of vertex at index i of current BFS level
    uint64_t *ni_in = (uint64_t *)malloc(sizeof(uint64_t) * nleaves);
    uint64_t *ni_out = (uint64_t *)malloc(sizeof(uint64_t) * nleaves);

    // seqs: sequences at index i of current BFS level
    uint8_t *seqs_in = (uint8_t *)malloc(sizeof(uint8_t) * ncol * nleaves);
    uint8_t *seqs_out = (uint8_t *)malloc(sizeof(uint8_t) * ncol * nleaves);

    // bl: branch length at index i of current BFS level
    flt *bl = fl_malloc(nleaves);

    // fill initial level with root nodes and ancestral sequences
    uint64_t nn = n_children[0];
    memcpy(nc_in, &n_children[1], sizeof(uint64_t) * nn);
    memcpy(seqs_in, seqs, sizeof(uint8_t) * ncol * nn);
    for(uint64_t i = 0; i < nn; i++) {
        ni_in[i] = i + 1;
    }

    // BFS over tree levels; stops once every level slot is a leaf
    while(nn < nleaves) {

        // Phase 1: grow nc_out, ni_out, bl and seqs_out
        uint64_t pos = 0;
        for(uint64_t i = 0; i < nn; i++) {

            uint64_t nci = nc_in[i];

            if(nci == 0) {
                // we have no children - copy the leaf node to keep it in next level
                nc_out[pos] = nc_in[i];
                ni_out[pos] = ni_in[i];
                bl[pos] = 0;
                memcpy(&seqs_out[pos * ncol], &seqs_in[i * ncol], sizeof(uint8_t) * ncol);

                pos++;

            } else {

                // we have one or more children - grow out arrays to make room for descendants
                // mutation to descendant sequences will be handled in phase 2
                for(uint64_t j = 0; j < nci; j++) {
                    uint64_t inew = first_child_index[ni_in[i]] + j;

                    nc_out[pos] = n_children[inew];
                    ni_out[pos] = inew;
                    bl[pos] = branch_lengths[inew];
                    memcpy(&seqs_out[pos * ncol], &seqs_in[i * ncol], sizeof(uint8_t) * ncol);

                    pos++;
                }

            }

        }

        // Phase 2: evolve seq according to bl
        // (leaves copied in phase 1 have bl == 0 and thus get nmut == 0)
        #pragma omp parallel for
        for(uint64_t i = 0; i < pos; i++) {
            int nmut = bl[i] * mutation_rate * ncol;
            //printf("nn = %i, i = %i, nmut = %i, bl[i]=%f\n", nn, i, nmut, bl[i]);
            mutate_sequence(&seqs_out[i * ncol], x, nmut, ncol);
        }

        nn = pos;
        //printf("nn = %i.\n", nn);
        // flip the double buffers: this level's output becomes next input
        swap((void **)&nc_in, (void **)&nc_out);
        swap((void **)&ni_in, (void **)&ni_out);
        swap((void **)&seqs_in, (void **)&seqs_out);

    }

    // hand the final level (all leaves) back to the caller's buffer
    memcpy(seqs, seqs_in, sizeof(uint8_t) * ncol * nleaves);

    free(first_child_index);
    free(nc_in);
    free(nc_out);
    free(ni_in);
    free(ni_out);
    free(seqs_in);
    free(seqs_out);
    fl_free(bl);
}
201 |
--------------------------------------------------------------------------------
/ccmpred/sampling/cext/treecd.h:
--------------------------------------------------------------------------------
1 | #ifndef TREECD_H
2 | #define TREECD_H
3 |
#include <stdint.h>
#include "cd.h"
6 |
7 | void mutate_along_tree(
8 | uint64_t *n_children,
9 | flt *branch_lengths,
10 | flt *x,
11 | uint64_t nvert,
12 | uint8_t *seqs,
13 | uint32_t ncol,
14 | flt mutation_rate
15 | );
16 |
17 | #endif
18 |
--------------------------------------------------------------------------------
/ccmpred/sanity_check.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def check_single_potentials(x_single, verbose=0, epsilon=1e-5):
    """Check that every single potential v_i sums to ~0 over its states.

    Prints a warning (with up to 10 examples when verbose) for rows whose
    absolute sum exceeds epsilon. Returns 1 if all rows pass, 0 otherwise.
    """
    offending = np.flatnonzero(np.abs(x_single.sum(1)) > epsilon)

    if offending.size == 0:
        return 1

    print("Warning: {0} single potentials do not sum to 0 (eps={1}).".format(len(offending), epsilon))

    if verbose:
        for ind in offending[:10]:
            print("e.g.: i={0:<2} has sum_a(v_ia)={1}".format(ind+1, np.sum(x_single[ind])))

    return 0
17 |
def check_pair_potentials(x_pair, verbose=0, epsilon=1e-5):
    """Check that every pair potential w_ij (i<j) sums to ~0 over its states.

    Prints a warning (with up to 10 examples when verbose) for upper-triangle
    pairs whose absolute sum exceeds epsilon. Returns 1 if all pairs pass,
    0 otherwise.
    """
    indices_triu = np.triu_indices(x_pair.shape[0], 1)
    pair_sums = x_pair.sum(2).sum(2)[indices_triu]
    offending = np.flatnonzero(np.abs(pair_sums) > epsilon)

    if offending.size == 0:
        return 1

    print("Warning: {0}/{1} pair potentials do not sum to 0 (eps={2}).".format(len(offending), len(indices_triu[0]), epsilon))

    if verbose:
        for ind in offending[:10]:
            i = indices_triu[0][ind]
            j = indices_triu[1][ind]
            print("e.g.: i={0:<2} j={1:<2} has sum_ab(w_ijab)={2}".format(i+1, j+1, np.sum(x_pair[i,j])))

    return 0
34 |
35 |
def centering_potentials(x_single, x_pair):
    """Enforce the zero-mean gauge on the potentials (in place).

    Subtracts, for each pair (i, j), the mean of w_ij over the first 20x20
    amino acid states, and for each position i the mean of v_i over the
    first 20 states. The gap state (index 20) is left untouched.

    :param x_single: single potentials, shape (L, >=20); modified in place
    :param x_pair: pair potentials, shape (L, L, >=20, >=20); modified in place
    :return: the (modified) x_single and x_pair arrays
    """

    pair_means = x_pair[:, :, :20, :20].mean(axis=(2, 3))
    x_pair[:, :, :20, :20] -= pair_means[:, :, None, None]

    single_means = x_single[:, :20].mean(axis=1)
    x_single[:, :20] -= single_means[:, None]

    return x_single, x_pair
54 |
--------------------------------------------------------------------------------
/ccmpred/scripts/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/scripts/__init__.py
--------------------------------------------------------------------------------
/ccmpred/scripts/convert.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import ccmpred.io.alignment
5 |
6 |
def main():
    """Read an MSA in one format and write it out in another (CLI entry point)."""

    parser = argparse.ArgumentParser(description='Convert Fasta to Psicov format and vice versa.')

    parser.add_argument("infile", type=str, help="MSA input file")
    parser.add_argument("outfile", type=str, help="MSA output file")
    parser.add_argument("--msa-in-format", dest="msa_in_format", default="psicov",
                        help="Input alignment format [default: '%default']")
    parser.add_argument("--msa-out-format", dest="msa_out_format", default="fasta",
                        help="Output alignment format [default: '%default']")

    args = parser.parse_args()

    msa = ccmpred.io.alignment.read_msa(args.infile, args.msa_in_format)

    # Sequential identifiers for the written records: seq_0, seq_1, ...
    sequence_ids = ["seq_" + str(i) for i in range(msa.shape[0])]

    with open(args.outfile, "w") as out_handle:
        ccmpred.io.alignment.write_msa(out_handle, msa,
                                       ids=sequence_ids,
                                       format=args.msa_out_format
                                       )
29 |
30 |
31 |
# Run the converter only when executed as a script, not on import.
if __name__ == '__main__':
    main()
34 |
--------------------------------------------------------------------------------
/ccmpred/scripts/plot_ccmpred.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | """
4 | Usage: plot_ccmpred.py
5 |
6 | Various plotting functionalities
7 | """
8 |
9 | import os
10 | import sys
11 | import argparse
12 | import ccmpred.raw as raw
13 | import ccmpred.weighting
14 | from ccmpred.pseudocounts import PseudoCounts
15 | import ccmpred.io as io
16 | import ccmpred.io.contactmatrix as io_cm
17 | import ccmpred.plotting as plot
18 | import ccmpred.gaps as gaps
19 | import pandas as pd
20 | import numpy as np
21 |
def parse_args():
    """Parse command line arguments for the plotting subcommands.

    Defines three subparsers ("cmap", "aa-dist", "aln-stats") that share a
    required -o/--plot-file argument, and performs additional validation of
    the entropy-correction options for the "cmap" subcommand.

    :return: parsed argparse.Namespace
    """

    parser = argparse.ArgumentParser(description='Various Plotting Functionalities.')
    subparsers = parser.add_subparsers(title="Plot types", dest="plot_types")


    #parent parsers for common flags
    parent_parser_out = argparse.ArgumentParser(add_help=False)
    requiredNamed = parent_parser_out.add_argument_group('Required Output Arguments')
    requiredNamed.add_argument('-o', '--plot-file', dest='plot_file', type=str, required=True,
                               help='Path to plot file')



    #parser for contact map
    parser_cmap = subparsers.add_parser('cmap', parents=[parent_parser_out],
                                        help="Specify options for plotting a Contact Map")

    cmap_in_req = parser_cmap.add_argument_group('Required Inputs')
    mutual_excl = cmap_in_req.add_mutually_exclusive_group(required=True)
    mutual_excl.add_argument('--mat-file', dest='mat_file', type=str, help='path to mat file')
    mutual_excl.add_argument('--braw-file', dest='braw_file', type=str,help='path to binary raw coupling file')

    cmap_in = parser_cmap.add_argument_group('Optional Inputs')
    cmap_in.add_argument('-p', '--pdb-file', dest='pdb_file', type=str, default=None,
                         help=' PDB file (renumbered starting from 1) for distance matrix.')
    cmap_in.add_argument('-a', '--alignment-file', dest='aln_file', type=str, default=None,
                         help='path to alignment file')
    cmap_in.add_argument("--aln-format", dest="aln_format", default="fasta",
                         help="File format for MSAs [default: \"%(default)s\"]")

    cmap_options = parser_cmap.add_argument_group('Further Settings for Contact Map Plot')
    cmap_options.add_argument('--seq-sep', dest='seqsep', type=int, default=6, help='Minimal sequence separation')
    cmap_options.add_argument('--contact-threshold', dest='contact_threshold', type=int, default=8,
                              help='Contact definition as maximal C_beta distance between residue pairs.')
    cmap_options.add_argument("--apc", action="store_true", default=False, help="Apply average product correction")
    cmap_options.add_argument("--entropy-correction", dest='entropy_correction', action="store_true", default=False, help="Apply entropy correction")


    # parser for aa distribution plot
    parser_aa_dist = subparsers.add_parser('aa-dist', parents=[parent_parser_out],
                                           help="Specify options for plotting the amino acid distribution in an alignment")

    aadist_in_req = parser_aa_dist.add_argument_group('Required Inputs')
    aadist_in_req.add_argument('-a', '--alignment-file', dest='aln_file', type=str, required=True,
                               help='path to alignment file')
    aadist_in_req.add_argument("--aln-format", dest="aln_format", default="psicov",
                               help="File format for MSAs [default: \"%(default)s\"]")


    # parser for alignment statistics plot
    parser_aln_stats = subparsers.add_parser(
        'aln-stats', parents=[parent_parser_out],
        help="Specify options for plotting the alignment statistics of two alignments against each other")

    alnstats_in_req = parser_aln_stats.add_argument_group('Required Inputs')
    alnstats_in_req.add_argument('-a', '--alignment-file', dest='aln_file', type=str, required=True,
                                 help='path to alignment file')
    alnstats_in_req.add_argument("--aln-format", dest="aln_format", default="psicov",
                                 help="File format for MSAs [default: \"%(default)s\"]")
    alnstats_in_req.add_argument('-s', '--sampled-alignment-file', dest='sample_aln_file', type=str, required=True,
                                 help='path to sampled alignment' )

    parser_aln_stats.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int,
                                  help="Ignore alignment positions with > MAX_GAP_POS percent gaps. "
                                       "[default: %(default)s == no removal of positions]")



    args = parser.parse_args()

    if args.plot_types == "cmap":
        # BUGFIX: the alignment option is stored under dest='aln_file';
        # referencing args.alignment_file raised AttributeError whenever
        # --entropy-correction was given.
        if args.entropy_correction and args.aln_file is None:
            print("Alignment file (-a) must be specified to compute entropy correction!")

        if args.entropy_correction and args.braw_file is None:
            print("Binary Raw file (-b) must be specified to compute entropy correction!")

    return args
101 |
def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file,
                     entropy_correction, apc, seqsep, contact_threshold):
    """Plot an interactive contact map from a score matrix or coupling file.

    Exactly one of braw_file / mat_file is expected to be set (enforced by the
    argument parser). The alignment file is optional and is used to compute a
    per-position gap percentage sub-plot and, together with braw_file, the
    entropy correction. If pdb_file is given, observed C_beta distances are
    added so true contacts can be highlighted.

    Note: `protein` is derived from whichever input file was seen last
    (alignment, then braw, then mat) and used only for the plot title.

    :param alignment_file: path to MSA file or None
    :param aln_format: MSA file format (e.g. "fasta")
    :param braw_file: path to binary raw coupling file or None
    :param mat_file: path to score matrix file or None
    :param pdb_file: path to PDB file (renumbered from 1) or None
    :param plot_file: output path for the plot
    :param entropy_correction: apply entropy correction (needs alignment + braw)
    :param apc: apply average product correction instead
    :param seqsep: minimal sequence separation of plotted pairs
    :param contact_threshold: C_beta distance defining a contact
    """

    pseudocounts = None
    mat = None
    gaps_percentage_plot = None
    protein = None


    # entropy correction needs both background frequencies (from the
    # alignment) and raw couplings -- abort early if either is missing
    if entropy_correction and (alignment_file is None or braw_file is None):
        print("Entropy correction requires specification of alignment file and binary raw couplign file!")
        sys.exit(1)

    if alignment_file is not None:
        protein = os.path.basename(alignment_file).split(".")[0]
        alignment = io.read_msa(alignment_file, aln_format)

        # compute sequence weights
        weights = ccmpred.weighting.weights_simple(alignment, 0.8)

        # compute frequencies
        pseudocounts = PseudoCounts(alignment, weights)
        pseudocounts.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )

        # plot_file=None: only build the sub-plot object, do not write it out
        gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None)

    if braw_file is not None:

        protein = os.path.basename(braw_file).split(".")[0]

        braw = raw.parse_msgpack(braw_file)
        meta_info = braw.meta

        # compute frobenius score from couplings
        mat = io_cm.frobenius_score(braw.x_pair)

        if entropy_correction:

            # pseudocounts is guaranteed non-None here by the early check above
            scaling_factor_eta, mat = io_cm.compute_local_correction(
                pseudocounts.freqs[0],
                braw.x_pair,
                meta_info['workflow'][0]['msafile']['neff'],
                meta_info['workflow'][0]['regularization']['lambda_pair'],
                mat,
                entropy=True
            )
        elif apc:
            mat = io_cm.apc(mat)

    if mat_file is not None:

        protein = os.path.basename(mat_file).split(".")[0]

        # NOTE: overrides any matrix computed from braw_file above
        mat, meta_info = io_cm.read_matrix(mat_file)

        if apc:
            mat = io_cm.apc(mat)

    L = len(mat)
    # only plot pairs at least `seqsep` positions apart (upper triangle)
    indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep)

    plot_matrix = pd.DataFrame()
    plot_matrix['residue_i'] = indices_upper_tri_i + 1
    plot_matrix['residue_j'] = indices_upper_tri_j + 1
    plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j]

    if pdb_file is not None:
        # compute distance map from pdb file
        observed_distances = io.distance_map(pdb_file, L)
        plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j]
        plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist()


    plot_title="Contact Map for protein {0}".format(protein)

    # Plot Contact Map
    plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file)
181 |
def plot_aminoacid_distribution(alignment_file, aln_format, plot_file):
    """Plot the weighted amino acid distribution of an alignment.

    :param alignment_file: path to the MSA file
    :param aln_format: MSA file format
    :param plot_file: output path for the plot
    """

    protein = os.path.basename(alignment_file).split(".")[0]

    # read alignment; exit gracefully if the file cannot be read
    try:
        alignment = io.read_msa(alignment_file, aln_format)
    except OSError as e:
        print("Problems reading alignment file {0}: {1}!".format(alignment_file, e))
        sys.exit(0)

    N, L = alignment.shape
    diversity = np.sqrt(N) / L

    # sequence weights (simple clustering at 80% identity, no renormalization)
    weights = ccmpred.weighting.weights_simple(alignment, 0.8, False)

    # weighted amino acid counts / frequencies with uniform pseudo-counts
    pseudocounts = PseudoCounts(alignment, weights)
    pseudocounts.calculate_frequencies(
        'uniform_pseudocounts', 1, 1, remove_gaps=False
    )

    plot.plot_alignment(
        pseudocounts.counts[0],
        "Amino Acid Distribution in Alignment for {0} (N={1}, L={2}, diversity={3})".format(
            protein, N, L, np.round(diversity, decimals=3)), plot_file
    )
212 |
def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file):
    """Plot single and pairwise amino acid statistics of an observed alignment
    against those of a sampled alignment.

    :param alignment_file: path to the observed MSA
    :param sample_aln_file: path to the sampled MSA
    :param aln_format: MSA file format for both files
    :param max_gap_pos: drop positions with more than this percentage of gaps
        (100 disables the filter)
    :param plot_file: output path for the plot
    """

    def _read_alignment(path):
        # Read an MSA, exiting gracefully when the file cannot be read.
        try:
            return io.read_msa(path, aln_format)
        except OSError as e:
            print("Problems reading alignment file {0}: {1}!".format(path, e))
            sys.exit(0)

    alignment = _read_alignment(alignment_file)
    sampled_alignment = _read_alignment(sample_aln_file)

    # Remove positions with > MAX_GAP_POS % gaps from the observed alignment
    # and drop the same columns from the sampled alignment.
    if max_gap_pos < 100:
        alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos)
        keep = [i for i in range(sampled_alignment.shape[1]) if i not in gapped_positions]
        sampled_alignment = np.ascontiguousarray(sampled_alignment[:, keep])

    def _weighted_freqs(aln):
        # Weighted single/pairwise frequencies with uniform pseudo-counts.
        pc = PseudoCounts(aln, ccmpred.weighting.weights_simple(aln, 0.8))
        pc.calculate_frequencies(
            'uniform_pseudocounts', 1, 1, remove_gaps=False
        )
        return pc, pc.freqs

    # observed amino acid frequencies
    _, (single_freq_observed, pairwise_freq_observed) = _weighted_freqs(alignment)

    # sampled amino acid frequencies (sampled sequences usually all get weight 1)
    pseudocounts, (single_freq_sampled, pairwise_freq_sampled) = _weighted_freqs(sampled_alignment)

    # degap the frequencies (ignore gap frequencies)
    single_freq_observed = pseudocounts.degap(single_freq_observed, False)
    single_freq_sampled = pseudocounts.degap(single_freq_sampled, False)
    pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False)
    pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False)

    plot.plot_empirical_vs_model_statistics(
        single_freq_observed, single_freq_sampled,
        pairwise_freq_observed, pairwise_freq_sampled,
        plot_file)
268 |
269 |
270 |
def main():
    """Dispatch to the plotting routine selected on the command line."""

    args = parse_args()

    if args.plot_types == "cmap":
        print("Write plot for contact map to {0}".format(args.plot_file))
        plot_contact_map(
            args.aln_file, args.aln_format,
            args.braw_file, args.mat_file, args.pdb_file, args.plot_file,
            args.entropy_correction, args.apc,
            args.seqsep, args.contact_threshold
        )
    elif args.plot_types == "aa-dist":
        print("Write plot for amino acid distribution in alignment to {0}".format(args.plot_file))
        plot_aminoacid_distribution(
            args.aln_file, args.aln_format,
            args.plot_file
        )
    elif args.plot_types == "aln-stats":
        print("Write plot for alignment statistics to {0}".format(args.plot_file))
        plot_alignment_statistics(
            args.aln_file, args.sample_aln_file, args.aln_format, args.max_gap_pos,
            args.plot_file
        )
300 |
301 |
302 |
# Run the plotting CLI only when executed as a script, not on import.
if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/ccmpred/scripts/replace_gaps.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import optparse
3 |
4 | import ccmpred.io.alignment
5 | import ccmpred.gaps
6 |
7 |
def main():
    """Replace gaps in an MSA using the chosen strategy and write PSICOV output."""

    parser = optparse.OptionParser(usage="%prog [options] msa_in_file msa_out_file")

    parser.add_option("--with-consensus", dest="replacement", action="store_const", const=ccmpred.gaps.remove_gaps_consensus, help="Remove gaps with consensus characters")
    parser.add_option("--with-col-freqs", dest="replacement", action="store_const", const=ccmpred.gaps.remove_gaps_col_freqs, help="Remove gaps with column character frequencies")
    parser.add_option("--msa-in-format", dest="msa_in_format", default="psicov", help="Input alignment format [default: '%default']")

    opt, args = parser.parse_args()

    # one of the --with-* options stores the replacement callable
    if not opt.replacement:
        parser.error("Need to specify one of the --with-* options!")

    if len(args) != 2:
        parser.error("Need exactly two positional arguments!")

    msa_in_file, msa_out_file = args

    msa = ccmpred.io.alignment.read_msa(msa_in_file, opt.msa_in_format)
    msa_nogaps = opt.replacement(msa)

    with open(msa_out_file, "w") as out_handle:
        ccmpred.io.alignment.write_msa_psicov(out_handle, msa_nogaps)
30 |
31 |
# Run the gap replacement only when executed as a script, not on import.
if __name__ == '__main__':
    main()
34 |
--------------------------------------------------------------------------------
/ccmpred/scripts/run_ccmgen.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | import argparse
4 | import os
5 | from ccmpred import CCMpred
6 | import ccmpred.logo
7 | import ccmpred.io.alignment
8 | import ccmpred.raw
9 | import ccmpred.weighting
10 | import ccmpred.sampling
11 | import ccmpred.gaps
12 | import ccmpred.trees
13 | import ccmpred.parameter_handling
14 | import numpy as np
15 |
16 | EPILOG = """
17 | Generate a realistic synthetic multiple sequence alignment (MSA) of protein sequences
18 | complying constraints from a Markov Random Field model.
19 |
20 | In a first step, a Markov Random Field Model will have to be learned from a source protein MSA using
21 | e.g. CCMpredPy with the -b command.
22 | This learned model can then be passed to the CCMgen call as RAWFILE.
23 |
24 | """
25 |
26 |
27 |
def parse_args():
    """Parse CCMgen command line arguments.

    Groups the options into general, phylogenetic tree, tree sampling,
    initial sequence, and MCMC sampling sections, then validates the
    combinations: tree-based sampling needs a tree source and a mutation
    rate (or target Neff), while MCMC sampling from input sequences needs
    an alignment file.

    :return: parsed argparse.Namespace
    """
    parser = argparse.ArgumentParser(epilog=EPILOG)

    parser.add_argument("rawfile", help="Raw coupling potential file as generated by the CCMpredPy -b option")
    parser.add_argument("outalnfile", help="Output alignment file for sampled sequences.")



    grp_opt = parser.add_argument_group("General Options")
    grp_opt.add_argument("--alnfile", dest="alnfile", metavar="ALN_FILE", type=str,
                         help="Reference alignment file that is used to specify NEFF and NSEQ")
    grp_opt.add_argument("--num-sequences", dest="nseq", type=int, default=2**10,
                         help="Specify the number of sequences to generate to NSEQ "
                              "(does not apply when newick file is specified) [default: %(default)s]")
    grp_opt.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int,
                         help="Ignore alignment positions with > MAX_GAP_POS percent gaps when reading ALN_FILE. "
                              "[default: %(default)s == no removal of gaps]")
    grp_opt.add_argument("--max-gap-seq", dest="max_gap_seq", default=100, type=int,
                         help="Remove sequences with >X percent gaps when reading ALN_FILE. "
                              "[default: %(default)s == no removal of sequences]")
    grp_opt.add_argument("--aln-format", dest="aln_format", type=str, default="fasta",
                         help="Specify format for alignment files [default: %(default)s]")
    grp_opt.add_argument("--num-threads", dest="num_threads", type=int, default=1,
                         help="Specify the number of threads. [default: %(default)s]")




    grp_tr = parser.add_argument_group("Phylogenetic Tree Options")
    grp_tr_me = grp_tr.add_mutually_exclusive_group()
    grp_tr_me.add_argument("--tree-newick", dest="tree_file", type=str,
                           help="Load tree from newick-formatted file")
    grp_tr_me.add_argument("--tree-binary", dest="tree_source", action="store_const", const="binary",
                           help="Generate a binary tree with equally distributed branch lengths.")
    grp_tr_me.add_argument("--tree-star", dest="tree_source", action="store_const", const="star",
                           help="Generate a tree where all leaf nodes are direct descendants of the root node.")
    grp_tr_me.add_argument("--mcmc-sampling", dest="mcmc", action="store_true", default=False,
                           help="Generate MCMC sample without following tree topology.")



    grp_tr_opt = parser.add_argument_group("Tree Sampling Options")
    grp_tr_opt_me = grp_tr_opt.add_mutually_exclusive_group()
    grp_tr_opt_me.add_argument("--mutation-rate", dest="mutation_rate", type=float,
                               help="Specify constant mutation rate")
    # nargs='?' with const=0: a bare --mutation-rate-neff means "derive Neff
    # from the alignment file", a value sets the target Neff explicitly.
    grp_tr_opt_me.add_argument("--mutation-rate-neff", dest="neff", nargs='?', type=float, const=0, default=None,
                               help="Set the mutation rate to approximately hit a target number of effective sequences, Neff "
                                    "(calculated as in the HHsuite package (https://github.com/soedinglab/hh-suite)). "
                                    "Without specifying NEFF, the value will be determined from ALN_FILE." )


    grp_s0 = parser.add_argument_group("Initial Sequence Options")
    grp_s0_me = grp_s0.add_mutually_exclusive_group()
    grp_s0_me.add_argument("--seq0-mrf", dest="seq0_mrf", metavar="NMUT", type=int, default=10,
                           help="Start out with an all-alanine sequence and use the MRF model to evolve "
                                "the sequence for NMUT Gibbs steps. [default: NMUT=%(default)s]")
    grp_s0_me.add_argument("--seq0-file", dest="seq0_file", metavar="SEQ_FILE", type=str,
                           help="Specify ancestor sequence in SEQ_FILE.")



    grp_mcmc = parser.add_argument_group("MCMC Sampling Options")
    grp_mcmc_me = grp_mcmc.add_mutually_exclusive_group()
    grp_mcmc_me.add_argument("--mcmc-sample-random-gapped", dest="mcmc_sample_type", action="store_const", const="random-gapped",
                             default="random-gapped",
                             help="Sample sequences starting from random sequences. Gap structure of randomly selected "
                                  "input sequences will be copied. Gap positions are not sampled. "
                                  "(requires --alnfile option)[default]")
    grp_mcmc_me.add_argument("--mcmc-sample-random", dest="mcmc_sample_type", action="store_const", const="random",
                             help="Sample sequences starting from random sequences comprised of 20 amino acids. ")
    grp_mcmc_me.add_argument("--mcmc-sample-aln", dest="mcmc_sample_type", action="store_const", const="aln",
                             help="Sample sequences starting from original sequences (requires setting ALN_FILE).")
    grp_mcmc.add_argument("--mcmc-burn-in", dest="mcmc_burn_in", type=int, default=500,
                          help="Number of Gibbs sampling steps to evolve a Markov chain before a sample is obtained.")




    opt = parser.parse_args()

    # tree-based sampling: require a tree source and a mutation-rate option
    if not opt.mcmc:

        if not opt.tree_source and not opt.tree_file:
            parser.error("Need one of the --tree-* options or --mcmc-sampling!")

        if not opt.mutation_rate and opt.neff is None:
            parser.error("Need one of the --mutation-rate* options!")

        # neff == 0 means "take Neff from the alignment", so ALN_FILE is needed
        if not opt.mutation_rate and opt.neff == 0 and not opt.alnfile:
            parser.error("Need to specify Neff with either --mutation-rate-neff or via an alignment file (--alnfile)!")


    # MCMC sampling from (gapped) input sequences needs the alignment file
    if opt.mcmc:
        if (opt.mcmc_sample_type == "aln" or opt.mcmc_sample_type == "random-gapped") and not opt.alnfile:
            parser.error("Need an alignment file (--alnfile) for use with "
                         "--mcmc-sample-aln and --mcmc-sample-random-gapped!")

    return opt
126 |
127 |
128 |
def main():
    """CCMgen entry point: sample a synthetic MSA from a learned MRF model.

    Reads the model potentials from the raw file, then either draws an MCMC
    sample or evolves sequences along a phylogenetic tree, and finally writes
    the sampled alignment to the requested output file.
    """

    def read_root_sequence(seq0_file, aln_format, print_sequence=True):
        # Read the ancestor (root) sequence from file.
        # NOTE: relies on the closure variable `ncol`, which is assigned in
        # the enclosing function before this helper is called.
        seq0 = ccmpred.io.alignment.read_msa(seq0_file, aln_format)
        seq_N, seq_L = seq0.shape

        if seq_L != ncol:
            print("Length of ancestor sequence must match dimension of MRF model!")
            exit(0)

        if seq_N>1:
            print("You passed a fasta file with more than one sequence as a root sequences! We took the first sequence.")
            print_sequence = True

        if print_sequence:
            print("Ancestor sequence:\n{0}".format("".join([ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))

        return seq0

    # read command line options
    opt = parse_args()

    ccmpred.logo.logo(what_for="ccmgen")

    # set OMP environment variable for number of threads
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths
    ccm.set_initraw_file(opt.rawfile)


    # read alignment and remove gapped sequences and positions
    if opt.alnfile:
        ccm.set_alignment_file(opt.alnfile)
        ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)


    #read potentials from binary raw file (possibly remove positions with many gaps)
    ccm.intialise_potentials()
    x = ccmpred.parameter_handling.structured_to_linear(ccm.x_single, ccm.x_pair, nogapstate=True, padding=False)
    ncol = ccm.x_single.shape[0]


    #if MCMC sampling is specified
    if opt.mcmc:
        msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample(
            x, ncol, ccm.msa, size=opt.nseq, burn_in=opt.mcmc_burn_in, sample_type=opt.mcmc_sample_type)

        ids = ["seq {0}".format(i) for i in range(msa_sampled.shape[0])]

    else:

        tree = ccmpred.trees.CCMTree()

        #prepare tree topology
        if opt.tree_file:

            tree.load_tree(opt.tree_file)
            nseq = tree.n_leaves

        else:

            # number of leaves: from the reference alignment if given,
            # otherwise from the --num-sequences option
            if opt.alnfile:
                nseq = ccm.N
            else:
                nseq = opt.nseq
            tree.specify_tree(nseq, opt.tree_source)


        ids = tree.ids


        # sample alignment with specified mutation rate
        if opt.mutation_rate:
            seq0 = np.zeros((1, ncol), dtype="uint8")

            if opt.seq0_mrf and not opt.seq0_file:
                seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf)
                print("Ancestor sequence (polyA --> {0} gibbs steps --> seq0) :\n{1}".format(
                    opt.seq0_mrf, "".join([ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]])))

            elif opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)

            msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate(
                tree, nseq, seq0, x, opt.mutation_rate)

        # sample an alignment that has approximately the specified Neff
        else:
            seq0 = None

            # neff target: from the alignment if given, else the CLI value
            if opt.alnfile:
                neff = ccm.neff_entropy
            else:
                neff = opt.neff

            if opt.seq0_file:
                seq0 = read_root_sequence(opt.seq0_file, opt.aln_format)

            msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly(
                tree, nseq, neff, ncol, x, opt.seq0_mrf, root_seq=seq0)



    # if gappy positions have been removed
    # insert columns with gaps at that position
    if ccm.max_gap_pos < 100:
        msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln(
            msa_sampled, ccm.gapped_positions
        )


    print("\nWriting sampled alignment to {0}".format(opt.outalnfile))
    with open(opt.outalnfile, "w") as f:
        descs=["synthetic sequence generated with CCMgen" for _ in range(msa_sampled.shape[0])]
        ccmpred.io.alignment.write_msa(f, msa_sampled, ids, is_indices=True, format=opt.aln_format, descriptions=descs)
249 |
250 |
# Run CCMgen only when executed as a script, not on import.
if __name__ == '__main__':
    main()
253 |
--------------------------------------------------------------------------------
/ccmpred/scripts/run_ccmpred.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import argparse
3 | import sys
4 | import os
5 |
6 | from ccmpred import CCMpred
7 | import ccmpred.logo
8 |
9 |
10 | EPILOG = """
11 | CCMpredPy is a fast python implementation of contact prediction method based on correlated mutations.
12 | From an alignment given as alnfile, it will infer the parameters of a Potts model with 21 states for amino acids and gaps.
13 | Either pseudo-likelihood maximization or contrastive divergence can be chosen as inference algorithm.
14 | The L2 norms of the pairwise coupling potentials will be written to the output matfile.
15 | """
16 |
17 |
class StoreConstParametersAction(argparse.Action):
    """Argparse action that stores a ``(const, value)`` tuple.

    When the option is given without a value (or with a value equal to the
    const itself), ``arg_default`` is used as the value part. The option's
    default is likewise wrapped into a ``(default, arg_default)`` pair.
    """

    def __init__(self, option_strings, dest, nargs=None, arg_default=None, default=None, **kwargs):
        # remember the fallback value and wrap the plain default into a pair
        self.arg_default = arg_default
        super(StoreConstParametersAction, self).__init__(
            option_strings, dest, nargs=nargs, default=(default, arg_default), **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # no explicit value (or the const itself) -> fall back to arg_default
        chosen = self.arg_default if (values is None or values == self.const) else values
        setattr(namespace, self.dest, (self.const, chosen))
28 |
29 |
30 | def parse_args():
31 | parser = argparse.ArgumentParser(description="Recover direct couplings from a multiple sequence alignment", epilog=EPILOG)
32 |
33 | parser.add_argument("alnfile", help="Input alignment file to use")
34 |
35 | grp_general = parser.add_argument_group("General Options")
36 | grp_general.add_argument("--num-threads", dest="num_threads", type=int, default=1,
37 | help="Specify the number of threads. [default: %(default)s]")
38 | grp_general.add_argument("--aln-format", dest="aln_format", default="fasta",
39 | help="File format for MSAs [default: \"%(default)s\"]")
40 | grp_general.add_argument("--no-logo", dest="logo", default=True, action="store_false",
41 | help="Disable showing the CCMpred logo [default: %(default)s]")
42 |
43 |
44 | grp_out = parser.add_argument_group("Output Options")
45 | grp_out.add_argument("-m", "--mat-file", dest="matfile", type=str,
46 | help="Write contact score matrix to file. [default: %(default)s]")
47 | grp_out.add_argument("-b", "--write-binary-raw", dest="out_binary_raw_file", type=str,
48 | help="Write single and pairwise potentials as binary MessagePack file. [default: %(default)s]")
49 | grp_out.add_argument("--plot-opt-progress", dest="plot_opt_progress", type=str,
50 | help="Continously plot optimization progress as an interactive HTML. [default: %(default)s]")
51 |
52 |
53 | grp_in = parser.add_argument_group("Optional Input Options")
54 | grp_in.add_argument("-i", "--init-from-raw", dest="initrawfile", default=None,
55 | help="Init single and pair potentials from a binary raw file")
56 | grp_in.add_argument("--do-not-optimize", dest="optimize", action="store_false", default=True,
57 | help="Do not optimize potentials. Requires providing initial model parameters with -i.")
58 |
59 |
60 |
61 | grp_pll = parser.add_argument_group("Pseudo-Likelihood Options")
62 | grp_pll.add_argument("--ofn-pll", dest="objfun", action="store_const", const="pll", default="pll",
63 | help="Use pseudo-log-likelihood(pLL)")
64 | grp_pll.add_argument("--lbfgs-ftol", dest="ftol", default=1e-4, type=float,
65 | help="LBFGS: convergence criterion ftol. [default: %(default)s]")
66 | grp_pll.add_argument("--lbfgs-max-linesearch", dest="max_linesearch", default=5, type=int,
67 | help="LBFGS: maximum number of linesearch steps. [default: %(default)s]")
68 | grp_pll.add_argument("--lbfgs-maxcor", dest="max_cor", default=5, type=int,
69 | help="LBFGS: maximum number of corrections for memory. [default: %(default)s]")
70 |
71 |
72 | grp_cd = parser.add_argument_group("(Persistent) Contrastive Divergence Options")
73 | grp_cd.add_argument("--ofn-cd",dest="objfun",action="store_const",const="cd",help="Use contrastive divergence (CD)")
74 | grp_cd.add_argument("--nr-markov-chains", dest="nr_seq_sample", type=int, default=500, help="Number of parallel "
75 | "Markov chains used for sampling at each iteration. [default: %(default)s] ")
76 | grp_cd.add_argument("--gibbs_steps", dest="cd_gibbs_steps", type=int, default=1,
77 | help="Number of Gibbs steps used to evolve each Markov chain "
78 | "in each iteration of the optimization. [default: %(default)s]")
79 | grp_cd.add_argument("--persistent", dest="cd_persistent", action="store_true", default=False, help="Switch on "
80 | "PERSISTENT CD once the learning rate is small enough (< alpha_0 / 10) [default: %(default)s]")
81 | grp_cd.add_argument("--alpha0", dest="alpha0", default=1e-3, type=float,
82 | help="GD: Set initial learning rate. [default: %(default)s]")
83 | grp_cd.add_argument("--no-decay", dest="decay", action="store_false", default=True,
84 | help="GD: Do not use decaying learning rate** (`--no-decay`): Do not use decaying learnign "
85 | "rates. Decay is started when convergence criteria falls below value of START_DECAY. "
86 | "[default: %(default)s]")
87 | grp_cd.add_argument("--decay-start", dest="decay_start", default=1e-1, type=float,
88 | help="GD: Start decay when convergence criteria < START_DECAY."
89 | "[default: %(default)s]")
90 | grp_cd.add_argument("--decay-rate", dest="decay_rate", default=5e-6, type=float,
91 | help="GD: Set rate of decay for learning rate. [default: %(default)s]")
92 | grp_cd.add_argument("--decay-type", dest="decay_type", default="sig", type=str,
93 | choices=['sig', 'sqrt', 'exp', 'lin'],
94 | help="GD: Decay type. [default: %(default)s]")
95 |
96 |
97 | grp_con = parser.add_argument_group("Convergence Settings")
98 | grp_con.add_argument("--maxit", dest="maxit", default=2000, type=int,
99 | help="Stop when MAXIT number of iterations is reached. [default: %(default)s]")
100 | grp_con.add_argument("--early-stopping", dest="early_stopping", default=False, action="store_true",
101 | help="Apply convergence criteria instead of only maxit. [default: %(default)s]")
102 | grp_con.add_argument("--epsilon", dest="epsilon", default=1e-5, type=float,
103 | help="Converged when relative change in f (or xnorm) in last CONVERGENCE_PREV iterations "
104 | "< EPSILON. [default: %(default)s]")
105 | grp_con.add_argument("--convergence_prev", dest="convergence_prev", default=5, type=int,
106 | help="Set CONVERGENCE_PREV parameter. [default: %(default)s]")
107 |
108 |
109 |
110 | grp_constraints = parser.add_argument_group("Use with Contraints (non-contacts will obtain zero couplings)")
111 | grp_constraints.add_argument("--pdb-file", dest="pdbfile", help="Input PDB file")
112 | grp_constraints.add_argument("--contact-threshold", dest="contact_threshold", type=int, default=8,
113 | help="Definition of residue pairs forming a contact wrt distance of their Cbeta atoms in "
114 | "angstrom. [default: %(default)s]")
115 |
116 |
117 |
118 | grp_corr = parser.add_argument_group("Corrections applied to Contact Score")
119 | grp_corr.add_argument("--apc", dest="apc_file", type=str, default=None,
120 | help="Path to contact matrix file corrected with average product correction (APC). "
121 | "[default: %(default)s] ")
122 | grp_corr.add_argument("--entropy-correction", dest="entropy_correction_file", type=str, default=None,
123 | help="Path to contact matrix file corrected with entropy correction. "
124 | "[default: %(default)s]")
125 |
126 |
127 | grp_wt = parser.add_argument_group("Sequence Weighting")
128 | grp_wt.add_argument("--wt-simple", dest="weight", action="store_const", const="simple",
129 | default="simple", help='Use simple weighting [default: %(default)s]')
130 | grp_wt.add_argument("--wt-uniform", dest="weight", action="store_const", const="uniform",
131 | help='Use uniform weighting')
132 | grp_wt.add_argument("--wt-cutoff", dest="wt_cutoff", type=float, default=0.8,
133 | help="Sequence identity threshold. [default: %(default)s]")
134 |
135 |
136 | grp_rg = parser.add_argument_group("Regularization")
137 | grp_rg.add_argument("--reg-lambda-single", dest="lambda_single", type=float, default=10,
138 | help='Regularization coefficient for single potentials (L2 regularization) '
139 | '[default: %(default)s]')
140 | grp_rg.add_argument("--reg-lambda-pair-factor", dest="lambda_pair_factor", type=float, default=0.2,
141 | help='Regularization parameter for pair potentials (L2 regularization with '
142 | 'lambda_pair = lambda_pair-factor * scaling) [default: %(default)s]')
143 | grp_rg.add_argument("--v-center", dest="single_prior", action="store_const", const="v-center", default="v-center",
144 | help="Use mu=v* in Gaussian prior for single emissions and initialization. [default: %(default)s]")
145 | grp_rg.add_argument("--v-zero", dest="single_prior", action="store_const", const="v-zero",
146 | help="Use mu=0 in Gaussian prior for single emissions and initialisation.")
147 |
148 |
149 |
150 | grp_gap = parser.add_argument_group("Gap Treatment")
151 | grp_gap.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int,
152 | help="Ignore alignment positions with > MAX_GAP_POS percent gaps. "
153 | "[default: %(default)s == no removal of positions]")
154 | grp_gap.add_argument("--max-gap-seq", dest="max_gap_seq", default=100, type=int,
155 | help="Remove sequences with > MAX_GAP_SEQ percent gaps. [default: %(default)s == no removal of sequences]")
156 |
157 |
158 | grp_pc = parser.add_argument_group("Pseudocounts")
159 | grp_pc.add_argument("--pc-uniform", dest="pseudocounts", action="store_const", const="uniform_pseudocounts",
160 | default="uniform_pseudocounts",
161 | help="Use uniform pseudocounts, e.g 1/21 [default: %(default)s]")
162 | grp_pc.add_argument("--pc-submat", dest="pseudocounts", action="store_const",
163 | const="substitution_matrix_pseudocounts", help="Use substitution matrix pseudocounts")
164 | grp_pc.add_argument("--pc-constant", dest="pseudocounts", action="store_const",
165 | const="constant_pseudocounts", help="Use constant pseudocounts ")
166 | grp_pc.add_argument("--pc-none", dest="pseudocounts", action="store_const",
167 | const="no_pseudocounts", help="Use no pseudocounts")
168 | grp_pc.add_argument("--pc-single-count", dest="pseudocount_single", default=1, type=int,
169 | help="Specify number of pseudocounts [default: %(default)s]")
170 | grp_pc.add_argument("--pc-pair-count", dest="pseudocount_pair", default=1, type=int,
171 | help="Specify number of pseudocounts for pairwise frequencies [default: %(default)s]")
172 |
173 |
174 | scores = parser.add_argument_group("Alternative Coevolution Scores")
175 | scores.add_argument("--compute-omes", dest="omes", action="store_true", default=False,
176 | help="Compute OMES scores as in Kass and Horovitz 2002. [default: %(default)s]")
177 | scores.add_argument("--omes-fodoraldrich", dest="omes_fodoraldrich", action="store_true", default=False,
178 | help="OMES option: according to Fodor & Aldrich 2004. [default: %(default)s]")
179 | scores.add_argument("--compute-mi", dest="mi", action="store_true", default=False,
180 | help="Compute mutual information (MI) . [default: %(default)s]")
181 | scores.add_argument("--mi-normalized", dest="mi_normalized", action="store_true", default=False,
182 | help="MI option: Compute normalized MI according to Martin et al 2005 . [default: %(default)s]")
183 | scores.add_argument("--mi-pseudocounts", dest="mi_pseudocounts", action="store_true", default=False,
184 | help="MI option: Compute MI with pseudocounts . [default: %(default)s]")
185 |
186 |
187 |
188 | args = parser.parse_args()
189 |
190 |
191 | if not args.optimize and not args.initrawfile:
192 | parser.error("--do-not-optimize is only supported when -i (--init-from-raw) is specified!")
193 |
194 | return args
195 |
196 |
def main():
    """
    Entry point of the CCMpred command line tool.

    Parses command line options, reads and preprocesses the alignment,
    optionally optimizes the MRF potentials (pLL or CD/PCD objective),
    post-processes the couplings into (corrected) contact matrices and
    writes the requested output files.

    Exits with 0 on success; if optimization was run and the optimizer
    returned a negative status code, exits with its negated value.
    """

    # read command line options
    opt = parse_args()

    # print logo
    if opt.logo:
        ccmpred.logo.logo()

    # set OMP environment variable for number of threads
    # NOTE: must be set before the OpenMP-parallelized C extensions are invoked
    os.environ['OMP_NUM_THREADS'] = str(opt.num_threads)
    print("Using {0} threads for OMP parallelization.".format(os.environ["OMP_NUM_THREADS"]))

    # instantiate CCMpred
    ccm = CCMpred()

    # specify possible file paths (input alignment, output matrix, optional pdb / initial raw file)
    ccm.set_alignment_file(opt.alnfile)
    ccm.set_matfile(opt.matfile)
    ccm.set_pdb_file(opt.pdbfile)
    ccm.set_initraw_file(opt.initrawfile)

    # read alignment and possibly remove gapped sequences and positions
    # (thresholds controlled by --max-gap-pos / --max-gap-seq)
    ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq)

    # compute sequence weights (in order to reduce sampling bias)
    ccm.compute_sequence_weights(opt.weight, opt.wt_cutoff)

    # compute amino acid counts and frequencies adding pseudo counts for non-observed amino acids
    ccm.compute_frequencies(opt.pseudocounts, opt.pseudocount_single, opt.pseudocount_pair)

    # read pdb file if CCMpred is setup as a constrained run
    if opt.pdbfile:
        ccm.read_pdb(opt.contact_threshold)


    # if alternative (local) coevolution scores are specified: compute these, write them and exit
    if opt.omes:
        ccm.compute_omes(opt.omes_fodoraldrich)
        ccm.write_matrix()
        sys.exit(0)

    if opt.mi:
        ccm.compute_mutual_info(opt.mi_normalized, opt.mi_pseudocounts)
        ccm.write_matrix()
        sys.exit(0)

    # setup L2 regularization
    ccm.specify_regularization(opt.lambda_single, opt.lambda_pair_factor,
                               reg_type="L2", scaling="L", single_prior=opt.single_prior)

    # intialise single and pair potentials either:
    #   - according to regularization priors
    #   - from initrawfile (accounting for removal of many gapped positions, if applicable)
    ccm.intialise_potentials()


    # optimize objective function (pLL or CD/PCD) with optimization algorithm (LBFGS, CG, GD or ADAM)
    if opt.optimize:

        # initialize log object (optionally with progress plotting)
        ccm.initiate_logging(opt.plot_opt_progress)

        # minimize objective function with corresponding optimization algorithm
        ccm.minimize(opt)
    else:
        print("\nDo not optimize but use model parameters provided by {0}\n".format(opt.initrawfile))




    ### Post Processing


    # specify meta data, and write (corrected) contact matrices to files
    if opt.matfile:

        # Compute contact score (frobenius norm) by recentering potentials
        # TODO: other scores can be added ...
        ccm.compute_contact_matrix(recenter_potentials=True, frob=True)

        # compute corrected contact maps (removing entropy/phylogenetic biases)
        # TODO: other corrections can be added ...
        ccm.compute_correction(
            apc_file=opt.apc_file,
            entropy_correction_file=opt.entropy_correction_file
        )

        ccm.write_matrix()

    # write model parameters in binary format
    if opt.out_binary_raw_file:
        ccm.write_binary_raw(opt.out_binary_raw_file)


    # propagate a failed optimization to the shell as a non-zero exit code
    exitcode = 0
    if opt.optimize:
        if ccm.algret['code'] < 0:
            exitcode =-ccm.algret['code']
    sys.exit(exitcode)
297 |
298 |
299 |
# script entry point
if __name__ == '__main__':
    main()
302 |
--------------------------------------------------------------------------------
/ccmpred/substitution_matrices.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
def matrianglify(data, size=20):
    """Make a symmetric size x size matrix out of an array of triangle data.

    Parameters
    ----------
    data : array-like
        The size*(size+1)/2 values of the lower triangle (including the
        diagonal), given row by row.
    size : int, optional (default=20)
        Edge length of the resulting square matrix.

    Returns
    -------
    ndarray
        Symmetric (size, size) matrix.
    """
    mat = np.zeros((size, size))
    mat[np.tril_indices(size)] = data

    # Mirror the strict lower triangle into the upper triangle.
    # Bugfix: the previous `mat[np.triu_indices(size)] = data` wrote the
    # lower-triangle-ordered values into the upper triangle in the wrong
    # order (and overwrote the diagonal), producing a non-symmetric matrix
    # for size > 2.
    mat = mat + np.tril(mat, -1).T

    return mat
11 |
12 |
# BLOSUM62 matrix as P(a, b)
# from HH-suite hhmatrices.h @ https://github.com/soedinglab/hh-suite/blob/5015f267a051f9b7ebffcec04b7f0596ad01bbbd/src/hhmatrices.cpp
# The values are the lower triangle (including the diagonal) listed row by row
# in the amino acid order of the header line; matrianglify expands them into
# the full symmetric 20 x 20 matrix.
BLOSUM62 = matrianglify([
    #  A       R       N       D       C       Q       E       G       H       I       L       K       M       F       P       S       T       W       Y       V
    0.0215,
    0.0023, 0.0178,
    0.0019, 0.0020, 0.0141,
    0.0022, 0.0016, 0.0037, 0.0213,
    0.0016, 0.0004, 0.0004, 0.0004, 0.0119,
    0.0019, 0.0025, 0.0015, 0.0016, 0.0003, 0.0073,
    0.0030, 0.0027, 0.0022, 0.0049, 0.0004, 0.0035, 0.0161,
    0.0058, 0.0017, 0.0029, 0.0025, 0.0008, 0.0014, 0.0019, 0.0378,
    0.0011, 0.0012, 0.0014, 0.0010, 0.0002, 0.0010, 0.0014, 0.0010, 0.0093,
    0.0032, 0.0012, 0.0010, 0.0012, 0.0011, 0.0009, 0.0012, 0.0014, 0.0006, 0.0184,
    0.0044, 0.0024, 0.0014, 0.0015, 0.0016, 0.0016, 0.0020, 0.0021, 0.0010, 0.0114, 0.0371,
    0.0033, 0.0062, 0.0024, 0.0024, 0.0005, 0.0031, 0.0041, 0.0025, 0.0012, 0.0016, 0.0025, 0.0161,
    0.0013, 0.0008, 0.0005, 0.0005, 0.0004, 0.0007, 0.0007, 0.0007, 0.0004, 0.0025, 0.0049, 0.0009, 0.0040,
    0.0016, 0.0009, 0.0008, 0.0008, 0.0005, 0.0005, 0.0009, 0.0012, 0.0008, 0.0030, 0.0054, 0.0009, 0.0012, 0.0183,
    0.0022, 0.0010, 0.0009, 0.0012, 0.0004, 0.0008, 0.0014, 0.0014, 0.0005, 0.0010, 0.0014, 0.0016, 0.0004, 0.0005, 0.0191,
    0.0063, 0.0023, 0.0031, 0.0028, 0.0010, 0.0019, 0.0030, 0.0038, 0.0011, 0.0017, 0.0024, 0.0031, 0.0009, 0.0012, 0.0017, 0.0126,
    0.0037, 0.0018, 0.0022, 0.0019, 0.0009, 0.0014, 0.0020, 0.0022, 0.0007, 0.0027, 0.0033, 0.0023, 0.0010, 0.0012, 0.0014, 0.0047, 0.0125,
    0.0004, 0.0003, 0.0002, 0.0002, 0.0001, 0.0002, 0.0003, 0.0004, 0.0002, 0.0004, 0.0007, 0.0003, 0.0002, 0.0008, 0.0001, 0.0003, 0.0003, 0.0065,
    0.0013, 0.0009, 0.0007, 0.0006, 0.0003, 0.0007, 0.0009, 0.0008, 0.0015, 0.0014, 0.0022, 0.0010, 0.0006, 0.0042, 0.0005, 0.0010, 0.0009, 0.0009, 0.0102,
    0.0051, 0.0016, 0.0012, 0.0013, 0.0014, 0.0012, 0.0017, 0.0018, 0.0006, 0.0120, 0.0095, 0.0019, 0.0023, 0.0026, 0.0012, 0.0024, 0.0036, 0.0004, 0.0015, 0.0196
])
38 |
--------------------------------------------------------------------------------
/ccmpred/trees.py:
--------------------------------------------------------------------------------
1 | import Bio.Phylo.BaseTree
2 | import Bio.Phylo
3 | import numpy as np
4 | import ccmpred.sampling
5 |
6 |
class CCMTree(object):
    """A phylogenetic tree topology along which sequences can later be sampled.

    The tree is either read from a newick file (load_tree) or generated as a
    binary/star topology (specify_tree); determine_tree_properties then
    derives the vertex-level arrays used by the sampling code.
    """

    def __init__(self):
        """Initialise all class attributes."""

        # name(s) of the clade(s) that become direct children of the root (see split_tree)
        self.id0 = ["root"]
        self.ids = None              # names of the leaf nodes
        self.branch_lengths = None   # per-vertex branch lengths (BFS order)
        self.n_children = None       # per-vertex number of children (BFS order)
        self.n_vertices = None       # total number of tree vertices
        self.n_leaves = None         # number of leaf nodes
        self.tree = None             # underlying Bio.Phylo tree object
        self.type = None             # topology type: "newick", "binary" or "star"

    def load_tree(self, tree_file):
        """
        Read a tree topology from a newick file.

        Parameters
        ----------
        tree_file: str
            path to a newick type tree topology file

        Returns
        -------
        bool
            True if successful, False otherwise.

        """

        self.type = "newick"

        try:
            self.tree = Bio.Phylo.read(tree_file, "newick")
        except ValueError as e:
            print("Error while reading tree file {0} : {1}".format(tree_file, e))
            return False
        except OSError as e:
            print("Error while reading tree file {0} : {1}".format(tree_file, e))
            return False


        self.determine_tree_properties()

        # bugfix: the documented success value was previously missing
        # (the method fell through and returned None)
        return True


    def specify_tree(self, nseq, tree_source):
        """
        Generate a tree topology with NSEQ leaves.

        Parameters
        ----------
        nseq: int
            Specifies the number of leave nodes representing sequences
        tree_source: str
            specifies the tree topology [star|binary]

        Returns
        -------
        bool
            True if successful, False otherwise.

        """


        if tree_source == "binary":
            self.type = "binary"
            self.tree = create_binary_tree(nseq, root_name=self.id0[0])
        elif tree_source == "star":
            self.type = "star"
            self.tree = create_star_tree(nseq, root_name=self.id0[0])

        self.determine_tree_properties()

        return True


    def determine_tree_properties(self):
        """Derive vertex counts, branch lengths and leaf names from self.tree."""

        # reroot so that the id0 clades hang directly off an artificial root
        tree_split = split_tree(self.tree, self.id0)
        tree_bfs = [c for c in bfs_iterator(tree_split.clade)]

        self.n_children = np.array([len(c.clades) for c in tree_bfs], dtype='uint64')
        self.branch_lengths = np.array([c.branch_length for c in tree_bfs], dtype=np.dtype('float64'))
        self.n_vertices = len(tree_bfs)
        self.n_leaves = len(tree_split.get_terminals())
        self.ids = [l.name for l in tree_split.get_terminals()]

        depth_min, depth_max = get_child_depth_range(tree_split.clade)
        # branch_lengths[0] and [1] belong to the artificial root and its single
        # id0 child; both are set to 0 in split_tree, hence they are excluded
        # from the reported average branch length
        print(
            "Created {0} tree with {1} leaves, {2} nodes, avg branch length={3}, depth_min={4:.4e}, depth_max={5:.4e}\n".format(
                self.type, self.n_leaves, self.n_vertices, np.round(np.mean(self.branch_lengths[2:]), decimals=3),
                depth_min, depth_max))
96 |
97 |
98 |
def split_tree(tree, id0):
    """Build a new tree whose root directly carries the clades named in id0.

    The clades are looked up by name in the given tree; the fresh root and
    the re-attached clades all get a branch length of 0.
    """

    node_by_name = {}
    for node in bfs_iterator(tree.clade):
        node_by_name[node.name] = node

    rerooted = Bio.Phylo.BaseTree.Tree()
    rerooted.clade.branch_length = 0
    rerooted.clade.clades = [node_by_name[name] for name in id0]

    for child in rerooted.clade.clades:
        child.branch_length = 0

    return rerooted
113 |
def bfs_iterator(clade):
    """Yield `clade` followed by all of its descendants.

    For every visited node, all direct children are produced before the
    subtree below each child is expanded.
    """

    def descendants(node):
        # direct children first ...
        yield from node.clades
        # ... then the subtree below each child, in order
        for child in node.clades:
            yield from descendants(child)

    yield clade
    yield from descendants(clade)
129 |
def get_child_depth_range(clade):
    """Return the minimum and maximum cumulative branch length from `clade`
    down to any leaf (the clade's own branch length is included)."""

    lo = float('inf')
    hi = float('-inf')

    frontier = [(0, clade)]
    while frontier:
        nxt = []
        for depth, node in frontier:
            total = depth + node.branch_length
            if node.clades:
                # internal node: descend one level
                nxt.extend((total, child) for child in node.clades)
            else:
                # leaf: record its depth
                lo = min(lo, total)
                hi = max(hi, total)
        frontier = nxt

    return lo, hi
152 |
def get_seq0_mrf(x, ncol, gibbs_steps):
    """
    Sample the root sequence of the tree (the common ancestor).

    Starting from a poly-A sequence of length NCOL, Gibbs sampling with the
    Markov-Random-Field (MRF aka Potts) model parameterised by X yields the
    new sequence.

    Parameters
    ----------
    x : ndarray
        1D float containing concatenation of single and pair potentials specifiying the MRF
    ncol : int
        protein/sequence length
    gibbs_steps: int
        number of Gibbs steps used in Gibbs sampling procedure
        (one Gibbs step corresponds to sampling a new amino acid for every position)

    Returns
    -------
    ndarray
        integer array representing the newly sampled sequence

    """

    # all-zero codes encode a poly-A alignment of a single sequence
    start_seq = np.zeros((1, ncol), dtype="uint8")

    # gibbs sample a new sequence from the MRF
    return ccmpred.sampling.gibbs_sample_sequences(x, start_seq, gibbs_steps)
184 |
def create_binary_tree(nseqs, depth=1, root_name="root"):
    """
    Create a binary tree topology.

    The tree is split ceil(log2(nseqs)) times so that it has at least NSEQS
    leaves (exactly NSEQS when it is a power of 2); the total root-to-leaf
    depth is DEPTH, distributed evenly over the levels.

    Parameters
    ----------
    nseqs : int
        the number of leave nodes that represent sequences
    depth : int, optional(default=1)
        the depth of the tree
    root_name: str, optional(default="root")
        name of the root sequence

    Returns
    -------
    Bio.Phylo.BaseTree.Tree
        topology of a binary tree

    """

    n_levels = np.ceil(np.log2(nseqs))
    edge_length = float(depth) / n_levels

    def grow(node, remaining):
        # stop splitting once the requested number of levels is reached
        if remaining == 0:
            return

        # children are named by appending "A"/"B" to the parent's name
        left = Bio.Phylo.BaseTree.Clade(name=node.name + "A", branch_length=edge_length)
        right = Bio.Phylo.BaseTree.Clade(name=node.name + "B", branch_length=edge_length)

        grow(left, remaining - 1)
        grow(right, remaining - 1)

        node.clades = [left, right]

    tree = Bio.Phylo.BaseTree.Tree(rooted=False)
    tree.clade.name = root_name
    tree.clade.branch_length = 0
    grow(tree.clade, n_levels)

    return tree
230 |
def create_star_tree(nseqs, depth=1, root_name="root"):
    """
    Create a star tree topology.

    All NSEQS leaves hang directly off the root, each at distance DEPTH.

    Parameters
    ----------
    nseqs : int
        the number of leave nodes that represent sequences
    depth : int, optional(default=1)
        the depth of the tree
    root_name: str, optional(default="root")
        name of the root sequence

    Returns
    -------
    Bio.Phylo.BaseTree.Tree
        topology of a star tree

    """

    star = Bio.Phylo.BaseTree.Tree(rooted=False)
    star.clade.name = root_name
    star.clade.branch_length = 0

    leaves = []
    for i in range(nseqs):
        # leaves are named C0, C1, ...
        leaves.append(Bio.Phylo.BaseTree.Clade(name="C{0}".format(i), branch_length=depth))
    star.clade.clades = leaves

    return star
264 |
--------------------------------------------------------------------------------
/ccmpred/weighting/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from ccmpred.weighting.cext import count_ids, calculate_weights_simple
3 | import ccmpred.counts
4 | from ccmpred.pseudocounts import PseudoCounts
5 |
def get_HHsuite_neff(msa):
    """
    Number of effective sequences as defined in the HHsuite manual:
    2 raised to the average per-column entropy (in bits) of the alignment.

    Neff is therefore bounded by 0 from below and 20 from above; in practice
    it is bounded by the entropy of a column with background amino acid
    distribution f_a (approx. 16).

    Parameters
    ----------
    msa
        integer-encoded alignment (one row per sequence)

    Returns
    -------
    float
        the effective number of sequences

    """

    # per-column amino acid counts; frequencies include the gap state and are
    # regularised with a small pseudo-count so the logarithm is always defined
    aa_counts = ccmpred.counts.single_counts(msa)
    aa_freqs = (aa_counts + 1e-3) / np.sum(aa_counts, axis=1)[:, np.newaxis]

    # drop the gap column (state 20) before computing the entropies
    aa_freqs = aa_freqs[:, :20]
    col_entropy = - np.sum(aa_freqs * np.log2(aa_freqs), axis=1)

    return 2 ** np.mean(col_entropy)
35 |
def weights_uniform(msa):
    """Give every sequence in the alignment the same weight of 1.0."""
    n_seqs = msa.shape[0]
    return np.ones(n_seqs, dtype="float64")
39 |
40 |
def weights_simple(msa, cutoff=0.8):
    """Sequence reweighting from the Morcos et al. 2011 DCA paper.

    A cutoff >= 1 disables clustering and falls back to uniform weights.
    """

    if cutoff < 1:
        return calculate_weights_simple(msa, cutoff)

    return weights_uniform(msa)
48 |
49 |
50 |
# dispatch table mapping the weighting option names to weighting functions;
# both callables take (msa, cutoff) even though uniform weighting ignores the cutoff
WEIGHTING_TYPE = {
    'simple': lambda msa, cutoff: weights_simple(msa, cutoff),
    'uniform': lambda msa, cutoff: weights_uniform(msa)
}
55 |
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import numpy.ctypeslib as npct
3 | import ctypes
4 | import os.path
5 |
# ctypes array type specifications matching the C function signatures below
array_1d_double = npct.ndpointer(dtype=np.dtype('double'), ndim=1, flags='CONTIGUOUS')
array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')
array_2d_uint64 = npct.ndpointer(dtype=np.dtype('uint64'), ndim=2, flags='CONTIGUOUS')

# load the compiled shared library from the _build directory next to this file
libweighting = npct.load_library('libweighting', os.path.join(os.path.dirname(__file__), '_build'))

# void count_ids(msa, ids, nrow, ncol) -- see weighting.c
libweighting.count_ids.restype = None
libweighting.count_ids.argtypes = [
    array_2d_char,  # *msa
    array_2d_uint64,  # *n_ids
    ctypes.c_uint64,  # nrow
    ctypes.c_uint64,  # ncol
]

# void calculate_weights_simple(msa, weights, cutoff, nrow, ncol) -- see weighting.c
libweighting.calculate_weights_simple.restype = None
libweighting.calculate_weights_simple.argtypes = [
    array_2d_char,  # *msa
    array_1d_double,  # *weights
    ctypes.c_double,  # cutoff
    ctypes.c_uint64,  # nrow
    ctypes.c_uint64,  # ncol
]
28 |
29 |
def count_ids(msa):
    """Return the full symmetric matrix of pairwise sequence identity counts."""
    n_seqs = msa.shape[0]
    id_matrix = np.zeros((n_seqs, n_seqs), dtype="uint64")
    libweighting.count_ids(msa, id_matrix, *msa.shape)

    # the C code only fills one triangle; mirror it without double-counting the diagonal
    return id_matrix + id_matrix.T - np.diag(id_matrix.diagonal())
36 |
37 |
def calculate_weights_simple(msa, cutoff):
    """Compute simple sequence weights (Morcos et al. 2011) via the C extension."""
    n_seqs = msa.shape[0]
    w = np.zeros(n_seqs, dtype='double')
    libweighting.calculate_weights_simple(msa, w, cutoff, *msa.shape)

    return w
44 |
45 |
if __name__ == '__main__':
    # small smoke test: count pairwise identities on a toy alignment and print them
    msa = np.array(
        [
            [0, 1, 2],
            [0, 3, 4],
            [0, 3, 2],
            [5, 6, 7]
        ],
        dtype=np.uint8
    )

    print(msa)
    print(count_ids(msa))
59 |
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/weighting.c:
--------------------------------------------------------------------------------
#include <stdlib.h>
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <omp.h>

#include "weighting.h"
9 |
10 | /**
11 | * Count the number of sequence identities for all rows in an MSA
12 | *
13 | * @param[in] seq The MSA to work on
14 | * @param[out] counts The number of sequence identities
15 | * @param[in] nrow The number of columns in the MSA
16 | * @param[in] ncol The number of rows in the MSA
17 | */
18 | void count_ids(
19 | const uint8_t *msa,
20 | uint64_t *ids,
21 | const uint64_t nrow,
22 | const uint64_t ncol
23 | ) {
24 | uint64_t nij = nrow * (nrow + 1) / 2;
25 |
26 | omp_set_dynamic(0);
27 |
28 | #pragma omp parallel
29 | {
30 | uint64_t ij;
31 |
32 | #pragma omp for nowait private(ij)
33 | for(ij = 0; ij < nij; ij++) {
34 |
35 | // compute i and j from ij
36 | // http://stackoverflow.com/a/244550/1181102
37 | uint64_t i, j;
38 | {
39 | uint64_t ii = nrow * (nrow + 1) / 2 - 1 - ij;
40 | uint64_t K = floor((sqrt(8 * ii + 1) - 1) / 2);
41 | i = nrow - 1 - K;
42 | j = ij - nrow * i + i * (i + 1) / 2;
43 | }
44 |
45 | uint64_t my_ids = 0;
46 | for(uint64_t k = 0; k < ncol; k++) {
47 | if(msa[i * ncol + k] == msa[j * ncol + k]) {
48 | my_ids++;
49 | }
50 | }
51 |
52 | ids[i * nrow + j] = my_ids;
53 | }
54 | }
55 | }
56 |
57 |
/**
 * Compute simple sequence weights (Morcos et al. 2011 DCA reweighting).
 *
 * For every pair of sequences (including each sequence with itself) the
 * number of identical columns is counted; pairs reaching at least
 * ceil(cutoff * ncol) identities increment both sequences' counters.
 * The self pair always matches and so contributes 2, which together with
 * the final "- 1" leaves exactly the cluster size in the denominator:
 * weights[i] = 1 / (number of sequences with >= cutoff identity to i,
 * including i itself).
 *
 * @param[in] msa The MSA to work on (nrow x ncol, row-major)
 * @param[in,out] weights Zero-initialised array of length nrow; receives the weights
 * @param[in] cutoff Fractional sequence identity threshold
 * @param[in] nrow The number of rows (sequences) in the MSA
 * @param[in] ncol The number of columns (positions) in the MSA
 */
void calculate_weights_simple(
    const uint8_t *msa,
    double *weights,
    double cutoff,
    const uint64_t nrow,
    const uint64_t ncol
) {
    uint64_t nij = nrow * (nrow + 1) / 2;

    /* the identity threshold is loop-invariant: hoist it out of the pair loop
     * (it was previously recomputed for every pair) */
    const uint64_t idthres = ceil(cutoff * ncol);

    omp_set_dynamic(0);

    #pragma omp parallel
    {
        uint64_t ij;

        #pragma omp for nowait private(ij)
        for(ij = 0; ij < nij; ij++) {

            // compute i and j from ij
            // http://stackoverflow.com/a/244550/1181102
            uint64_t i, j;
            {
                uint64_t ii = nrow * (nrow + 1) / 2 - 1 - ij;
                uint64_t K = floor((sqrt(8 * ii + 1) - 1) / 2);
                i = nrow - 1 - K;
                j = ij - nrow * i + i * (i + 1) / 2;
            }

            /* count identical columns for sequences i and j */
            uint64_t my_ids = 0;
            for(uint64_t k = 0; k < ncol; k++) {
                if(msa[i * ncol + k] == msa[j * ncol + k] ) {
                    my_ids++;
                }
            }

            /* similar pair: both sequences' cluster counters grow */
            if(my_ids >= idthres) {
                #pragma omp atomic
                weights[i]++;
                #pragma omp atomic
                weights[j]++;
            }

        }
    }

    /* weights[i] now holds cluster size + 1 (self pair counted twice) */
    for(uint64_t i = 0; i < nrow; i++) {
        weights[i] = 1.0 / (weights[i] - 1);
    }

    fflush(stdout);
}
115 |
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/weighting.h:
--------------------------------------------------------------------------------
#ifndef WEIGHTING_H
#define WEIGHTING_H

/* restored: uint8_t / uint64_t below require <stdint.h>
 * (the include target was lost, leaving a bare #include) */
#include <stdint.h>

/* integer code used for gap characters in the encoded MSA */
#define GAP 20

/* Count pairwise sequence identities; fills one triangle of ids (nrow x nrow). */
void count_ids(
    const uint8_t *msa,
    uint64_t *ids,
    const uint64_t nrow,
    const uint64_t ncol
);

/* Morcos et al. 2011 style weights: 1 / (cluster size at the given identity cutoff). */
void calculate_weights_simple(
    const uint8_t *msa,
    double *weights,
    double cutoff,
    const uint64_t nrow,
    const uint64_t ncol
);


#endif
24 |
--------------------------------------------------------------------------------
/ci_support/1atzA.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ci_support/1atzA.braw.gz
--------------------------------------------------------------------------------
/ci_support/mrf_params.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ci_support/mrf_params.braw.gz
--------------------------------------------------------------------------------
/ci_support/phylo.newick:
--------------------------------------------------------------------------------
1 | ((((((0:0.16667,1:0.16667)6df929e2-2da7-4bf7-8648-c74e118a778d:0.16667,(2:0.16667,3:0.16667)577d47bf-2a72-4aa8-87d3-acd4231f7d54:0.16667)98786d16-1782-4fac-89a2-f52090da3639:0.16667,((4:0.16667,5:0.16667)87e05f7c-af92-4197-ac3a-3f10577b35d1:0.16667,(6:0.16667,7:0.16667)63567a61-133e-4784-abcb-515c013f10be:0.16667)8e05d0d4-521b-430b-ba71-d0220b1e8350:0.16667)4c7b4015-83bc-41c3-ba34-535ea40f7cde:0.16667,(((8:0.16667,9:0.16667)34d6ba60-ba0f-4464-86fa-5401c1682d9b:0.16667,(10:0.16667,11:0.16667)27f94871-c557-413c-b369-7d96e0534a70:0.16667)e7d7f0e9-8189-46b9-83b0-a14d12081fa1:0.16667,((12:0.16667,13:0.16667)c4f0c5ea-a173-4339-925c-8c3884be556d:0.16667,(14:0.16667,15:0.16667)6b05b82c-3a54-4afa-933f-91eb7255d79d:0.16667)5a9c9457-ea99-43dd-8c10-79a479aaf44a:0.16667)a1d6236e-7432-418e-8050-fc21edaa6bb8:0.16667)9bf01622-715d-454d-85a9-713ac626c7c0:0.16667,((((16:0.16667,17:0.16667)f289fcdf-7e24-4f45-aa0e-f5e4f82fcb29:0.16667,(18:0.16667,19:0.16667)3a35be31-4498-4cb6-a519-7d02726362be:0.16667)658a8453-4e75-4274-8a02-172832b23dd3:0.16667,((20:0.16667,21:0.16667)71a0d481-f62f-421e-b6c6-23ef55867a9c:0.16667,(22:0.16667,23:0.16667)95275b51-fdc9-48a1-a8cc-39c15390d4a7:0.16667)394eb006-4b24-449d-96ef-114be429a0ce:0.16667)ed1128f1-11a7-4803-8c6f-6c3b7bb6da0f:0.16667,(((24:0.16667,25:0.16667)5989f16f-0d2a-4f4a-ae34-ddc91afe5f93:0.16667,(26:0.16667,27:0.16667)a82cf7ed-3e95-4712-8d10-0bc26bdfb87c:0.16667)39012f67-76c6-4c81-9ba8-a53e15de5e5a:0.16667,((28:0.16667,29:0.16667)339aba95-a80b-4252-9424-a37acfa31d6d:0.16667,(30:0.16667,31:0.16667)a96d5b22-5276-4d78-aae0-bde31e0aeb55:0.16667)90768ce1-6a1c-41d3-974c-ebff07c6cc91:0.16667)0dcf1544-72f3-447b-8682-82d20306d281:0.16667)1900092e-1d1c-4d2b-96c8-befa5ddad166:0.16667)ae31cd88-a99f-4476-9876-49b525993d65:0.16667,(((((32:0.16667,33:0.16667)d13716f4-c0a6-481f-b055-574c1acb4e61:0.16667,(34:0.16667,35:0.16667)1ecd3fbd-5333-4000-a17a-d3006174406e:0.16667)99f6c263-a33d-412e-8fa5-e612b6692646:0.16667,((36:0.16667,37:0.16667)b860915d-c75c-4da5
-b2e9-e0a41cb0b520:0.16667,(38:0.16667,39:0.16667)fd10aa23-344e-4b4f-a37a-f48548b5ae0c:0.16667)a6c60d0a-e005-43eb-9827-16900b414fcd:0.16667)9daac0ef-9d25-4947-a531-d74d27753432:0.16667,(((40:0.16667,41:0.16667)28cd4007-537f-4773-b60d-88c6ac350982:0.16667,(42:0.16667,43:0.16667)2d788dbe-aea8-4378-8ef8-493d65b1db12:0.16667)46b597a4-4330-4451-9bfc-1ca67958ebe8:0.16667,((44:0.16667,45:0.16667)170f012b-9e48-46ae-bee3-79dfe4b9835d:0.16667,(46:0.16667,47:0.16667)d5606d8a-9b75-45ad-88b7-6ea60c6a1b8c:0.16667)52ed9b25-2529-4f99-b2d6-e175624b3584:0.16667)cdbcdfd8-938a-40d2-9ed4-fe98b3a6def3:0.16667)5497cf05-a28e-4bf3-ba7f-74587192d143:0.16667,((((48:0.16667,49:0.16667)ec6cb031-6fe9-4458-904a-11b8637eb07f:0.16667,(50:0.16667,51:0.16667)d063be88-ce9d-430d-b390-287dc792b25c:0.16667)d5a8b09d-1aa6-4a27-af3a-2ec3e18418c3:0.16667,((52:0.16667,53:0.16667)eb8ba81c-eee0-469b-a1c8-f1f110396562:0.16667,(54:0.16667,55:0.16667)9fd2da20-caf7-4c13-81d1-6af9da49106f:0.16667)fb4ef037-3797-493a-a646-5fec4c5f05d4:0.16667)954f7def-41e0-47ec-acb5-e3a560f3fa16:0.16667,(((56:0.16667,57:0.16667)3e4cbb37-3896-4f0a-834d-02c05903b0a2:0.16667,(58:0.16667,59:0.16667)075a3d1e-8ff5-4748-8e27-cbbd1bdd1446:0.16667)c8d354fc-ee87-4164-8ba2-b9eb599e1826:0.16667,((60:0.16667,61:0.16667)e5891dff-7724-489b-80b0-57e9820b38e6:0.16667,(62:0.16667,63:0.16667)afef3bca-d495-4825-8519-1e13e5f78ff1:0.16667)c84d298f-682e-4409-b551-b209ab2105a3:0.16667)6e0f2d8f-a5ce-4336-b4c5-5e0cf3597df1:0.16667)efe3201f-2846-4657-b746-196ffbee17ff:0.16667)4d878a75-5691-42c1-b50d-abce5b9acf39:0.16667)root:0.00000;
2 |
--------------------------------------------------------------------------------
/ci_support/random_start_sequence.py:
--------------------------------------------------------------------------------
1 | import ccmpred.raw
2 | from Bio.Seq import Seq
3 | from Bio.SeqRecord import SeqRecord
4 | from Bio import SeqIO
5 | import random
6 | import sys
7 |
def write_new_tree_for_ccmgen(file_coupling : str, file_name : str):
    """
    Write a random root sequence for CCMgen to FILE_NAME in FASTA format.

    The sequence length is taken from the single potentials of the binary
    raw coupling file FILE_COUPLING; each position is drawn uniformly from
    the 20 amino acids plus the gap character '-'.

    Parameters
    ----------
    file_coupling : str
        path to a msgpack binary raw file holding the MRF parameters
    file_name : str
        path of the FASTA file to write
    """

    # removed leftover debug output: print("Enter")
    raw = ccmpred.raw.parse_msgpack(file_coupling)
    single_potentials = raw.x_single
    size_prot = single_potentials.shape[0]

    sequence = ''.join([random.choice('ACDEFGHIKLMNPQRSTVWY-') for x in range(size_prot)])

    print("Random sequence : %s"%sequence)

    record = SeqRecord(
        Seq(sequence),
        id="ID_0.1",
        name="RandomSequences",
        description="Random Sequences for the root of a phylogeny tree",
    )

    SeqIO.write(record, file_name, "fasta")
27 |
if __name__ == '__main__':
    # Forward positional command-line arguments directly to the worker:
    # argv[1] = coupling file, argv[2] = output FASTA path.
    cli_args = sys.argv[1:]
    write_new_tree_for_ccmgen(*cli_args)
31 |
--------------------------------------------------------------------------------
/ci_support/run_tests.sh:
--------------------------------------------------------------------------------
1 | set -e
2 | ## New test
3 | python ci_support/random_start_sequence.py ci_support/1atzA.braw.gz ci_support/seq0_file.fasta
4 |
5 | ccmgen --tree-newick ci_support/1atzA_rootname.tree --seq0-file ci_support/seq0_file.fasta --mutation-rate 1 --num-threads 1 ci_support/1atzA.braw.gz sequences.msa
6 | ## Commented to reduce time for test
7 | #ccmgen --tree-newick ci_support/1atzA_rootname.tree --seq0-file ci_support/seq0_file.fasta --mutation-rate-neff --num-threads 1 --alnfile ci_support/1atzA.fas ci_support/1atzA.braw.gz sequences.msa
8 |
9 | ##Old test
10 | ccmgen --tree-newick ci_support/phylo.newick --aln-format psicov --mutation-rate 1 --num-threads 1 ci_support/mrf_params.braw.gz sequences.msa
11 | ccmgen --tree-newick ci_support/phylo.newick --aln-format fasta --mutation-rate 1 --num-threads 1 ci_support/mrf_params.braw.gz sequences.msa
12 |
--------------------------------------------------------------------------------
/example/1atzA.alignment_statistics.mcmc_pcd_vs_original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.alignment_statistics.mcmc_pcd_vs_original.png
--------------------------------------------------------------------------------
/example/1atzA.apc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.apc.png
--------------------------------------------------------------------------------
/example/1atzA.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.braw.gz
--------------------------------------------------------------------------------
/example/1atzA.pcd.apc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.pcd.apc.png
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from setuptools import setup, Extension, find_packages
2 |
def ext(name, sources=None, include_dirs=None, library_dirs=None, libraries=None,
        extra_compile_args=None, extra_link_args=None):
    """Build a setuptools ``Extension`` with the project's default flags.

    Fix: the original used mutable list literals as default argument
    values (shared across calls); replaced with the None-sentinel idiom.
    Effective defaults are unchanged: empty lists for the path/source
    options, and '-g -fopenmp -std=c99' / '-g -fopenmp' for the compile
    and link flags respectively.
    """
    if sources is None:
        sources = []
    if include_dirs is None:
        include_dirs = []
    if library_dirs is None:
        library_dirs = []
    if libraries is None:
        libraries = []
    if extra_compile_args is None:
        # Debug symbols + OpenMP parallelism + C99, matching the C sources.
        extra_compile_args = ['-g', '-fopenmp', '-std=c99']
    if extra_link_args is None:
        extra_link_args = ['-g', '-fopenmp']
    return Extension(name, include_dirs=include_dirs, library_dirs=library_dirs,
                     libraries=libraries, sources=sources,
                     extra_compile_args=extra_compile_args,
                     extra_link_args=extra_link_args)
5 |
# C extension modules. Note libtreecd compiles the cd objective sources
# into the same module so the tree-sampling code can call them directly.
_ext_modules = [
    ext(
        'ccmpred.objfun.pll.cext.libpll',
        sources=['ccmpred/objfun/pll/cext/pll.c'],
    ),
    ext(
        'ccmpred.objfun.cd.cext.libcd',
        sources=[
            'ccmpred/objfun/cd/cext/cd.c',
            'ccmpred/objfun/cd/cext/cdutil.c',
        ],
    ),
    ext(
        'ccmpred.counts.libmsacounts',
        sources=['ccmpred/counts/msacounts.c'],
    ),
    # gaps extension is built without OpenMP, hence the explicit flags.
    ext(
        'ccmpred.gaps.cext.libgaps',
        sources=['ccmpred/gaps/cext/gaps.c'],
        extra_compile_args=['-g', '-std=c99'],
        extra_link_args=['-g'],
    ),
    ext(
        'ccmpred.weighting.cext.libweighting',
        sources=['ccmpred/weighting/cext/weighting.c'],
    ),
    ext(
        'ccmpred.sampling.cext.libtreecd',
        include_dirs=['ccmpred/objfun/cd/cext'],
        sources=[
            'ccmpred/objfun/cd/cext/cd.c',
            'ccmpred/objfun/cd/cext/cdutil.c',
            'ccmpred/sampling/cext/treecd.c',
        ],
    ),
]

# Command-line entry points installed alongside the package.
_console_scripts = [
    'ccmpred=ccmpred.scripts.run_ccmpred:main',
    'ccmgen=ccmpred.scripts.run_ccmgen:main',
    'ccm_replace_gaps=ccmpred.scripts.replace_gaps:main',
    'ccm_plot=ccmpred.scripts.plot_ccmpred:main',
    'ccm_convert_aln=ccmpred.scripts.convert:main',
]

setup(
    name="ccmgen",
    version="1.0.0",
    description="Residue-residue contact prediction from correlated mutations predicted quickly and precisely",
    license="AGPLv3",
    author="Susann Vorberg, Stefan Seemayer, Johannes Soeding",
    author_email="Susann.Vorberg@gmail.com",
    url="https://github.com/soedinglab/ccmgen",
    packages=find_packages(),
    install_requires=['msgpack-python', 'numpy', 'plotly==3.0.0rc10', 'scipy', 'pandas', 'biopython', 'colorlover'],
    ext_modules=_ext_modules,
    entry_points={'console_scripts': _console_scripts},
)
62 |
--------------------------------------------------------------------------------