├── .gitignore ├── .pep8 ├── .ycm_extra_conf.py ├── LICENSE ├── README.md ├── azure-pipelines.yml ├── ccmpred ├── __init__.py ├── algorithm │ ├── __init__.py │ ├── gradient_descent.py │ └── lbfgs.py ├── centering.py ├── counts │ ├── __init__.py │ └── msacounts.c ├── gaps │ ├── __init__.py │ └── cext │ │ ├── __init__.py │ │ ├── gaps.c │ │ └── gaps.h ├── io │ ├── __init__.py │ ├── alignment.py │ ├── contactmatrix.py │ └── pdb.py ├── locmeth │ ├── __init__.py │ ├── mi │ │ └── __init__.py │ └── omes │ │ └── __init__.py ├── logo.py ├── monitor │ ├── __init__.py │ └── progress.py ├── objfun │ ├── __init__.py │ ├── cd │ │ ├── __init__.py │ │ └── cext │ │ │ ├── __init__.py │ │ │ ├── cd.c │ │ │ ├── cd.h │ │ │ ├── cdutil.c │ │ │ └── cdutil.h │ └── pll │ │ ├── __init__.py │ │ └── cext │ │ ├── __init__.py │ │ ├── pll.c │ │ └── pll.h ├── parameter_handling.py ├── plotting │ └── __init__.py ├── pseudocounts.py ├── raw │ ├── __init__.py │ ├── ccmraw.py │ ├── convert_msgpack.py │ └── convert_raw.py ├── regularization.py ├── sampling │ ├── __init__.py │ └── cext │ │ ├── __init__.py │ │ ├── treecd.c │ │ └── treecd.h ├── sanity_check.py ├── scripts │ ├── __init__.py │ ├── convert.py │ ├── plot_ccmpred.py │ ├── replace_gaps.py │ ├── run_ccmgen.py │ └── run_ccmpred.py ├── substitution_matrices.py ├── trees.py └── weighting │ ├── __init__.py │ └── cext │ ├── __init__.py │ ├── weighting.c │ └── weighting.h ├── ci_support ├── 1atzA.braw.gz ├── 1atzA.fas ├── 1atzA_rootname.tree ├── mrf_params.braw.gz ├── phylo.newick ├── random_start_sequence.py └── run_tests.sh ├── example ├── 1atzA.alignment_statistics.mcmc_pcd_vs_original.png ├── 1atzA.apc.html ├── 1atzA.apc.mat ├── 1atzA.apc.png ├── 1atzA.braw.gz ├── 1atzA.ec.mat ├── 1atzA.fas ├── 1atzA.noapc.mat ├── 1atzA.pcd.apc.png └── 1atzA.pdb └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | ### https://raw.github.com/github/gitignore/master/Python.gitignore 2 | 3 | # Byte-compiled / optimized 4 | __pycache__/ 5 | *.py[cod] 6 | 7 | # Distribution / packaging 8 | .Python 9 | env/ 10 | build/ 11 | develop-eggs/ 12 | dist/ 13 | eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # PyInstaller 24 | # Usually these files are written by a python script from a template 25 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 26 | *.manifest 27 | *.spec 28 | 29 | # Installer logs 30 | pip-log.txt 31 | pip-delete-this-directory.txt 32 | 33 | # Unit test / coverage reports 34 | htmlcov/ 35 | .tox/ 36 | .coverage 37 | .cache 38 | nosetests.xml 39 | coverage.xml 40 | 41 | # Translations 42 | *.mo 43 | *.pot 44 | 45 | # Django stuff: 46 | *.log 47 | 48 | #Pycharm 49 | .idea/ 50 | 51 | # Sphinx documentation 52 | docs/_build/ 53 | 54 | # PyBuilder 55 | target/ 56 | 57 | 58 | ### https://raw.github.com/github/gitignore/master/C.gitignore 59 | 60 | # Object files 61 | *.o 62 | *.ko 63 | *.obj 64 | *.elf 65 | 66 | # Libraries 67 | *.lib 68 | *.a 69 | *.la 70 | *.lo 71 | 72 | # Shared objects (inc. 
Windows DLLs) 73 | *.dll 74 | *.so 75 | *.so.* 76 | *.dylib 77 | 78 | # Executables 79 | *.exe 80 | *.out 81 | *.app 82 | *.i*86 83 | *.x86_64 84 | *.hex 85 | 86 | 87 | ### https://raw.github.com/github/gitignore/master/Global/vim.gitignore 88 | 89 | [._]*.s[a-w][a-z] 90 | [._]s[a-w][a-z] 91 | *.un~ 92 | Session.vim 93 | .netrwhist 94 | *~ 95 | -------------------------------------------------------------------------------- /.pep8: -------------------------------------------------------------------------------- 1 | [pep8] 2 | ignore = E501 3 | -------------------------------------------------------------------------------- /.ycm_extra_conf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import ycm_core 3 | 4 | flags = [ 5 | '-Wall', 6 | '-Wextra', 7 | '-Werror', 8 | '-std=c99', 9 | '-x', 10 | 'c', 11 | '-Iccmpred/objfun/cd/cext' 12 | ] 13 | 14 | 15 | compilation_database_folder = '' 16 | 17 | if os.path.exists(compilation_database_folder): 18 | database = ycm_core.CompilationDatabase(compilation_database_folder) 19 | else: 20 | database = None 21 | 22 | SOURCE_EXTENSIONS = ['.cpp', '.cxx', '.cc', '.c', '.m', '.mm'] 23 | 24 | 25 | def DirectoryOfThisScript(): 26 | return os.path.dirname(os.path.abspath(__file__)) 27 | 28 | 29 | def MakeRelativePathsInFlagsAbsolute(flags, working_directory): 30 | if not working_directory: 31 | return list(flags) 32 | new_flags = [] 33 | make_next_absolute = False 34 | path_flags = ['-isystem', '-I', '-iquote', '--sysroot='] 35 | for flag in flags: 36 | new_flag = flag 37 | 38 | if make_next_absolute: 39 | make_next_absolute = False 40 | if not flag.startswith('/'): 41 | new_flag = os.path.join(working_directory, flag) 42 | 43 | for path_flag in path_flags: 44 | if flag == path_flag: 45 | make_next_absolute = True 46 | break 47 | 48 | if flag.startswith(path_flag): 49 | path = flag[len(path_flag):] 50 | new_flag = path_flag + os.path.join(working_directory, path) 51 | break 52 | 53 | if new_flag: 54 | new_flags.append(new_flag) 55 | return new_flags 56 | 57 | 58 | def IsHeaderFile(filename): 59 | extension = os.path.splitext(filename)[1] 60 | return extension in ['.h', '.hxx', '.hpp', '.hh'] 61 | 62 | 63 | def GetCompilationInfoForFile(filename): 64 | # The compilation_commands.json file generated by CMake does not have entries 65 | # for header files. So we do our best by asking the db for flags for a 66 | # corresponding source file, if any. If one exists, the flags for that file 67 | # should be good enough. 
68 | if IsHeaderFile(filename): 69 | basename = os.path.splitext(filename)[0] 70 | for extension in SOURCE_EXTENSIONS: 71 | replacement_file = basename + extension 72 | if os.path.exists(replacement_file): 73 | compilation_info = database.GetCompilationInfoForFile( 74 | replacement_file) 75 | if compilation_info.compiler_flags_: 76 | return compilation_info 77 | return None 78 | return database.GetCompilationInfoForFile(filename) 79 | 80 | 81 | def FlagsForFile(filename, **kwargs): 82 | if database: 83 | # Bear in mind that compilation_info.compiler_flags_ does NOT return a 84 | # python list, but a "list-like" StringVec object 85 | compilation_info = GetCompilationInfoForFile(filename) 86 | if not compilation_info: 87 | return None 88 | 89 | final_flags = MakeRelativePathsInFlagsAbsolute( 90 | compilation_info.compiler_flags_, 91 | compilation_info.compiler_working_dir_) 92 | 93 | else: 94 | relative_to = DirectoryOfThisScript() 95 | final_flags = MakeRelativePathsInFlagsAbsolute(flags, relative_to) 96 | 97 | return { 98 | 'flags': final_flags, 99 | 'do_cache': True 100 | } 101 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # CCMgen and CCMpredPy 2 | [![Build Status](https://dev.azure.com/christianroth0419/christianroth/_apis/build/status/soedinglab.CCMgen?branchName=master)](https://dev.azure.com/christianroth0419/christianroth/_build/latest?definitionId=4&branchName=master) [![Generic badge](https://img.shields.io/badge/DOI-10.1371/journal.pcbi.1006526-blue.svg)](https://doi.org/10.1371/journal.pcbi.1006526) 3 | 4 | This repository provides a Python toolkit for learning second-order Markov Random Field (MRF) models from multiple sequence alignments of protein families and for using these models to generate realistic synthetic protein sequences; the functional form of the model is sketched below. 5 | 6 | CCMpredPy is a fast implementation of an evolutionary coupling method for learning a Markov Random Field (MRF) model for a protein family. The parameters of the MRF can be inferred either by pseudo-likelihood maximization or by persistent contrastive divergence. 7 | While state-of-the-art pseudo-likelihood models have consistently been found to work best for the purpose of predicting residue-residue contacts, models learned with persistent contrastive divergence are much more accurate in their fine statistics and are recommended for use with CCMgen to generate realistic sequence samples. 8 | 9 | CCMgen is a tool for sampling protein-like sequences from a second-order Markov Random Field (MRF) model such as the ones learned with CCMpredPy. The residues of generated sequences will obey the selection pressures described by the MRF with pairwise statistical couplings between residue positions. Furthermore, CCMgen provides full control over the generation of the synthetic alignment by allowing the user to specify the evolutionary times and the phylogeny along which the sequences are sampled. 10 | 11 | ## Citation 12 | Vorberg S, Seemayer S, Söding J. Synthetic protein alignments by CCMgen quantify noise in residue-residue contact prediction. PLoS computational biology. 2018 Nov 5;14(11):e1006526. 13 | 14 | ## License 15 | 16 | CCMgen and CCMpredPy are released under the [GNU AGPLv3](https://choosealicense.com/licenses/agpl-3.0/) license. 
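For orientation, the second-order MRF (Potts model) referred to above assigns each sequence x = (x_1, ..., x_L) a probability of the following form, with single potentials v_i and pairwise couplings w_ij (notation follows the CCMgen paper; this sketch is a summary added for readability, not code from this repository):

```latex
p(x_1, \dots, x_L \mid v, w) \;=\; \frac{1}{Z(v,w)}
  \exp\left( \sum_{i=1}^{L} v_i(x_i) \;+\; \sum_{1 \le i < j \le L} w_{ij}(x_i, x_j) \right)
```

Pseudo-likelihood maximization (`ccmpred/objfun/pll`) avoids the intractable partition function Z(v,w) by maximizing per-column conditional likelihoods, while (persistent) contrastive divergence (`ccmpred/objfun/cd`) approximates the likelihood gradient with Gibbs-sampled sequences.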
17 | 18 | ## Dependencies 19 | 20 | - CCMgen/CCMpredPy was developed and tested with Python 3.6 21 | - Crucial parts of the calculations are sped up by C libraries, which need to be compiled with a C compiler. 22 | Note: When installing on macOS, make sure to use an appropriate gcc compiler and not clang, e.g. by setting `export CC=/usr/local/Cellar/gcc/X.X.X/bin/gcc-X` if gcc was installed via brew. 23 | 24 | The following Python packages are required: 25 | 26 | * NumPy 27 | * SciPy 28 | * BioPython 29 | * MsgPack 30 | * six 31 | * plotly 32 | * colorlover 33 | 34 | ## Download 35 | 36 | ### Release Versions 37 | Please check out the [GitHub releases page for CCMgen](https://github.com/soedinglab/CCMgen/releases/tag/v1.0.0-alpha) to download a stable CCMgen/CCMpredPy release. After you're done downloading and extracting, please follow the installation instructions below. 38 | 39 | ### Development Versions from Git 40 | 41 | To clone the latest development version of CCMgen/CCMpredPy, please use the following command line: 42 | 43 | ```bash 44 | git clone https://github.com/soedinglab/ccmgen.git 45 | ``` 46 | 47 | ## Installation 48 | 49 | ### From cloned/downloaded repository 50 | 51 | CCMgen/CCMpredPy can be installed from the main directory into your local Python environment via `pip`: 52 | 53 | ```bash 54 | pip install . 55 | ``` 56 | 57 | ### Directly from the GitHub Repository 58 | 59 | Alternatively, you can install the latest development version of CCMgen/CCMpredPy with `pip` directly from this repository: 60 | 61 | ```bash 62 | pip install git+https://github.com/soedinglab/ccmgen@master 63 | ``` 64 | and keep it updated with: 65 | 66 | ```bash 67 | pip install git+https://github.com/soedinglab/ccmgen@master --upgrade 68 | ``` 69 | ## Uninstall 70 | 71 | The CCMgen/CCMpredPy toolkit can be uninstalled with: 72 | 73 | ```bash 74 | pip uninstall ccmgen 75 | ``` 76 | 77 | 78 | 79 | ## Next Steps 80 | Now you're ready to use CCMgen and CCMpredPy! You can have a look at the [getting started guide](https://github.com/soedinglab/CCMgen/wiki/Getting-Started-with-CCMgen-and-CCMpredPy) to learn how to use both tools. 81 | -------------------------------------------------------------------------------- /azure-pipelines.yml: -------------------------------------------------------------------------------- 1 | trigger: 2 | - master 3 | pr: 4 | - master 5 | 6 | jobs: 7 | - job: 8 | displayName: ubuntu-16.04 9 | pool: 10 | vmImage: 'ubuntu-16.04' 11 | strategy: 12 | matrix: 13 | Python36_bp_170: 14 | python.version: '3.6' 15 | biopython.version: '1.70' 16 | Python36: 17 | python.version: '3.6' 18 | biopython.version: '*' 19 | Python39: 20 | python.version: '3.9' 21 | biopython.version: '*' 22 | 23 | steps: 24 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 25 | displayName: Add conda to PATH 26 | 27 | - bash: conda create --yes --quiet --name env 28 | displayName: Create Anaconda environment 29 | 30 | - bash: | 31 | source activate env 32 | conda install --yes --quiet --name env -c conda-forge python=$PYTHON_VERSION biopython=$BIOPYTHON_VERSION pip numpy c-compiler openmp 33 | pip install . 
34 | displayName: Install CCMgen 35 | - bash: | 36 | source activate env 37 | bash ci_support/run_tests.sh && test -f sequences.msa 38 | displayName: Run tests 39 | 40 | - job: 41 | displayName: macOS 10.14 42 | pool: 43 | vmImage: 'macOS-10.14' 44 | strategy: 45 | matrix: 46 | Python36_bp_170: 47 | python.version: '3.6' 48 | biopython.version: '1.70' 49 | Python36: 50 | python.version: '3.6' 51 | biopython.version: '*' 52 | Python39: 53 | python.version: '3.9' 54 | biopython.version: '*' 55 | 56 | steps: 57 | - bash: echo "##vso[task.prependpath]$CONDA/bin" 58 | displayName: Add conda to PATH 59 | 60 | - bash: conda create --yes --quiet --name env 61 | displayName: Create Anaconda environment 62 | 63 | - bash: | 64 | source activate env 65 | conda install --yes --quiet --name env -c conda-forge python=$PYTHON_VERSION biopython=$BIOPYTHON_VERSION pip numpy c-compiler openmp 66 | pip install . 67 | displayName: Install CCMgen 68 | - bash: | 69 | source activate env 70 | bash ci_support/run_tests.sh && test -f sequences.msa 71 | displayName: Run tests -------------------------------------------------------------------------------- /ccmpred/algorithm/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/algorithm/__init__.py -------------------------------------------------------------------------------- /ccmpred/algorithm/gradient_descent.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ccmpred.logo 3 | import ccmpred.monitor.progress as pr 4 | 5 | 6 | class gradientDescent(): 7 | """Optimize objective function using gradient descent""" 8 | 9 | def __init__(self, progress, neff, maxit=2000, alpha0=0, 10 | decay=True, decay_start=1e-1, decay_rate=5e-6, decay_type="sig", 11 | fix_v=True, epsilon=1e-8, convergence_prev=5, early_stopping=True, 12 | non_contact_indices=None): 13 | 14 | 15 | self.maxit = maxit 16 | self.alpha0 = alpha0 17 | 18 | #initial learning rate defined wrt to effective number of sequences 19 | if self.alpha0 == 0: 20 | self.alpha0 = 5e-2 / np.sqrt(neff) 21 | 22 | #decay settings 23 | self.decay=decay 24 | self.decay_start = decay_start 25 | self.decay_rate = np.float(decay_rate) 26 | self.decay_type = decay_type 27 | self.it_succesfull_stop_condition=-1 28 | 29 | #single potentials will not be optimized if fix_v=True 30 | self.fix_v=fix_v 31 | 32 | #convergence settings for optimization 33 | self.early_stopping = early_stopping 34 | self.epsilon = epsilon 35 | self.convergence_prev=convergence_prev 36 | 37 | #whether optimization is run with constraints (non-contacts are masked) 38 | self.non_contact_indices = non_contact_indices 39 | 40 | #optimization progress logger 41 | self.progress = progress 42 | 43 | 44 | 45 | def __repr__(self): 46 | rep_str="Gradient descent optimization (alpha0={0})\n".format( np.round(self.alpha0, decimals=8)) 47 | 48 | rep_str+="\tconvergence criteria: maxit={0} early_stopping={1} epsilon={2} prev={3}\n".format( 49 | self.maxit, self.early_stopping, self.epsilon, self.convergence_prev) 50 | 51 | if self.decay: 52 | rep_str+="\tdecay: decay_type={0} decay_rate={1} decay_start={2} \n".format( 53 | self.decay_type, np.round(self.decay_rate, decimals=8), self.decay_start 54 | ) 55 | else: 56 | rep_str+="no decay\n" 57 | 58 | return rep_str 59 | 60 | def minimize(self, objfun, x): 61 | 62 | subtitle = self.progress.title + 
self.__repr__().replace("\n", "<br>") 63 | subtitle += objfun.__repr__().replace("\n", "<br>
") 64 | self.progress.set_plot_title(subtitle) 65 | 66 | ret = { 67 | "code": 2, 68 | "message": "Reached maximum number of iterations", 69 | "num_iterations": self.maxit 70 | } 71 | 72 | fx = -1 73 | alpha = self.alpha0 74 | persistent=False 75 | for i in range(self.maxit): 76 | 77 | #in case CD has property persistent=True 78 | #turn on persistent CD when learning rate is small enough 79 | if objfun.persistent and alpha < self.alpha0/10: 80 | persistent=True 81 | 82 | fx, gx, greg = objfun.evaluate(x, persistent) 83 | g = gx + greg 84 | 85 | #decompose gradients and parameters 86 | x_single, x_pair = objfun.linear_to_structured(x) 87 | g_single, g_pair = objfun.linear_to_structured(g) 88 | gx_single, gx_pair = objfun.linear_to_structured(gx) 89 | g_reg_single, g_reg_pair = objfun.linear_to_structured(greg) 90 | 91 | #masking: set coupling gradients for all pairs (i,j) with d_ij > contact_thr = 0 92 | if self.non_contact_indices is not None: 93 | g_pair[self.non_contact_indices[0], self.non_contact_indices[1], :, :] = 0 94 | 95 | 96 | #compute norm of coupling parameters 97 | xnorm_pair = np.sqrt(np.sum(x_pair * x_pair)/2) 98 | 99 | if i > self.convergence_prev: 100 | xnorm_prev = self.progress.optimization_log['||w||'][-self.convergence_prev] 101 | xnorm_diff = np.abs((xnorm_prev - xnorm_pair)) / xnorm_prev 102 | else: 103 | xnorm_diff = 1.0 104 | 105 | #start decay at iteration i 106 | if self.decay and xnorm_diff < self.decay_start and self.it_succesfull_stop_condition < 0: 107 | self.it_succesfull_stop_condition = i 108 | 109 | #new step size 110 | if self.it_succesfull_stop_condition > 0: 111 | t = i - self.it_succesfull_stop_condition + 1 112 | if self.decay_type == "lin": 113 | alpha = self.alpha0 / (1 + self.decay_rate * t) 114 | if self.decay_type == "sig": 115 | alpha *= 1.0 / (1 + self.decay_rate * t) 116 | if self.decay_type == "sqrt": 117 | alpha = self.alpha0 / np.sqrt(1 + self.decay_rate * t) 118 | if self.decay_type == "exp": 119 | alpha = self.alpha0 * np.exp(- self.decay_rate * t) 120 | 121 | 122 | #print out progress 123 | log_metrics={} 124 | log_metrics['||w||'] = xnorm_pair 125 | log_metrics['||g||'] = np.sqrt(np.sum(g_pair * g_pair)/2) 126 | log_metrics['||g_w||'] = np.sqrt(np.sum(gx_pair * gx_pair)/2) 127 | log_metrics['||greg_w||'] = np.sqrt(np.sum(g_reg_pair * g_reg_pair)/2) 128 | log_metrics['xnorm_diff'] = xnorm_diff 129 | log_metrics['max_g'] = np.max(np.abs(gx)) 130 | log_metrics['alpha'] = alpha 131 | log_metrics['PCD'] = persistent 132 | 133 | if not self.fix_v: 134 | log_metrics['||v||'] = np.sqrt(np.sum(x_single * x_single)) 135 | log_metrics['||v+w||'] = np.sqrt(np.sum(x * x)) 136 | log_metrics['||g_v||'] = np.sqrt(np.sum(gx_single * gx_single)) 137 | log_metrics['||g||'] = np.sqrt(np.sum(gx * gx)) 138 | log_metrics['||g_reg_v||'] = np.sqrt(np.sum(g_reg_single * g_reg_single)) 139 | 140 | self.progress.log_progress(i + 1, **log_metrics) 141 | 142 | 143 | #stop condition 144 | if self.early_stopping: 145 | if xnorm_diff < self.epsilon: 146 | 147 | ret = { 148 | "code": 0, 149 | "message": "Stopping condition (xnorm diff < {0}) successfull.".format(self.epsilon), 150 | "num_iterations": i 151 | } 152 | return fx, x, ret 153 | 154 | # update parameters 155 | if not self.fix_v: 156 | x_single -= alpha * g_single 157 | x_pair -= alpha * g_pair 158 | 159 | x = objfun.structured_to_linear(x_single, x_pair) 160 | 161 | return fx, x, ret 162 | 163 | def get_parameters(self): 164 | parameters={} 165 | 166 | parameters['convergence'] = {} 167 | 
parameters['convergence']['maxit'] = self.maxit 168 | parameters['convergence']['early_stopping'] = self.early_stopping 169 | parameters['convergence']['epsilon'] = self.epsilon 170 | parameters['convergence']['convergence_prev'] = self.convergence_prev 171 | 172 | parameters['decay'] = {} 173 | parameters['decay']['alpha0'] = self.alpha0 174 | parameters['decay']['decay'] = self.decay 175 | parameters['decay']['decay_start'] = self.decay_start 176 | parameters['decay']['decay_rate'] = self.decay_rate 177 | parameters['decay']['decay_type'] = self.decay_type 178 | 179 | parameters['fix_v'] = self.fix_v 180 | 181 | return parameters 182 | -------------------------------------------------------------------------------- /ccmpred/algorithm/lbfgs.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ccmpred.monitor.progress as pr 3 | from scipy.optimize import minimize as min 4 | 5 | class LBFGS(object): 6 | """Optimize objective function usign lbfgs""" 7 | 8 | def __init__(self, progress, maxit=100, ftol=1e-4, max_linesearch=20, maxcor=5, non_contact_indices=None): 9 | 10 | self.max_linesearch=max_linesearch 11 | self.ftol = ftol 12 | self.maxit = maxit 13 | self.maxcor = maxcor 14 | 15 | # whether optimization is run with constraints (non-contacts are masked) 16 | self.non_contact_indices = non_contact_indices 17 | 18 | # optimization progress logger 19 | self.progress = progress 20 | 21 | self.g_x = None 22 | self.objfun=None 23 | self.iteration=0 24 | 25 | 26 | def __repr__(self): 27 | 28 | repr_str = "LBFGS optimization (ftol={0}, maxcor={1}, max_ls={2})\n".format( 29 | self.ftol,self.maxcor,self.max_linesearch) 30 | repr_str += "\tconvergence criteria: maxit={0} \n".format(self.maxit) 31 | 32 | return repr_str 33 | 34 | def lbfgs_f(self, x): 35 | 36 | fx, g_x, g_reg = self.objfun.evaluate(x) 37 | 38 | #gradient is computed x 2 in pll.evaluate because of compatibility with conjugate gradient optimization!! 39 | g_x_single, g_x_pair = self.objfun.linear_to_structured(g_x) 40 | g_reg_single, g_reg_pair = self.objfun.linear_to_structured(g_reg) 41 | g = self.objfun.structured_to_linear(g_x_single+g_reg_single, (g_x_pair+g_reg_pair)/2) 42 | 43 | # masking: set coupling gradients for all pairs (i,j) with d_ij > contact_thr = 0 44 | if self.non_contact_indices is not None: 45 | g_single, g_pair = self.objfun.linear_to_structured(g) 46 | g_pair[self.non_contact_indices[0], self.non_contact_indices[1], :, :] = 0 47 | g = self.objfun.structured_to_linear(g_single, g_pair) 48 | 49 | return fx, g 50 | 51 | def print_and_plot(self, x): 52 | 53 | self.iteration += 1 54 | 55 | x_single, x_pair = self.objfun.finalize(x) 56 | 57 | log_metrics={} 58 | log_metrics['||v+w||'] = np.sqrt(np.sum(x_single * x_single) + np.sum(x_pair * x_pair)/2) 59 | log_metrics['||v||'] = np.sqrt(np.sum(x_single * x_single)) 60 | log_metrics['||w||'] = np.sqrt(np.sum(x_pair * x_pair)/2) 61 | self.progress.log_progress(self.iteration, **log_metrics) 62 | 63 | def minimize(self, objfun, x): 64 | 65 | self.objfun = objfun 66 | 67 | subtitle = self.progress.title + self.__repr__().replace("\n", "
") 68 | subtitle += objfun.__repr__().replace("\n", "
") 69 | self.progress.set_plot_title(subtitle) 70 | 71 | res = min(self.lbfgs_f, 72 | x, 73 | method='L-BFGS-B', 74 | jac=True, 75 | options={ 76 | 'maxls': self.max_linesearch, 77 | 'gtol': 1e-05, 78 | 'eps': 1e-08, 79 | 'maxiter': self.maxit, 80 | 'ftol': self.ftol, 81 | 'maxfun': 15000, 82 | 'maxcor': self.maxcor, 83 | 'disp': False 84 | }, 85 | callback=self.print_and_plot 86 | ) 87 | 88 | 89 | ret = { 90 | "code": res.status, 91 | "message": res.message.decode("utf-8"), 92 | "num_iterations": res.nit 93 | } 94 | 95 | return res.fun, res.x, ret 96 | 97 | def get_gradient_x(self): 98 | 99 | return(self.g_x) 100 | 101 | def get_parameters(self): 102 | parameters={} 103 | 104 | parameters['convergence']={} 105 | parameters['convergence']['maxit'] = self.maxit 106 | parameters['convergence']['max_linesearch'] = self.max_linesearch 107 | parameters['convergence']['ftol'] = self.ftol 108 | 109 | 110 | return parameters -------------------------------------------------------------------------------- /ccmpred/centering.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def center_v(freqs): 4 | single_freqs, _ = freqs 5 | 6 | #single_freqs either normalized with or without gaps --> same result due to subtraction of mean 7 | 8 | 9 | #hack when usign no pseudo counts to be able to take log of zero counts 10 | eps = 1e-10 11 | single_freqs[single_freqs 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | 8 | #define N_ALPHA 21 9 | 10 | void msa_count_single(double *counts, uint8_t *msa, double *weights, uint32_t nrow, uint32_t ncol) { 11 | int n, i; 12 | unsigned char a; 13 | 14 | memset(counts, 0, sizeof(double) * ncol * N_ALPHA); 15 | 16 | for(n = 0; n < nrow; n++) { 17 | for(i = 0; i < ncol; i++) { 18 | a = msa[n * ncol + i]; 19 | counts[i * N_ALPHA + a] += weights[n]; 20 | } 21 | } 22 | } 23 | 24 | 25 | void msa_count_pairs(double *counts, uint8_t *msa, double *weights, uint32_t nrow, uint32_t ncol) { 26 | memset(counts, 0, sizeof(double) * ncol * ncol * N_ALPHA * N_ALPHA); 27 | 28 | #pragma omp parallel 29 | #pragma omp for nowait 30 | for(int ij = 0; ij < ncol * ncol; ij++) { 31 | int i = ij / ncol; 32 | int j = ij % ncol; 33 | for(int n = 0; n < nrow; n++) { 34 | 35 | unsigned char a = msa[n * ncol + i]; 36 | unsigned char b = msa[n * ncol + j]; 37 | counts[((i * ncol + j) * N_ALPHA + a) * N_ALPHA + b] += weights[n]; 38 | } 39 | } 40 | } 41 | 42 | void msa_char_to_index(uint8_t *msa, uint32_t nrow, uint32_t ncol) { 43 | 44 | int amino_indices[29]; 45 | int n, i; 46 | unsigned char c; 47 | 48 | // Make hash lookup table for amino acid characters to amino acid numbers 49 | // hash keys are the ASCII codes of the upper-case amino acids, modulo 29. 50 | // hash values are the amino acid numbers. 
51 | // 52 | // aa A R N D C Q E G H I L K M F P S T W Y V - 53 | // asc 65 82 78 68 67 81 69 71 72 73 76 75 77 70 80 83 84 87 89 86 45 54 | // mod 7 24 20 10 9 23 11 13 14 15 18 17 19 12 22 25 26 0 2 28 16 55 | for(c = 0; c < 29; c++) { 56 | amino_indices[c] = 20; 57 | } 58 | 59 | amino_indices[ 7] = 0; // A 60 | amino_indices[24] = 1; // R 61 | amino_indices[20] = 2; // N 62 | amino_indices[10] = 3; // D 63 | amino_indices[ 9] = 4; // C 64 | amino_indices[23] = 5; // Q 65 | amino_indices[11] = 6; // E 66 | amino_indices[13] = 7; // G 67 | amino_indices[14] = 8; // H 68 | amino_indices[15] = 9; // I 69 | amino_indices[18] = 10; // L 70 | amino_indices[17] = 11; // K 71 | amino_indices[19] = 12; // M 72 | amino_indices[12] = 13; // F 73 | amino_indices[22] = 14; // P 74 | amino_indices[25] = 15; // S 75 | amino_indices[26] = 16; // T 76 | amino_indices[ 0] = 17; // W 77 | amino_indices[ 2] = 18; // Y 78 | amino_indices[28] = 19; // V 79 | amino_indices[16] = 20; // - 80 | 81 | for(n = 0; n < nrow; n++) { 82 | for(i = 0; i < ncol; i++) { 83 | msa[n * ncol + i] = amino_indices[ toupper(msa[n * ncol + i]) % 29 ]; 84 | } 85 | } 86 | 87 | } 88 | 89 | 90 | void msa_index_to_char(uint8_t *msa, uint32_t nrow, uint32_t ncol) { 91 | uint8_t char_indices[] = {'A', 'R', 'N', 'D', 'C', 'Q', 'E', 'G', 'H', 'I', 'L', 'K', 'M', 'F', 'P', 'S', 'T', 'W', 'Y', 'V', '-' }; 92 | int n, i; 93 | 94 | for(n = 0; n < nrow; n++) { 95 | for(i = 0; i < ncol; i++) { 96 | msa[n * ncol + i] = char_indices[msa[n * ncol + i]]; 97 | } 98 | } 99 | } 100 | -------------------------------------------------------------------------------- /ccmpred/gaps/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ccmpred.counts 3 | 4 | from ccmpred.gaps.cext import remove_gaps_probs, remove_gaps_consensus 5 | 6 | 7 | def remove_gaps_col_freqs(msa): 8 | counts = ccmpred.counts.single_counts(msa) 9 | counts[:, 20] = 0 10 | 11 | counts /= np.sum(counts, axis=1)[:, np.newaxis] 12 | 13 | return remove_gaps_probs(msa, counts) 14 | 15 | 16 | def backinsert_gapped_positions_aln(msa, gapped_positions): 17 | 18 | for position in gapped_positions: 19 | msa = np.insert(msa, position, [20], axis=1) 20 | 21 | return msa 22 | 23 | def backinsert_gapped_positions_mat(mat, gapped_positions): 24 | 25 | for position in gapped_positions: 26 | mat = np.insert(mat, position, [0], axis=0) 27 | mat = np.insert(mat, position, [0], axis=1) 28 | 29 | return mat 30 | 31 | def backinsert_gapped_positions(x_single, x_pair, gapped_positions): 32 | 33 | for position in gapped_positions: 34 | x_single = np.insert(x_single,position, [0], axis=0) 35 | x_pair = np.insert(x_pair,position, [0], axis=0) 36 | x_pair = np.insert(x_pair,position, [0], axis=1) 37 | 38 | return x_single, x_pair 39 | 40 | 41 | def remove_gapped_sequences(msa, max_gap_seq): 42 | 43 | if max_gap_seq >= 100: 44 | return msa 45 | 46 | msa_gap_count_per_sequence = (msa == 20).sum(1) 47 | 48 | #how many positions per sequence are allowed to contain gaps? 
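# worked example: max_gap_seq=50 on an alignment with 200 columns keeps only sequences with fewer than 0.5 * 200 = 100 gap positions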
49 | max_gap_percentage_per_sequence = ((max_gap_seq / 100.0) * msa.shape[1]) 50 | 51 | high_coverage = np.where(msa_gap_count_per_sequence < max_gap_percentage_per_sequence) 52 | 53 | print("Removed {0} sequences with > {1} percent gaps.".format( 54 | msa.shape[0] - len(high_coverage[0]), max_gap_seq)) 55 | 56 | return np.ascontiguousarray(msa[high_coverage[0], :]) 57 | 58 | def remove_gapped_positions(msa, max_gap_percentage): 59 | 60 | if max_gap_percentage >= 100: 61 | return msa, [] 62 | 63 | msa_gap_counts = (msa == 20).sum(0) 64 | 65 | max_gap_count = ((max_gap_percentage/100.0) * msa.shape[0]) 66 | 67 | ungapped_positions = np.where(msa_gap_counts < max_gap_count) 68 | gapped_positions = np.where(msa_gap_counts >= max_gap_count) 69 | 70 | 71 | print("Removed {0} alignment positions with > {1} percent gaps.".format( 72 | len(gapped_positions[0]), max_gap_percentage)) 73 | 74 | return np.ascontiguousarray(msa[:, ungapped_positions[0]]), gapped_positions[0] -------------------------------------------------------------------------------- /ccmpred/gaps/cext/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ctypeslib as npct 3 | import ctypes 4 | import os.path 5 | 6 | import ccmpred.counts 7 | 8 | array_2d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=2, flags='CONTIGUOUS') 9 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS') 10 | array_1d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=1, flags='CONTIGUOUS') 11 | 12 | libgaps = npct.load_library('libgaps', os.path.join(os.path.dirname(__file__), '_build')) 13 | 14 | libgaps.remove_gaps_probs.restype = None 15 | libgaps.remove_gaps_probs.argtypes = [ 16 | array_2d_float, # *x 17 | array_2d_char, # *msa 18 | ctypes.c_uint32, # nrow 19 | ctypes.c_uint32, # ncol 20 | ] 21 | 22 | 23 | libgaps.remove_gaps_consensus.restype = None 24 | libgaps.remove_gaps_consensus.argtypes = [ 25 | array_2d_char, # *msa 26 | array_1d_char, # *consensus 27 | ctypes.c_uint32, # nrow 28 | ctypes.c_uint32, # ncol 29 | ] 30 | 31 | 32 | def compute_consensus(msa, ignore_gaps=True): 33 | counts = ccmpred.counts.single_counts(msa) 34 | if ignore_gaps: 35 | counts = counts[:, :20] 36 | 37 | return np.argmax(counts, axis=1).astype('uint8') 38 | 39 | 40 | def remove_gaps_probs(msa, probs): 41 | assert(probs.shape[0] == msa.shape[1]) 42 | libgaps.remove_gaps_probs(np.ascontiguousarray(probs), msa, *msa.shape) 43 | return msa 44 | 45 | 46 | def remove_gaps_consensus(msa, consensus=None): 47 | if consensus is None: 48 | consensus = compute_consensus(msa) 49 | 50 | assert(consensus.shape[0] == msa.shape[1]) 51 | libgaps.remove_gaps_consensus(msa, consensus, *msa.shape) 52 | 53 | return msa 54 | -------------------------------------------------------------------------------- /ccmpred/gaps/cext/gaps.c: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> 2 | 3 | #include "gaps.h" 4 | 5 | int pick_random_weighted(flt *probs, int n) { 6 | int a; 7 | double p = (double)rand() / (double)RAND_MAX; 8 | for (a = 0; a < n; a++) { 9 | flt p_curr = probs[a]; 10 | if (p < p_curr) { 11 | return a; 12 | } 13 | p -= p_curr; 14 | } 15 | return n - 1; 16 | } 17 | 18 | 19 | /** 20 | * substitute gaps in the sequence according to probability 21 | * 22 | * @param[in] p The MSA probabilities 23 | * @param[inout] msa The MSA to clean 24 | * @param[in] nrow The number of rows 25 | * @param[in] ncol The number of 
columns 26 | */ 27 | void remove_gaps_probs( 28 | const flt *const p, 29 | unsigned char *const msa, 30 | int nrow, 31 | int ncol 32 | ) { 33 | int i, j; 34 | for(i = 0; i < nrow; i++) { 35 | for (j = 0; j < ncol; j++) { 36 | if (msa[i * ncol + j] != GAP) continue; 37 | 38 | msa[i * ncol + j] = pick_random_weighted((flt *)&p[j * N_ALPHA], N_ALPHA); 39 | } 40 | } 41 | } 42 | 43 | /** 44 | * remove gaps according to consensus sequence 45 | * 46 | * @param[inout] msa the MSA to clean (nrow x ncol) 47 | * @param[in] The consensus sequence to use as a replacement (ncol) 48 | * @param[in] nrow The number of rows 49 | * @param[in] ncol The number of columns 50 | */ 51 | void remove_gaps_consensus( 52 | unsigned char *const msa, 53 | unsigned char *const consensus, 54 | int nrow, 55 | int ncol 56 | ) { 57 | int i, j; 58 | for(i = 0; i < nrow; i++) { 59 | for(j = 0; j < ncol; j++) { 60 | if(msa[i * ncol + j] != GAP) continue; 61 | msa[i * ncol + j] = consensus[j]; 62 | } 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /ccmpred/gaps/cext/gaps.h: -------------------------------------------------------------------------------- 1 | #ifndef GAP_H 2 | #define GAP_H 3 | 4 | #define GAP 20 5 | #define N_ALPHA 21 6 | typedef double flt; 7 | 8 | void remove_gaps_probs( 9 | const flt *const p, 10 | unsigned char *const msa, 11 | int nrow, 12 | int ncol 13 | ); 14 | 15 | void remove_gaps_consensus( 16 | unsigned char *const msa, 17 | unsigned char *const consensus, 18 | int nrow, 19 | int ncol 20 | ); 21 | 22 | #endif 23 | -------------------------------------------------------------------------------- /ccmpred/io/__init__.py: -------------------------------------------------------------------------------- 1 | from ccmpred.io.alignment import read_msa, read_msa_biopython, read_msa_psicov, write_msa_psicov, AMINO_ACIDS 2 | from ccmpred.io.contactmatrix import write_matrix 3 | from ccmpred.io.pdb import distance_map -------------------------------------------------------------------------------- /ccmpred/io/alignment.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ccmpred.counts 3 | import Bio.AlignIO as aio 4 | 5 | AMINO_ACIDS = "ARNDCQEGHILKMFPSTWYV-" 6 | 7 | def read_msa(f, format, return_indices=True, return_identifiers=False): 8 | if format == 'psicov': 9 | return read_msa_psicov(f, return_indices, return_identifiers) 10 | else: 11 | return read_msa_biopython(f, format, return_indices, return_identifiers) 12 | 13 | def read_msa_biopython(f, format, return_indices=True, return_identifiers=False): 14 | 15 | records = list(aio.read(f, format)) 16 | 17 | msa = [str(r.seq) for r in records] 18 | msa = np.array([[ord(c) for c in x.strip()] for x in msa], dtype=np.uint8) 19 | 20 | if return_indices: 21 | ccmpred.counts.index_msa(msa, in_place=True) 22 | 23 | if return_identifiers: 24 | identifiers = [r.name for r in records] 25 | return msa, identifiers 26 | else: 27 | return msa 28 | 29 | def read_msa_psicov(f, return_indices=True, return_identifiers=False): 30 | 31 | if isinstance(f, str): 32 | with open(f, 'r') as o: 33 | msa = o.readlines() 34 | else: 35 | msa = f 36 | 37 | for i, line in enumerate(msa): 38 | if ">" in line: 39 | raise Exception("Line number {0} contains a '>' - please set the correct alignment format!:\n{1}".format(i + 1, line)) 40 | 41 | msa = np.array([[ord(c) for c in x.strip()] for x in msa], dtype=np.uint8) 42 | 43 | if return_indices: 44 | 
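# convert the raw ASCII codes in place to indices 0-19 (order ARNDCQEGHILKMFPSTWYV, see AMINO_ACIDS) and 20 for gaps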
ccmpred.counts.index_msa(msa, in_place=True) 45 | 46 | if return_identifiers: 47 | identifiers = ["seq{0}".format(i) for i in range(msa.shape[0])] 48 | return msa, identifiers 49 | else: 50 | return msa 51 | 52 | 53 | def write_msa(f, msa, ids, format, is_indices=True, descriptions=None): 54 | 55 | if format == 'psicov': 56 | write_msa_psicov(f, msa, is_indices=is_indices) 57 | else: 58 | write_msa_biopython(f, msa, ids, format, is_indices=is_indices, descriptions=descriptions) 59 | 60 | def write_msa_psicov(f, msa, is_indices=True): 61 | 62 | if is_indices: 63 | msa = ccmpred.counts.char_msa(msa) 64 | 65 | f.write("\n".join(["".join(chr(cell) for cell in row) for row in msa])) 66 | 67 | def write_msa_biopython(f, msa, ids, format, is_indices=True, descriptions=None): 68 | import Bio.SeqIO 69 | from Bio.SeqRecord import SeqRecord 70 | from Bio.Seq import Seq 71 | 72 | if is_indices: 73 | msa = ccmpred.counts.char_msa(msa) 74 | 75 | if descriptions is None: 76 | descriptions = ["" for _ in range(msa.shape[0])] 77 | 78 | msa = ["".join(chr(c) for c in row) for row in msa] 79 | 80 | records = [ 81 | SeqRecord(Seq(seq), id=id, description=desc, 82 | annotations={"molecule_type": "protein"}) 83 | for seq, id, desc in zip(msa, ids, descriptions) 84 | ] 85 | 86 | Bio.SeqIO.write(records, f, format) 87 | -------------------------------------------------------------------------------- /ccmpred/io/contactmatrix.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import json 3 | import gzip 4 | import os 5 | import sys 6 | 7 | def frobenius_score(x): 8 | """ 9 | Compute frobenius norm of couplig matrix 10 | 11 | :param x: pair potentials of dimension [ L x L x 20 x 20 ] 12 | :param squared: 13 | :return: 14 | """ 15 | 16 | return np.sqrt(np.sum(x * x, axis=(2, 3))) 17 | 18 | def apc(cmat): 19 | """ 20 | Compute average product correction (APC) according to Dunn et al 2004 21 | 22 | :param cmat: contact matrix 23 | :return: corrected contact matrix 24 | """ 25 | print("Apply Average Product Correction (APC)") 26 | 27 | mean = np.mean(cmat, axis=0) 28 | apc_term = mean[:, np.newaxis] * mean[np.newaxis, :] / np.mean(cmat) 29 | 30 | return cmat - apc_term 31 | 32 | def compute_scaling_factor(x_pair, uij, nr_states, squared=True): 33 | """ 34 | Set the strength of the entropy correction by optimization eta with least squares 35 | 36 | Minimize sum_i,j sum_a,b (w_ijab^2 - eta * u_ia * u_jb)^2 37 | 38 | :param x_pair: raw coupling scores 39 | :param uij: 40 | :param nr_states: normalize entropy wrt 20 or 21 characters 41 | :param squared: 42 | :return: 43 | """ 44 | 45 | squared_sum_couplings = np.sum(x_pair[:,:,:20,:20] * x_pair[:,:,:20,:20], axis=(3,2)) 46 | 47 | if squared: 48 | 49 | squared_sum_entropy = np.sum(uij[:,:,:nr_states,:nr_states], axis=(3,2)) 50 | scaling_factor = np.sum(squared_sum_couplings * squared_sum_entropy) 51 | 52 | denominator = np.sum(uij * uij) 53 | scaling_factor /= denominator 54 | 55 | else: 56 | 57 | #According to Stefan's CCMgen paper 58 | #both are LxL matrices 59 | c_ij = np.sqrt(squared_sum_couplings) 60 | e_ij = np.sqrt(np.sum(uij[:,:,:nr_states,:nr_states], axis=(3,2))) 61 | 62 | scaling_factor = np.sum(c_ij * e_ij) 63 | denominator = np.sum(uij[:,:,:nr_states,:nr_states]) 64 | scaling_factor /= denominator 65 | 66 | return scaling_factor 67 | 68 | def compute_local_correction( 69 | single_freq, x_pair, Neff, lambda_w, squared=True, 70 | entropy=False, nr_states=20, log=np.log2): 71 | 72 | print("Apply entropy 
correction (using {0} states and {1})".format(nr_states, log.__name__)) 73 | 74 | 75 | if entropy: 76 | N_factor = 1 77 | ui = N_factor * single_freq[:, :nr_states] * log(single_freq[:, :nr_states]) 78 | else: 79 | #correct for fractional counts 80 | N_factor = np.sqrt(Neff) * (1.0 / lambda_w) 81 | ui = N_factor * single_freq[:, :nr_states] * (1 - single_freq[:, :nr_states]) 82 | uij = np.transpose(np.multiply.outer(ui, ui), (0,2,1,3)) 83 | 84 | ### compute optimal scaling factor 85 | scaling_factor = compute_scaling_factor(x_pair, uij, nr_states, squared=squared) 86 | 87 | if not squared: 88 | mat = frobenius_score(x_pair) 89 | correction = scaling_factor * np.sqrt(np.sum(uij, axis=(3, 2))) 90 | else: 91 | mat = np.sum(x_pair * x_pair, axis=(2, 3)) 92 | correction = scaling_factor * np.sum(uij, axis=(3, 2)) 93 | 94 | return scaling_factor, mat - correction 95 | 96 | 97 | def write_matrix(matfile, mat, meta): 98 | 99 | if matfile.endswith(".gz"): 100 | with gzip.open(matfile, 'wb') as f: 101 | np.savetxt(f, mat) 102 | f.write(("#>META> " + json.dumps(meta) + "\n").encode("utf-8")) 103 | f.close() 104 | else: 105 | np.savetxt(matfile, mat) 106 | with open(matfile,'a') as f: 107 | f.write("#>META> " + json.dumps(meta) + "\n") 108 | f.close() 109 | 110 | def read_matrix(matfile): 111 | """ 112 | Read matrix file 113 | :param matfile: path to matrix file 114 | :return: matrix 115 | """ 116 | 117 | if not os.path.exists(matfile): 118 | raise IOError("Matrix File " + str(matfile) + " cannot be found.") 119 | 120 | 121 | ### Read contact map (matfile can also be compressed file) 122 | mat = np.genfromtxt(matfile, comments="#") 123 | 124 | ### Read meta data from mat file 125 | meta = {} 126 | with (gzip.open(matfile, 'rt') if matfile.endswith(".gz") else open(matfile)) as f: 127 | for line in f: 128 | if '#>META>' in line: 129 | meta = json.loads(line.split("> ")[1]) 130 | 131 | if len(meta) == 0: 132 | print(str(matfile) + " does not contain META info. 
(Line must start with #META!)") 133 | 134 | return mat, meta 135 | 136 | def find_dict_key(key, dictionary): 137 | for k, v in dictionary.items(): 138 | if k == key: 139 | return v 140 | if isinstance(v, dict): 141 | res = find_dict_key(key, v) 142 | if res is not None: 143 | return res 144 | if isinstance(v, list): 145 | for d in v: 146 | if isinstance(d, list) or isinstance(d, dict): 147 | res = find_dict_key(key, d) 148 | if res is not None: 149 | return res 150 | 151 | 152 | return None 153 | -------------------------------------------------------------------------------- /ccmpred/io/pdb.py: -------------------------------------------------------------------------------- 1 | from Bio.PDB import PDBParser 2 | import numpy as np 3 | 4 | def read_pdb(pdbfile): 5 | ''' 6 | Read a PDB file as structure file with BIO.PDB 7 | 8 | :param pdbfile: path to pdb file 9 | :return: structure 10 | ''' 11 | 12 | parser = PDBParser() 13 | structure = parser.get_structure('pdb', pdbfile) 14 | 15 | return structure 16 | 17 | def calc_residue_dist(residue_one, residue_two, distance_definition="Cb"): 18 | ''' 19 | Calculate euclidian distance between C-beta (C-alpha in case of Glycine/missing C-beta) 20 | atoms of oth residues 21 | :param residue_one: BIO.PDB residue object 1 22 | :param residue_two: BIO.PDB residue object 2 23 | :return: float euclidian distance between residues 24 | ''' 25 | 26 | if distance_definition == "Cb": 27 | 28 | if residue_one.has_id("CB"): 29 | residue_one_atom = residue_one["CB"] 30 | else: 31 | residue_one_atom = residue_one["CA"] 32 | 33 | if residue_two.has_id("CB"): 34 | residue_two_atom = residue_two["CB"] 35 | else: 36 | residue_two_atom = residue_two["CA"] 37 | 38 | diff_vector = residue_one_atom.coord - residue_two_atom.coord 39 | diff = np.sqrt(np.sum(diff_vector * diff_vector)) 40 | else: 41 | diff_list = [] 42 | for atom_1 in [atom for atom in residue_one if atom.name not in ['N', 'O', 'C']]: 43 | for atom_2 in [atom for atom in residue_two if atom.name not in ['N', 'O', 'C']]: 44 | diff_vector = atom_1.coord - atom_2.coord 45 | diff_list.append(np.sqrt(np.sum(diff_vector * diff_vector))) 46 | 47 | diff = np.min(diff_list) 48 | 49 | return diff 50 | 51 | def distance_map(pdb_file, L=None, distance_definition="Cb"): 52 | ''' 53 | Compute the distances between Cbeta (Calpha for Glycine) atoms of all residue pairs 54 | 55 | :param pdb_file: PDB file (first chain of first model will be used) 56 | :return: LxL numpy array with distances (L= protein length) 57 | ''' 58 | 59 | structure = read_pdb(pdb_file) 60 | structure.get_list() 61 | model = structure[0] 62 | chain = model.get_list()[0] 63 | 64 | # due to missing residues in the pdb file (or additionally solved??) 
65 | # protein length L can differ from len(chain.get_list()) 66 | if L is None: 67 | L = chain.get_list()[-1].id[1] 68 | 69 | distance_map = np.full((L, L), np.NaN) 70 | 71 | residues = chain.get_list() 72 | for i in range(np.min([L, len(chain.get_list())])): 73 | for j in range(np.min([L, len(chain.get_list())])): 74 | residue_one = residues[i] 75 | residue_two = residues[j] 76 | distance_map[residue_one.id[1] - 1, residue_two.id[1] - 1] = calc_residue_dist(residue_one, residue_two, distance_definition) 77 | 78 | return distance_map -------------------------------------------------------------------------------- /ccmpred/locmeth/__init__.py: -------------------------------------------------------------------------------- 1 | from ccmpred.locmeth.mi import compute_mi, compute_mi_pseudocounts 2 | from ccmpred.locmeth.omes import compute_omes, compute_omes_freq -------------------------------------------------------------------------------- /ccmpred/locmeth/mi/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.stats 3 | 4 | def compute_mi(counts, normalized=False): 5 | """ 6 | 7 | :param counts: single and pairwise amino acid counts 8 | :param remove_gaps: do not count gaps 9 | :param normalized: According to Martin et al 2005 10 | (Using information theory to search for co-evolving residues in proteins) 11 | MI is normalized by joint entropy 12 | :return: 13 | """ 14 | 15 | single_counts, pair_counts = counts 16 | 17 | 18 | L = pair_counts.shape[0] 19 | indices_i_less_j = np.triu_indices(L, k=1) #excluding diagonal 20 | 21 | #compute shannon and joint shannon entropy 22 | shannon_entropy = scipy.stats.entropy(single_counts.transpose(),base=2) 23 | 24 | joint_shannon_entropy = np.zeros((L, L)) 25 | pair_counts_flat = pair_counts.reshape(L, L, pair_counts.shape[2]*pair_counts.shape[3]) 26 | joint_shannon_entropy[indices_i_less_j] = scipy.stats.entropy(pair_counts_flat[indices_i_less_j].transpose(), base=2) 27 | 28 | #compute mutual information 29 | mi = np.zeros((L, L)) 30 | mi[indices_i_less_j] = [shannon_entropy[i] + shannon_entropy[j] - joint_shannon_entropy[i,j] for i,j in zip(*indices_i_less_j)] 31 | 32 | #According to Martin et al 2005 33 | if normalized: 34 | mi[indices_i_less_j] /= joint_shannon_entropy[indices_i_less_j] 35 | 36 | #symmetrize 37 | mi += mi.transpose() 38 | 39 | 40 | return mi 41 | 42 | def compute_mi_pseudocounts(freqs): 43 | 44 | single_freqs, pair_freqs = freqs 45 | 46 | L = pair_freqs.shape[0] 47 | indices_i_less_j = np.triu_indices(L, k=1) #excluding diagonal 48 | mi = np.zeros((L, L)) 49 | 50 | #works as it should 51 | # outer = single_freqs[indices_i_less_j[0]][10, :20, np.newaxis] * single_freqs[indices_i_less_j[1]][10, np.newaxis, :20] 52 | # print outer[4,7] 53 | # print outer[7,4] 54 | # print single_freqs[indices_i_less_j[0]][10,4] * single_freqs[indices_i_less_j[1]][10,7] 55 | # print single_freqs[indices_i_less_j[0]][10,7] * single_freqs[indices_i_less_j[1]][10,4] 56 | 57 | mi_raw = pair_freqs[indices_i_less_j][:, :20, :20] * np.log2(pair_freqs[indices_i_less_j][:, :20, :20] / (single_freqs[indices_i_less_j[0]][:, :20, np.newaxis] * single_freqs[indices_i_less_j[1]][:, np.newaxis, :20]) ) 58 | 59 | 60 | mi[indices_i_less_j] = mi_raw.sum(2).sum(1) 61 | 62 | #symmetrize 63 | mi += mi.transpose() 64 | 65 | return mi 66 | -------------------------------------------------------------------------------- /ccmpred/locmeth/omes/__init__.py: 
-------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def compute_omes(counts, fodoraldrich=False): 5 | """ 6 | 7 | Chi squared statistic: 8 | X^2 = sum_{i=1}^N [(O_i - E_i)^2 / E_i ] # comparing counts 9 | = N sum_{i=1}^N [(O_i/N - p_i)^2 / p_i ] # comparing frequencies 10 | 11 | O_i = number of observations of type i => pairwise amino acid counts 12 | E_i = Np_i = the expected (theoretical) occurence of type i, 13 | asserted by the null hypothesis that the fraction of type i in the population is p_{i} 14 | 15 | 16 | 17 | According to Kass & Horovitz, 2002: 18 | Mapping Pathways of Allosteric Communication in GroEL by Analysis of Correlated Mutations 19 | 20 | omes(i,j) = [ count_ij(a,b) - (count_i(a) * count_j(b))/N_ij ] ^2 21 | sum_(a,b=1)^20 ----------------------------------------------------- 22 | (count_i(a) * count_j(b))/N_ij 23 | 24 | 25 | According to Fodor & Aldrich, 2004: 26 | Influence of conservation on calculations of amino acid covariance in multiple sequence alignments. 27 | omes(i,j) = [ count_ij(a,b) - (count_i(a) * count_j(b))/N_ij ] ^2 28 | sum_(a,b=1)^20 ----------------------------------------------------- 29 | N_ij 30 | 31 | 32 | Here we implement Kass & Horovitz! (see line 43) 33 | 34 | :return: 35 | """ 36 | 37 | single_counts, pair_counts = counts 38 | Nij = pair_counts.sum(3).sum(2) #== Neff 39 | L = single_counts.shape[0] 40 | 41 | # gaps do not add 42 | # if gap_treatment: 43 | # Nij = pair_counts[:, :, :20, :20].sum(3).sum(2) 44 | 45 | # compute chi square statistic 46 | Nexp = np.outer(single_counts[:, :20], single_counts[:, :20]).reshape((L, L, 20, 20)) 47 | 48 | #works as it should 49 | # print Nexp[0, 11, 2, 4] 50 | # print single_counts[0, 2] * single_counts[11, 4] 51 | 52 | 53 | Nexp /= Nij[:, :, np.newaxis, np.newaxis] 54 | diff = (pair_counts[:, :, :20, :20] - Nexp) 55 | 56 | if fodoraldrich: 57 | omes = (diff * diff) / Nij[:, :, np.newaxis, np.newaxis] # Fodor & Aldrich: we divide by Nij(neff) 58 | else: 59 | omes = (diff * diff) / Nexp # Kass & Horovitz: we divide by Nexp 60 | 61 | omes = omes.sum(3).sum(2) 62 | 63 | 64 | return omes 65 | 66 | 67 | 68 | def compute_omes_freq(counts, freqs, fodoraldrich=False, ignore_zero_counts=True): 69 | 70 | 71 | single_freqs, pair_freqs = freqs 72 | single_counts, pair_counts = counts 73 | Nij = pair_counts.sum(3).sum(2) #== Neff 74 | L = single_freqs.shape[0] 75 | 76 | # gaps do not add 77 | # if gap_treatment: 78 | # Nij = pair_counts[:, :, :20, :20].sum(3).sum(2) 79 | 80 | # compute chi square statistic 81 | Nexp = single_freqs[:, np.newaxis, :20, np.newaxis] * single_freqs[np.newaxis, :, np.newaxis, :20] 82 | 83 | #works as it should 84 | # print Nexp[0, 11, 2, 4] 85 | # print single_counts[0, 2] * single_counts[11, 4] 86 | 87 | 88 | Nexp *= Nij[:, :, np.newaxis, np.newaxis] 89 | diff = (pair_counts[:, :, :20, :20] - Nexp) 90 | 91 | 92 | if fodoraldrich: 93 | omes_full = (diff * diff) / Nij[:, :, np.newaxis, np.newaxis] # Fodor & Aldrich: we divide by Nij(neff) 94 | else: 95 | omes_full = (diff * diff) / Nexp # Kass & Horovitz: we divide by Nexp 96 | 97 | 98 | 99 | #compute statistics only for non-zero pair counts 100 | if ignore_zero_counts: 101 | ind_nonzero_ab = np.nonzero(pair_counts[:, :, :20, :20]) 102 | omes = np.zeros((L, L, 20, 20)) 103 | omes[ind_nonzero_ab] = omes_full[ind_nonzero_ab] 104 | else: 105 | omes = omes_full 106 | 107 | omes = omes.sum(3).sum(2) 108 | 109 | return omes 110 | 
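The chi-square statistic implemented above, restated compactly for readability (N_ij(a,b) are the weighted pair counts, N_i(a) the single counts, N_ij the total pair count, and E_ij the null-model expectation; the Kass & Horovitz form is the default, while Fodor & Aldrich divide by N_ij instead of E_ij):

```latex
\mathrm{OMES}(i,j) \;=\; \sum_{a,b=1}^{20} \frac{\big(N_{ij}(a,b) - E_{ij}(a,b)\big)^2}{E_{ij}(a,b)},
\qquad E_{ij}(a,b) \;=\; \frac{N_i(a)\,N_j(b)}{N_{ij}}
```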
-------------------------------------------------------------------------------- /ccmpred/logo.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import ccmpred 3 | import sys 4 | 5 | is_tty = (sys.stdin.isatty()) and (sys.stdout.isatty()) 6 | 7 | LOGOS = {} 8 | LOGOS['ccmpred', True] = """ 9 | \x1b[32m┏━╸┏━╸┏┳┓\x1b[34m┏━┓┏━┓┏━╸╺┳┓\x1b[32m┏━┓╻ ╻\x1b[0m version {0} 10 | \x1b[32m┃ ┃ ┃┃┃\x1b[34m┣━┛┣┳┛┣╸ ┃┃\x1b[32m┣━┛┗┳┛\x1b[0m Vorberg, Seemayer and Soeding (2018) 11 | \x1b[32m┗━╸┗━╸╹ ╹\x1b[34m╹ ╹┗╸┗━╸╺┻┛\x1b[32m╹ ╹ \x1b[0m https://github.com/soedinglab/ccmgen 12 | """ 13 | 14 | LOGOS['ccmpred', False] = """ 15 | ┏━╸┏━╸┏┳┓┏━┓┏━┓┏━╸╺┳┓ ┏━┓╻ ╻ version {0} 16 | ┃ ┃ ┃┃┃┣━┛┣┳┛┣╸ ┃┃ ┣━┛┗┳┛ Vorberg, Seemayer and Soeding (2018) 17 | ┗━╸┗━╸╹ ╹╹ ╹┗╸┗━╸╺┻┛ ╹ ╹ https://github.com/soedinglab/ccmgen 18 | """ 19 | 20 | 21 | LOGOS['ccmgen', True] = """ 22 | \x1b[32m┏━╸┏━╸┏┳┓\x1b[34m┏━╸┏━╸┏┓╻\x1b[0m version {0} 23 | \x1b[32m┃ ┃ ┃┃┃\x1b[34m┃╺┓┣╸ ┃┗┫\x1b[0m Vorberg, Seemayer and Soeding (2018) 24 | \x1b[32m┗━╸┗━╸╹ ╹\x1b[34m┗━┛┗━╸╹ ╹\x1b[0m https://github.com/soedinglab/ccmgen 25 | """ 26 | 27 | LOGOS['ccmgen', False] = """ 28 | ┏━╸┏━╸┏┳┓┏━╸┏━╸┏┓╻ version {0} 29 | ┃ ┃ ┃┃┃┃╺┓┣╸ ┃┗┫ Vorberg, Seemayer and Soeding (2018) 30 | ┗━╸┗━╸╹ ╹┗━┛┗━╸╹ ╹ https://github.com/soedinglab/ccmgen 31 | """ 32 | 33 | 34 | 35 | def logo(what_for="ccmpred", color=is_tty): 36 | version = ccmpred.__version__ 37 | 38 | print(LOGOS[what_for, color].format(version)) 39 | -------------------------------------------------------------------------------- /ccmpred/monitor/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/monitor/__init__.py -------------------------------------------------------------------------------- /ccmpred/monitor/progress.py: -------------------------------------------------------------------------------- 1 | import ccmpred.logo 2 | import plotly.graph_objs as go 3 | import os 4 | import sys 5 | from plotly.offline import plot as plotly_plot 6 | 7 | 8 | class Progress(): 9 | """ 10 | Plot the progress as plotly graph 11 | """ 12 | 13 | def __init__(self): 14 | 15 | self.optimization_log={} 16 | self.plotfile=None 17 | self.title="" 18 | 19 | def print_header(self): 20 | 21 | headerline ="{0:>{1}s}".format('iter', 8) 22 | headerline += (" ".join("{0:>{1}s}".format(ht, 14) for ht in sorted(self.optimization_log.keys()))) 23 | 24 | if ccmpred.logo.is_tty: 25 | print("\x1b[2;37m{0}\x1b[0m".format(headerline)) 26 | else: 27 | print(headerline) 28 | 29 | def set_plot_title(self, title): 30 | self.title=title 31 | 32 | def set_plot_file(self, file): 33 | self.plotfile=file 34 | 35 | def init_log(self, **kwargs): 36 | for name in kwargs.keys(): 37 | self.optimization_log[name] = [] 38 | 39 | self.print_header() 40 | 41 | def log_progress(self, n_iter, **kwargs): 42 | 43 | if len(self.optimization_log) == 0: 44 | self.init_log(**kwargs) 45 | 46 | if (n_iter != 0) and (n_iter % 100 == 0): 47 | self.print_header() 48 | 49 | 50 | log = "{0:>{1}}".format(n_iter, '8g') 51 | for name, metric in sorted(kwargs.items()): 52 | self.optimization_log[name].append(metric) 53 | log += "{0:>{1}}".format(metric, '15g') 54 | print(log) 55 | 56 | # log = "{0:>{1}}".format(n_iter, '8g') 57 | # print(log + " ".join("{0:>{1}}".format(self.optimization_log[key][-1], '15g') for key in sorted(self.optimization_log.keys()))) 58 | 59 | if self.plotfile is 
not None: 60 | self.plot_progress() 61 | 62 | sys.stdout.flush() 63 | 64 | 65 | def plot_progress(self ): 66 | 67 | if self.plotfile is not None: 68 | 69 | protein = os.path.basename(self.plotfile).split(".")[0] 70 | title = "Optimization Log for {0} ".format(protein) 71 | title += self.title 72 | 73 | data = [] 74 | for name, metric in self.optimization_log.items(): 75 | data.append( 76 | go.Scatter( 77 | x=list(range(1, len(self.optimization_log[name]) + 1)), 78 | y=metric, 79 | mode='lines', 80 | visible="legendonly", 81 | name=name 82 | ) 83 | ) 84 | 85 | plot = { 86 | "data": data, 87 | "layout": go.Layout( 88 | title=title, 89 | xaxis1=dict( 90 | title="iteration", 91 | exponentformat="e", 92 | showexponent='all' 93 | ), 94 | yaxis1=dict( 95 | title="metric", 96 | exponentformat="e", 97 | showexponent='all' 98 | ), 99 | font=dict(size=18), 100 | titlefont=dict(size=14) 101 | ) 102 | } 103 | 104 | plotly_plot(plot, filename=self.plotfile, auto_open=False) 105 | -------------------------------------------------------------------------------- /ccmpred/objfun/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/objfun/__init__.py -------------------------------------------------------------------------------- /ccmpred/objfun/cd/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import ccmpred.raw 3 | import ccmpred.gaps 4 | import ccmpred.counts 5 | import ccmpred.objfun 6 | import ccmpred.objfun.cd.cext 7 | import ccmpred.parameter_handling 8 | from ccmpred.pseudocounts import PseudoCounts 9 | import ccmpred.sampling 10 | 11 | class ContrastiveDivergence(): 12 | 13 | def __init__(self, msa, weights, regularization, pseudocounts, x_single, x_pair, 14 | gibbs_steps=1, nr_seq_sample=500, persistent=False): 15 | 16 | 17 | self.msa = msa 18 | self.nrow, self.ncol = self.msa.shape 19 | self.weights = weights 20 | self.neff = np.sum(weights) 21 | self.regularization = regularization 22 | 23 | self.pseudocount_type = pseudocounts.pseudocount_type 24 | self.pseudocount_n_single = pseudocounts.pseudocount_n_single 25 | self.pseudocount_n_pair = pseudocounts.pseudocount_n_pair 26 | 27 | self.structured_to_linear = lambda x_single, x_pair: \ 28 | ccmpred.parameter_handling.structured_to_linear( 29 | x_single, x_pair, nogapstate=True, padding=False) 30 | self.linear_to_structured = lambda x: \ 31 | ccmpred.parameter_handling.linear_to_structured( 32 | x, self.ncol, nogapstate=True, add_gap_state=False, padding=False) 33 | 34 | 35 | self.x_single = x_single 36 | self.x_pair = x_pair 37 | self.x = self.structured_to_linear(self.x_single, self.x_pair) 38 | 39 | self.nsingle = self.ncol * 20 40 | self.npair = self.ncol * self.ncol * 21 * 21 41 | self.nvar = self.nsingle + self.npair 42 | 43 | # get constant alignment counts - INCLUDING PSEUDO COUNTS 44 | # important for small alignments 45 | self.freqs_single, self.freqs_pair = pseudocounts.freqs 46 | self.msa_counts_single = self.freqs_single * self.neff 47 | self.msa_counts_pair = self.freqs_pair * self.neff 48 | 49 | # reset gap counts 50 | self.msa_counts_single[:, 20] = 0 51 | self.msa_counts_pair[:, :, :, 20] = 0 52 | self.msa_counts_pair[:, :, 20, :] = 0 53 | 54 | # non_gapped counts 55 | self.Ni = self.msa_counts_single.sum(1) 56 | self.Nij = self.msa_counts_pair.sum(3).sum(2) 57 | 58 | ### Setting for (Persistent) Contrastive Divergence 
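# Background for the settings below: contrastive divergence approximates the gradient of the
# (regularized) log-likelihood by the difference between the amino acid counts of the input
# alignment and the counts of sequences Gibbs-sampled from the current model (see evaluate());
# with persistent=True the Markov chains are continued across iterations instead of being
# re-initialized from the data (see init_sample_alignment()).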
59 | 60 | #perform this many steps of Gibbs sampling per sequence 61 | # 1 Gibbs step == sample every sequence position once 62 | self.gibbs_steps = np.max([gibbs_steps, 1]) 63 | 64 | #define how many markov chains are run in parallel 65 | # => how many sequences are sampled at each iteration 66 | # at least 500 sequences or 10% of sequences in MSA 67 | self.nr_seq_sample = np.max([int(self.nrow/10), nr_seq_sample]) 68 | 69 | #prepare the persistent MSA (Markov chains are NOT reset after each iteration) 70 | self.persistent=persistent 71 | #ensure that msa has at least NR_SEQ_SAMPLE sequences 72 | seq_id = list(range(self.nrow)) * int(np.ceil(self.nr_seq_sample / float(self.nrow))) 73 | self.msa_persistent = self.msa[seq_id] 74 | self.weights_persistent = self.weights[seq_id] 75 | 76 | def __repr__(self): 77 | 78 | repr_str = "" 79 | 80 | if self.persistent: 81 | repr_str += "persistent " 82 | 83 | repr_str += "contrastive divergence: " 84 | 85 | repr_str += "\nnr of sampled sequences={0} ({1}xN and {2}xNeff and {3}xL) Gibbs steps={4} ".format( 86 | self.nr_seq_sample, 87 | np.round(self.nr_seq_sample / float(self.nrow), decimals=3), 88 | np.round(self.nr_seq_sample / self.neff, decimals=3), 89 | np.round(self.nr_seq_sample / float(self.ncol), decimals=3), 90 | self.gibbs_steps 91 | ) 92 | 93 | return repr_str 94 | 95 | def init_sample_alignment(self, persistent=False): 96 | """ 97 | in case of CD: 98 | Randomly choose NR_SEQ_SAMPLE sequences from the ORIGINAL alignment 99 | in case of persistent CD: 100 | Randomly choose NR_SEQ_SAMPLE sequences from the alignment containing previously sampled sequences 101 | use the sequence weights computed from the original alignment 102 | (recomputing sequence weights in each iteration is too expensive) 103 | 104 | :return: 105 | """ 106 | 107 | if persistent: 108 | # in case of PERSISTENT CD, continue the Markov chain: 109 | #randomly select NR_SEQ_SAMPLE sequences from persistent MSA 110 | self.sample_seq_id = np.random.choice(self.msa_persistent.shape[0], self.nr_seq_sample, replace=False) 111 | msa = self.msa_persistent[self.sample_seq_id] 112 | weights = self.weights_persistent[self.sample_seq_id] 113 | else: 114 | # in case of plain CD, reinitialize the Markov chains from original sequences: 115 | # randomly select NR_SEQ_SAMPLE sequences from original MSA 116 | self.sample_seq_id = np.random.choice(self.nrow, self.nr_seq_sample, replace=True) 117 | msa = self.msa[self.sample_seq_id] 118 | weights = self.weights[self.sample_seq_id] 119 | 120 | return msa, weights 121 | 122 | def finalize(self, x): 123 | return ccmpred.parameter_handling.linear_to_structured( 124 | x, self.ncol, clip=False, nogapstate=True, add_gap_state=True, padding=False 125 | ) 126 | 127 | def evaluate(self, x, persistent=False): 128 | 129 | #setup sequences for sampling 130 | self.msa_sampled, self.msa_sampled_weights = self.init_sample_alignment(persistent) 131 | 132 | #Gibbs Sampling of sequences (each position of each sequence will be sampled this often: GIBBS_STEPS) 133 | self.msa_sampled = ccmpred.sampling.gibbs_sample_sequences(x, self.msa_sampled, self.gibbs_steps) 134 | 135 | if persistent: 136 | self.msa_persistent[self.sample_seq_id] = self.msa_sampled 137 | 138 | # compute amino acid frequencies from sampled alignment 139 | # add pseudocounts for stability 140 | pseudocounts = PseudoCounts(self.msa_sampled, self.msa_sampled_weights) 141 | pseudocounts.calculate_frequencies( 142 | self.pseudocount_type, 143 | self.pseudocount_n_single, 144 | self.pseudocount_n_pair, 
145 | remove_gaps=False) 146 | 147 | #compute frequencies excluding gap counts 148 | sampled_freq_single = pseudocounts.degap(pseudocounts.freqs[0], True) 149 | sampled_freq_pair = pseudocounts.degap(pseudocounts.freqs[1], True) 150 | 151 | 152 | #compute counts and scale them according to the size of the original MSA 153 | sample_counts_single = sampled_freq_single * self.Ni[:, np.newaxis] 154 | sample_counts_pair = sampled_freq_pair * self.Nij[:, :, np.newaxis, np.newaxis] 155 | 156 | #actually compute the gradients 157 | g_single = sample_counts_single - self.msa_counts_single 158 | g_pair = sample_counts_pair - self.msa_counts_pair 159 | 160 | #sanity check: sampling redistributes counts within a column but must preserve the column total 161 | if np.abs(np.sum(sample_counts_single[1, :20]) - np.sum(self.msa_counts_single[1, :20])) > 1e-5: 162 | print("Warning: sample aa counts ({0}) do not equal input msa aa counts ({1})!".format( 163 | np.sum(sample_counts_single[1, :20]), np.sum(self.msa_counts_single[1, :20])) 164 | ) 165 | 166 | # set gradients for gap states to 0 167 | g_single[:, 20] = 0 168 | g_pair[:, :, :, 20] = 0 169 | g_pair[:, :, 20, :] = 0 170 | 171 | # set diagonal elements to 0 172 | for i in range(self.ncol): 173 | g_pair[i, i, :, :] = 0 174 | 175 | #compute regularization 176 | x_single, x_pair = self.linear_to_structured(x) #x_single has dim L x 20 177 | _, g_single_reg, g_pair_reg = self.regularization(x_single, x_pair) #g_single_reg has dim L x 20 178 | 179 | #gradient for x_single only L x 20 180 | g = self.structured_to_linear(g_single[:, :20], g_pair) 181 | g_reg = self.structured_to_linear(g_single_reg[:, :20], g_pair_reg) 182 | 183 | return -1, g, g_reg #CD does not evaluate the objective; -1 is a dummy function value 184 | 185 | def get_parameters(self): 186 | parameters = {} 187 | parameters['gibbs_steps'] = int(self.gibbs_steps) 188 | parameters['nr_seq_sample'] = int(self.nr_seq_sample) 189 | 190 | 191 | return parameters -------------------------------------------------------------------------------- /ccmpred/objfun/cd/cext/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ctypeslib as npct 3 | import ctypes 4 | import os.path 5 | 6 | array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS') 7 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS') 8 | 9 | libcd = npct.load_library('libcd', os.path.join(os.path.dirname(__file__), '_build')) 10 | 11 | libcd.sample_position_in_sequences.restype = None 12 | libcd.sample_position_in_sequences.argtypes = [ 13 | array_2d_char, # *msa 14 | array_1d_float, # *x 15 | ctypes.c_uint64, # nrow 16 | ctypes.c_uint32, # ncol 17 | ] 18 | 19 | libcd.gibbs_sample_sequences.restype = None 20 | libcd.gibbs_sample_sequences.argtypes = [ 21 | array_2d_char, # *msa 22 | array_1d_float, # *x 23 | ctypes.c_uint32, # steps 24 | ctypes.c_uint64, # nrow 25 | ctypes.c_uint32, # ncol 26 | ] 27 | 28 | libcd.gibbs_sample_sequences_nogaps.restype = None 29 | libcd.gibbs_sample_sequences_nogaps.argtypes = [ 30 | array_2d_char, # *msa 31 | array_1d_float, # *x 32 | ctypes.c_uint32, # steps 33 | ctypes.c_uint64, # nrow 34 | ctypes.c_uint32, # ncol 35 | ] 36 | 37 | def sample_position_in_sequences(msa, x): 38 | libcd.sample_position_in_sequences(msa, x, *msa.shape) 39 | return msa 40 | 41 | def gibbs_sample_sequences(msa, x, steps): 42 | libcd.gibbs_sample_sequences(msa, x, steps, *msa.shape) 43 | return msa 44 | 45 | def gibbs_sample_sequences_nogaps(msa, x, steps): 46 | libcd.gibbs_sample_sequences_nogaps(msa, x, steps, *msa.shape) 47 | return msa
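The wrappers above call into the compiled libcd routines defined in cd.c below. As a reference for what those routines compute, here is a minimal pure-NumPy sketch of the conditional distribution and one Gibbs sweep (illustrative only: the helper names are mine, the max-shift is an added numerical-stability trick not present in the C code, and the real implementation parallelizes over sequences with OpenMP):

```python
import numpy as np

def conditional_probs(i, seq, v, w):
    """Reference for compute_conditional_probs() in cd.c: p(x_i = a | x_{-i})
    over the 20 amino acids; the gap state (index 20) gets probability 0.
    v: (L, 20) single potentials, w: (L, L, 21, 21) pair potentials."""
    L = seq.shape[0]
    energies = v[i].copy()                        # v_i(a)
    for j in range(L):
        if j != i:                                # the C code subtracts the i == j term
            energies += w[i, j, :20, seq[j]]      # w_ij(a, x_j)
    p = np.exp(energies - energies.max())         # max-shift added for stability
    return p / p.sum()

def gibbs_sweep(seq, v, w, rng):
    """One Gibbs step: resample every non-gap position once, in random order."""
    for i in rng.permutation(seq.shape[0]):
        if seq[i] != 20:                          # gaps are never resampled
            seq[i] = rng.choice(20, p=conditional_probs(i, seq, v, w))
    return seq

# tiny smoke test with random potentials
rng = np.random.default_rng(1)
L = 5
v, w = rng.normal(size=(L, 20)), rng.normal(size=(L, L, 21, 21)) * 0.1
seq = rng.integers(0, 20, size=L).astype(np.uint8)
print(gibbs_sweep(seq, v, w, rng))
```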
-------------------------------------------------------------------------------- /ccmpred/objfun/cd/cext/cd.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #include "cd.h" 11 | #include "cdutil.h" 12 | 13 | 14 | /** 15 | * Compute conditional probabilities 16 | * $P(X_i = a | X^n_0, ... X^n_L \setminus X^n_i, v, w)$ 17 | * 18 | * @param[in] i Index of the column to compute probabilities for 19 | * @param[out] cond_probs Returns a 20-field array of conditional probabilities 20 | * @param[in] x The current potentials 21 | * @param[in] last_seq The current sequence to condition on 22 | * @param[in] ncol The number of columns in the MSA 23 | */ 24 | void compute_conditional_probs( 25 | const int i, 26 | flt *const cond_probs, 27 | const flt *const x, 28 | const unsigned char *const last_seq, 29 | const int ncol 30 | ) { 31 | int a, j; 32 | int nsingle = ncol * (N_ALPHA - 1); 33 | 34 | for (a = 0; a < N_ALPHA - 1; a++) { 35 | cond_probs[a] = E1(i,a); 36 | } 37 | 38 | for (a = 0; a < N_ALPHA - 1; a++) { 39 | for (j = 0; j < ncol; j++) { 40 | cond_probs[a] += E2(i, a, j, last_seq[j]); 41 | } 42 | 43 | // don't add up the case i = j 44 | cond_probs[a] -= E2(i, a, i, last_seq[i]); 45 | } 46 | 47 | cond_probs[GAP] = F0; 48 | 49 | flt denom = F0; 50 | for (a = 0; a < N_ALPHA - 1; a++) { 51 | cond_probs[a] = fexp(cond_probs[a]); 52 | denom += cond_probs[a]; 53 | } 54 | 55 | for (a = 0; a < N_ALPHA - 1; a++) { 56 | cond_probs[a] /= denom; 57 | } 58 | } 59 | 60 | /** 61 | * Resample a multiple sequence alignment 62 | * 63 | * @param[inout] seq The MSA to work on 64 | * @param[in] x The current potentials 65 | * @param[in] ncol The number of columns in the MSA 66 | * @param[in] n_samples The number of samples to generate (also the number of rows in the MSA) 67 | */ 68 | void sample_position_in_sequences( 69 | unsigned char *seq, 70 | const flt *const x, 71 | const unsigned long n_samples, 72 | const int ncol 73 | ) { 74 | seed_rng(); 75 | 76 | #pragma omp parallel 77 | { 78 | int i; 79 | unsigned long k; 80 | flt *pcondcurr = fl_malloc(N_ALPHA); 81 | 82 | #pragma omp for 83 | for (k = 0; k < n_samples; k++) { 84 | 85 | do { 86 | i = pick_random_uniform(ncol - 1); 87 | } while(seq[k * ncol + i] == GAP); 88 | 89 | compute_conditional_probs(i, pcondcurr, x, &seq[k * ncol], ncol); 90 | seq[k * ncol + i] = pick_random_weighted(pcondcurr, N_ALPHA - 1); 91 | 92 | } 93 | fl_free(pcondcurr); 94 | } 95 | } 96 | 97 | 98 | void gibbs_sample_sequences( 99 | unsigned char *seq, 100 | const flt *const x, 101 | const int steps, 102 | const unsigned long n_samples, 103 | const int ncol 104 | ){ 105 | 106 | seed_rng(); 107 | omp_set_dynamic(0); 108 | 109 | #pragma omp parallel 110 | { 111 | int i; 112 | unsigned long k; 113 | flt *pcondcurr = fl_malloc(N_ALPHA); 114 | 115 | //int array with elements 1..L 116 | unsigned int sequence_position_vector[ncol]; 117 | for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p; 118 | 119 | //int num_threads = omp_get_num_threads(); 120 | //printf("max thread num %d ", num_threads); 121 | 122 | #pragma omp for private(k) 123 | for (k = 0; k < n_samples; k++) { 124 | //int this_thread = omp_get_thread_num(); 125 | //printf("Compute seq %zu with thread %d \n", k, this_thread); 126 | 127 | for (int s=0; s < steps; s++){ 128 | shuffle(sequence_position_vector, ncol); 129 | 130 | for (i=0; i < ncol; i++){ 131 | if (seq[k * ncol + 
sequence_position_vector[i]] != GAP){ 132 | compute_conditional_probs(sequence_position_vector[i], pcondcurr, x, &seq[k * ncol], ncol); 133 | seq[k * ncol + sequence_position_vector[i]] = pick_random_weighted(pcondcurr, N_ALPHA - 1); 134 | } 135 | 136 | } 137 | } 138 | } 139 | fl_free(pcondcurr); 140 | } 141 | 142 | } 143 | 144 | 145 | void gibbs_sample_sequences_nogaps( 146 | unsigned char *seq, 147 | const flt *const x, 148 | const int steps, 149 | const unsigned long n_samples, 150 | const int ncol 151 | ){ 152 | 153 | seed_rng(); 154 | 155 | #pragma omp parallel 156 | { 157 | int i; 158 | unsigned long k; 159 | flt *pcondcurr = fl_malloc(N_ALPHA); 160 | 161 | //int array with elements 1..L 162 | unsigned int sequence_position_vector[ncol]; 163 | for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p; 164 | 165 | 166 | #pragma omp for 167 | for (int s=0; s < steps; s++){ 168 | for (k = 0; k < n_samples; k++) { 169 | shuffle(sequence_position_vector, ncol); 170 | 171 | for (i=0; i < ncol; i++){ 172 | compute_conditional_probs(sequence_position_vector[i], pcondcurr, x, &seq[k * ncol], ncol); 173 | seq[k * ncol + sequence_position_vector[i]] = pick_random_weighted(pcondcurr, N_ALPHA - 1); 174 | } 175 | } 176 | } 177 | fl_free(pcondcurr); 178 | } 179 | 180 | } -------------------------------------------------------------------------------- /ccmpred/objfun/cd/cext/cd.h: -------------------------------------------------------------------------------- 1 | #ifndef CD_H 2 | #define CD_H 3 | 4 | #define N_ALPHA 21 5 | #define GAP 20 6 | 7 | typedef double flt; 8 | #define F0 0.0 9 | #define F1 1.0 10 | #define F2 2.0 11 | #define fexp exp 12 | #define flog log 13 | 14 | 15 | #define X1_INDEX(i,a) (i) * (N_ALPHA - 1) + (a) 16 | #define X2_INDEX(i,a,j,b) (((i) * N_ALPHA + (a)) * ncol + (j)) * N_ALPHA + (b) 17 | 18 | #define G1(i,a) g[X1_INDEX(i,a)] 19 | #define G2(i,a,j,b) g[nsingle + X2_INDEX(i,a,j,b)] 20 | #define E1(i,a) x[X1_INDEX(i,a)] 21 | #define E2(i,a,j,b) x[nsingle + X2_INDEX(i,a,j,b)] 22 | #define H1(i,a) h[X1_INDEX(i,a)] 23 | #define H2(i,a,j,b) h[nsingle + X2_INDEX(i,a,j,b)] 24 | 25 | 26 | #define MSA(n,i) msa[MSA_INDEX(n,i)] 27 | 28 | #define MSA_INDEX(n,i) (n) * ncol + (i) 29 | 30 | void compute_conditional_probs( 31 | const int i, 32 | flt *const cond_probs, 33 | const flt *const x, 34 | const unsigned char *const last_seq, 35 | const int ncol 36 | ); 37 | 38 | #endif 39 | -------------------------------------------------------------------------------- /ccmpred/objfun/cd/cext/cdutil.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | 6 | #include "cd.h" 7 | #include "cdutil.h" 8 | 9 | 10 | void seed_rng() { 11 | int pid; 12 | struct timeval t; 13 | gettimeofday(&t, NULL); 14 | pid = getpid(); 15 | srand(t.tv_usec * t.tv_sec * pid); 16 | } 17 | 18 | 19 | /* Arrange the N elements of ARRAY in random order. 20 | Only effective if N is much smaller than RAND_MAX; 21 | if this may not be the case, use a better random 22 | number generator. 
*/ 23 | void shuffle(unsigned int *array, size_t n) 24 | { 25 | if (n > 1) 26 | { 27 | size_t i; 28 | for (i = 0; i < n - 1; i++) 29 | { 30 | size_t j = i + rand() / (RAND_MAX / (n - i) + 1); 31 | int t = array[j]; 32 | array[j] = array[i]; 33 | array[i] = t; 34 | } 35 | } 36 | } 37 | 38 | 39 | int pick_random_uniform(int max) { 40 | int div = RAND_MAX / (max + 1); 41 | int retval; 42 | 43 | do { 44 | retval = rand() / div; 45 | } while (retval > max); 46 | 47 | return retval; 48 | } 49 | 50 | // A B C 51 | // 0.1 0.2 0.7 52 | // |----|--------|------------------------------------------| 53 | // | 0.1 0.3 1 54 | 55 | //p<0.1 --> A 56 | //0.1 < p < 0.3 --> p - 0.1 < 0.2 --> B 57 | //p>=0.3 --> p - 0.1 - 0.2 < 0.7 --> C 58 | int pick_random_weighted(flt *probs, int n) { 59 | int a; 60 | double p = (double)rand() / (double)RAND_MAX; 61 | for (a = 0; a < n; a++) { 62 | flt p_curr = probs[a]; 63 | if (p < p_curr) { 64 | return a; 65 | } 66 | p -= p_curr; 67 | } 68 | return n - 1; 69 | } 70 | 71 | flt* fl_malloc(int n) { 72 | return (flt *)malloc(sizeof(flt) * n); 73 | } 74 | 75 | void fl_free(flt *dest) { 76 | free(dest); 77 | } 78 | 79 | 80 | 81 | void fl_memcpy(flt *dest, flt *src, int n) { 82 | memcpy(dest, src, sizeof(flt) * n); 83 | } 84 | 85 | void vecimulc(flt *dst, flt f, int n) { 86 | int i; 87 | for(i = 0; i < n; i++) { 88 | dst[i] *= f; 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /ccmpred/objfun/cd/cext/cdutil.h: -------------------------------------------------------------------------------- 1 | #ifndef CDUTIL_H 2 | #define CDUTIL_H 3 | 4 | void seed_rng(void); 5 | 6 | void shuffle(unsigned int *array, size_t n); 7 | int pick_random_uniform(int max); 8 | int pick_random_weighted(flt *probs, int n); 9 | 10 | flt* fl_malloc(int n); 11 | void fl_free(flt *dest); 12 | void fl_memcpy(flt *dest, flt *src, int n); 13 | 14 | void vecimulc(flt *dst, flt f, int n); 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /ccmpred/objfun/pll/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import ccmpred.raw 4 | import ccmpred.regularization 5 | import ccmpred.objfun 6 | import ccmpred.objfun.pll.cext 7 | import ccmpred.counts 8 | import ccmpred.parameter_handling 9 | 10 | class PseudoLikelihood(): 11 | def __init__(self, msa, weights, regularization, pseudocounts, x_single, x_pair): 12 | 13 | self.msa = msa 14 | self.nrow, self.ncol = msa.shape 15 | self.weights = weights 16 | self.neff = np.sum(weights) 17 | self.regularization = regularization 18 | 19 | self.structured_to_linear = lambda x_single, x_pair: \ 20 | ccmpred.parameter_handling.structured_to_linear( 21 | x_single, x_pair, nogapstate=False, padding=True) 22 | self.linear_to_structured = lambda x: \ 23 | ccmpred.parameter_handling.linear_to_structured( 24 | x, self.ncol, nogapstate=False, add_gap_state=False, padding=True) 25 | 26 | self.x_single = x_single 27 | self.x_pair = x_pair 28 | self.x = self.structured_to_linear(self.x_single, self.x_pair) 29 | 30 | #use msa counts with pseudo counts - numerically more stable?? but gradient does not fit ll fct!! 
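A note making the trade-off in the comment above precise (a sketch, ignoring the factor-2 handling of the symmetric pair terms that evaluate() describes below): for the negative log pseudolikelihood, the gradient with respect to a single potential is

$\frac{\partial\,(-\mathrm{pll})}{\partial v_i(a)} = \sum_n w_n\, p(x_i^n = a \mid x^n_{\setminus i}, v, w) - \sum_n w_n\, \mathbb{1}[x_i^n = a]$

i.e. the model expectation minus the raw weighted amino acid count. The count term is exactly what g_init stores and what `g -= self.g_init` subtracts in evaluate(); folding pseudocounts into those counts would change the gradient without changing the function value fx, leaving the optimizer with a gradient that no longer matches the objective it reports.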
31 | #self.freqs_single, self.freqs_pair = ccm.pseudocounts.freqs 32 | #msa_counts_single, msa_counts_pair = neff * freqs_single, neff * freqs_pair 33 | #use msa counts without pseudo counts 34 | msa_counts_single, msa_counts_pair = pseudocounts.counts 35 | 36 | msa_counts_single[:, 20] = 0 37 | msa_counts_pair[:, :, 20, :] = 0 38 | msa_counts_pair[:, :, :, 20] = 0 39 | 40 | for i in range(self.ncol): 41 | msa_counts_pair[i, i, :, :] = 0 42 | 43 | #non_gapped counts 44 | # self.Ni = msa_counts_single.sum(1) 45 | # self.Nij = msa_counts_pair.sum(3).sum(2) 46 | 47 | #no pseudo counts in gradient calculation 48 | #pairwise gradient is two-fold 49 | self.g_init = ccmpred.parameter_handling.structured_to_linear( 50 | msa_counts_single, 2 * msa_counts_pair) 51 | 52 | self.nsingle = self.ncol * 21 53 | self.nsingle_padded = self.nsingle + 32 - (self.nsingle % 32) 54 | self.nvar = self.nsingle_padded + self.ncol * self.ncol * 21 * 32 55 | 56 | # memory allocation for intermediate variables 57 | #gradient for single and pair potentials 58 | self.g = np.empty((self.nsingle_padded + self.ncol * self.ncol * 21 * 32,), dtype=np.dtype('float64')) 59 | #gradient for only pair potentials 60 | self.g2 = np.empty((self.ncol * self.ncol * 21 * 32,), dtype=np.dtype('float64')) 61 | 62 | 63 | def finalize(self, x): 64 | return ccmpred.parameter_handling.linear_to_structured( 65 | x, self.ncol, clip=True, nogapstate=False, add_gap_state=False, padding=True) 66 | 67 | def evaluate(self, x): 68 | 69 | #fx is function value of objective function over w_ijab with i=j=1 < L 70 | #--> potentials are symmetric and counted twice!! 71 | #w_ijab will later be updated by gradient for i=1 therefore gradient for w_ijab is multiplied by 2!! 73 | 74 | #pointer to g == self.g 75 | fx, g = ccmpred.objfun.pll.cext.evaluate(x, self.g, self.g2, self.weights, self.msa) 76 | g -= self.g_init 77 | 78 | x_single, x_pair = self.linear_to_structured(x) 79 | 80 | #compute regularizer 81 | fx_reg, g_single_reg, g_pair_reg = self.regularization(x_single, x_pair) 82 | g_pair_reg *= 2 #gradient is multiplied by 2 because of issue mentioned above 83 | g_reg = self.structured_to_linear(g_single_reg, g_pair_reg) 84 | fx += fx_reg 85 | 86 | return fx, g, g_reg 87 | 88 | def get_parameters(self): 89 | return {'padding' : True, 90 | 'pseudocounts': False} 91 | 92 | def __repr__(self): 93 | return "PLL " 94 | 95 | -------------------------------------------------------------------------------- /ccmpred/objfun/pll/cext/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ctypeslib as npct 3 | import ctypes 4 | import os.path 5 | 6 | array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS') 7 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS') 8 | 9 | libpll = npct.load_library('libpll', os.path.join(os.path.dirname(__file__), '_build')) 10 | 11 | libpll.evaluate_pll.restype = ctypes.c_double 12 | libpll.evaluate_pll.argtypes = [ 13 | array_1d_float, # *x 14 | array_1d_float, # *g 15 | array_1d_float, # *g2 16 | array_1d_float, # *weights 17 | array_2d_char, # *msa 18 | ctypes.c_uint32, # ncol 19 | ctypes.c_uint32, # nrow 20 | ] 21 | 22 | 23 | def evaluate(x, g, g2, weights, msa): 24 | nrow, ncol = msa.shape 25 | fx = libpll.evaluate_pll(x, g, g2, weights, msa, ncol, nrow) 26 | return fx, g 27 | -------------------------------------------------------------------------------- /ccmpred/objfun/pll/cext/pll.c: 
-------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include 4 | #include 5 | #include 6 | #include 7 | #include 8 | 9 | #include "pll.h" 10 | 11 | double evaluate_pll( 12 | const double *x, 13 | double *g, 14 | double *g2, 15 | double *weights, 16 | unsigned char *msa, 17 | const uint32_t ncol, 18 | const uint32_t nrow 19 | ) { 20 | uint32_t nsingle = ncol * N_ALPHA; 21 | uint32_t nsingle_padded = nsingle + N_ALPHA_PAD - (nsingle % N_ALPHA_PAD); 22 | uint64_t nvar_padded = nsingle_padded + ncol * ncol * N_ALPHA * N_ALPHA_PAD; 23 | 24 | const double *x1 = x; 25 | const double *x2 = &x[nsingle_padded]; 26 | 27 | double *g1 = g; 28 | double *g2l = &g[nsingle_padded]; 29 | 30 | // set fx and gradient to 0 initially 31 | double fx = 0.0; 32 | 33 | //gradient for single and pair potentials 34 | memset(g, 0, sizeof(double) * nvar_padded); 35 | //gradient only for pair potentials 36 | memset(g2, 0, sizeof(double) * (nvar_padded - nsingle_padded)); 37 | 38 | double *precomp_norm = malloc(sizeof(double) * N_ALPHA * nrow * ncol); 39 | 40 | //#pragma omp parallel for reduction(+:fx) 41 | //iterate over ALL pairs (not only i log Z_nj 52 | for(int a = 0; a < N_ALPHA - 1; a++) { 53 | precomp[a] = V(a, j); 54 | 55 | for(uint32_t i = 0; i < ncol; i++) { 56 | unsigned char xni = X(n, i); 57 | 58 | //ignore gaps 59 | if (xni < N_ALPHA - 1) { 60 | precomp[a] += W(a, j, xni, i); 61 | } 62 | } 63 | 64 | precomp_sum += exp(precomp[a]); 65 | } 66 | precomp[N_ALPHA - 1] = 0; // set precomp(gap) to zero 67 | precomp_sum = log(precomp_sum); 68 | 69 | 70 | // compute exp(V_j(a) + sum(i < L) w_{ji}(a, x_ni)) / Z_nj 71 | // needed for gradient computation 72 | // --> exp(precomp) / exp(log(Z)) 73 | // --> exp(precomp - log(Z)) 74 | //ignore gaps! 75 | for(int a = 0; a < N_ALPHA - 1; a++) { 76 | precomp_norm[(n * N_ALPHA + a) * ncol + j] = exp(precomp[a] - precomp_sum); 77 | } 78 | precomp_norm[(n * N_ALPHA + N_ALPHA - 1) * ncol + j] = 0; 79 | 80 | 81 | 82 | unsigned char xnj = X(n,j); 83 | 84 | // actually add up the function value if x_nj is not a gap 85 | // * -1.0 because we are using negative log likelihood 86 | // weight(n) * (precomp( x_nj ) - log Z_nj) 87 | // weight(n) * ( V_j(x_nj) + sum(i < L) w_{ji}(x_nj, x_ni) - log Z_nj) 88 | 89 | if(xnj < N_ALPHA - 1) { 90 | fx += weight * (precomp_sum - precomp[xnj]); 91 | } 92 | 93 | } // nj 94 | 95 | 96 | //compute gradients for single emissions 97 | #pragma omp parallel for 98 | for(uint32_t nj = 0; nj < nrow * ncol; nj++) { 99 | uint32_t n = nj / ncol; 100 | uint32_t j = nj % ncol; 101 | unsigned char xnj = X(n,j); 102 | double weight = weights[n]; 103 | 104 | //if xnj is not a gap: add second part of gradient 105 | if(xnj < N_ALPHA - 1) { 106 | 107 | for(uint32_t a = 0; a < N_ALPHA - 1; a++) { 108 | #pragma omp atomic 109 | G1(a, j) += weight * precomp_norm[(n * N_ALPHA + a) * ncol + j]; 110 | } 111 | } else { 112 | //otherwise set precomp_norm to zero so that no count will be added to G2 113 | for(uint32_t a = 0; a < N_ALPHA; a++) { 114 | precomp_norm[(n * N_ALPHA + a) * ncol + j] = 0; 115 | } 116 | } 117 | 118 | } // nj 119 | 120 | //compute gradients for pair emissions 121 | #pragma omp parallel for 122 | //iterate over WHOLE matrix (not only iresidue j: {1}
score: {2}".format( 66 | plot_matrix.residue_i.tolist()[i], 67 | plot_matrix.residue_j.tolist()[i], 68 | np.round(plot_matrix.confidence.tolist()[i], decimals=3)) 69 | for i in range(len(plot_matrix.residue_i.tolist()))] 70 | 71 | hover_text += ["residue i: {0}
<br>residue j: {1}<br>
score: {2}".format( 72 | plot_matrix.residue_j.tolist()[i], 73 | plot_matrix.residue_i.tolist()[i], 74 | np.round(plot_matrix.confidence.tolist()[i], decimals=3)) 75 | for i in range(len(plot_matrix.residue_i.tolist()))] 76 | 77 | # add predicted contact map 78 | data.append( 79 | go.Heatmap( 80 | x=plot_matrix.residue_i.tolist() + plot_matrix.residue_j.tolist(), 81 | y=plot_matrix.residue_j.tolist() + plot_matrix.residue_i.tolist(), 82 | z=plot_matrix.confidence.tolist() + plot_matrix.confidence.tolist(), 83 | name='predicted', 84 | hoverinfo="text", 85 | text=hover_text, 86 | colorscale='Greys', 87 | reversescale=True, 88 | colorbar=dict( 89 | x = 1, 90 | y = 0.4, 91 | yanchor = 'bottom', 92 | len = 0.4, 93 | title = "Score" 94 | ) 95 | ) 96 | ) 97 | 98 | 99 | # if distances and class are available 100 | if 'contact' in plot_matrix and 'distance' in plot_matrix: 101 | 102 | # colorscale from red (small distance) to blue(large distance) 103 | zmax = np.max(plot_matrix.distance) 104 | percent_at_contact_thr = 8 / zmax 105 | distance_colorscale = [[0, 'rgb(128, 0, 0)'], 106 | [percent_at_contact_thr, 'rgb(255, 255, 255)'], 107 | [1, 'rgb(22, 96, 167)']] 108 | 109 | 110 | hover_text = ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 111 | plot_matrix.residue_i.tolist()[i], 112 | plot_matrix.residue_j.tolist()[i], 113 | np.round(plot_matrix.confidence.tolist()[i], decimals=3), 114 | np.round(plot_matrix.distance.tolist()[i], decimals=3)) 115 | for i in range(len(plot_matrix.residue_i.tolist()))] 116 | 117 | hover_text += ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 118 | plot_matrix.residue_j.tolist()[i], 119 | plot_matrix.residue_i.tolist()[i], 120 | np.round(plot_matrix.confidence.tolist()[i], decimals=3), 121 | np.round(plot_matrix.distance.tolist()[i], decimals=3)) 122 | for i in range(len(plot_matrix.residue_i.tolist()))] 123 | 124 | # define triangle on opposite site of Predictions 125 | data.append( 126 | go.Heatmap( 127 | x=plot_matrix.residue_j.tolist(), 128 | y=plot_matrix.residue_i.tolist(), 129 | z=plot_matrix.distance.tolist(), 130 | name='observed', 131 | hoverinfo="text", 132 | text=hover_text, 133 | zmin=0, 134 | zmax=zmax, 135 | colorscale=distance_colorscale, 136 | colorbar=dict( 137 | x=1, 138 | y=0, 139 | yanchor='bottom', 140 | len=0.4, 141 | title="Distance [A]") 142 | ) 143 | ) 144 | 145 | 146 | # define true and false positives among the L/5 highest scores 147 | sub_L5_true = plot_matrix.query('distance > 0').head(int(L / 5)).query('contact > 0') 148 | sub_L5_false = plot_matrix.query('distance > 0').head(int(L / 5)).query('contact < 1') 149 | 150 | tp_text = ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 151 | sub_L5_true.residue_i.tolist()[i], 152 | sub_L5_true.residue_j.tolist()[i], 153 | np.round(sub_L5_true.confidence.tolist()[i], decimals=3), 154 | np.round(sub_L5_true.distance.tolist()[i], decimals=3)) 155 | for i in range(len(sub_L5_true.residue_i.tolist()))] 156 | 157 | tp_text += ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 158 | sub_L5_true.residue_j.tolist()[i], 159 | sub_L5_true.residue_i.tolist()[i], 160 | np.round(sub_L5_true.confidence.tolist()[i], decimals=3), 161 | np.round(sub_L5_true.distance.tolist()[i], decimals=3)) 162 | for i in range(len(sub_L5_true.residue_i.tolist()))] 163 | 164 | if len(sub_L5_true) > 0: 165 | # Mark TP and FP in the plot with little crosses 166 | data.append( 167 | go.Scatter( 168 | x=sub_L5_true['residue_i'].tolist() + sub_L5_true['residue_j'].tolist(), 169 | y=sub_L5_true['residue_j'].tolist() + sub_L5_true['residue_i'].tolist(), 170 | mode='markers', 171 | text=tp_text, 172 | hoverinfo="text", 173 | marker=dict( 174 | symbol=134, 175 | color="green", 176 | line=dict(width=2), 177 | size=12 178 | ), 179 | name="TP (L/5)" 180 | ) 181 | ) 182 | 183 | fp_text = ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 184 | sub_L5_false.residue_i.tolist()[i], 185 | sub_L5_false.residue_j.tolist()[i], 186 | np.round(sub_L5_false.confidence.tolist()[i], decimals=3), 187 | np.round(sub_L5_false.distance.tolist()[i], decimals=3)) 188 | for i in range(len(sub_L5_false.residue_i.tolist()))] 189 | 190 | fp_text += ["residue i: {0}
<br>residue j: {1}
<br>score: {2}<br>
distance: {3}".format( 191 | sub_L5_false.residue_j.tolist()[i], 192 | sub_L5_false.residue_i.tolist()[i], 193 | np.round(sub_L5_false.confidence.tolist()[i], decimals=3), 194 | np.round(sub_L5_false.distance.tolist()[i], decimals=3)) 195 | for i in range(len(sub_L5_false.residue_i.tolist()))] 196 | 197 | if len(sub_L5_false) > 0: 198 | data.append( 199 | go.Scatter( 200 | x=sub_L5_false['residue_i'].tolist() + sub_L5_false['residue_j'].tolist(), 201 | y=sub_L5_false['residue_j'].tolist() + sub_L5_false['residue_i'].tolist(), 202 | mode='markers', 203 | text=fp_text, 204 | hoverinfo="text", 205 | marker=dict( 206 | symbol=134, 207 | color="red", 208 | line=dict(width=2), 209 | size=12 210 | ), 211 | name="FP (L/5)" 212 | 213 | ) 214 | ) 215 | 216 | 217 | 218 | 219 | # add diagonal and diagonals marking sequence separation 220 | data.append(go.Scatter( 221 | x=[0, L], y=[0, L], 222 | mode='lines', 223 | line=dict(color=('rgb(0, 0, 0)'), width=4), 224 | hoverinfo=None, 225 | showlegend=False) 226 | ) 227 | data.append( 228 | go.Scatter( 229 | x=[0, L - seqsep + 1], y=[seqsep - 1, L], 230 | mode='lines', 231 | line=dict(color=('rgb(0, 0, 0)'), width=2), 232 | showlegend=False) 233 | ) 234 | data.append( 235 | go.Scatter( 236 | x=[seqsep - 1, L], y=[0, L - seqsep + 1], 237 | mode='lines', 238 | line=dict(color=('rgb(0, 0, 0)'), width=2), 239 | showlegend=False) 240 | ) 241 | 242 | 243 | fig = tools.make_subplots(rows=2, cols=1, shared_xaxes=True, print_grid=False) 244 | 245 | 246 | if gaps_percentage_plot is not None: 247 | for trace in gaps_percentage_plot['data']: 248 | fig.append_trace(trace, 1, 1) 249 | 250 | for trace in data: 251 | fig.append_trace(trace, 2, 1) 252 | 253 | fig['layout']['title'] = title 254 | fig['layout']['width'] = 1000 255 | fig['layout']['height'] = 1000 256 | fig['layout']['legend'] = {'x': 1, 'y': 1} # places legend to the right of plot 257 | fig['layout']['hovermode'] = "closest" 258 | 259 | fig['layout']['xaxis1']['title'] = 'i' 260 | fig['layout']['xaxis1']['range'] = [0.5, L + 0.5] 261 | fig['layout']['xaxis1']['domain'] = [0.0, 1.0] 262 | fig['layout']['xaxis1']['zeroline'] = False 263 | 264 | fig['layout']['yaxis2']['title'] = 'j' 265 | fig['layout']['yaxis2']['range'] = [0.5, L + 0.5] 266 | fig['layout']['yaxis2']['domain'] = [0.0, 1.0] 267 | fig['layout']['yaxis2']['scaleanchor'] = "x" 268 | fig['layout']['yaxis2']['scaleratio'] = 1.0 269 | fig['layout']['yaxis2']['zeroline'] = False 270 | 271 | fig['layout']['font']['size'] = 18 272 | 273 | #percentage gaps and entropy plot 274 | if gaps_percentage_plot is not None: 275 | fig['layout']['yaxis2']['domain'] = [0.0, 0.9] 276 | #fig['layout']['xaxis1']['domain'] = [0.0, 0.9] 277 | fig['layout']['yaxis1']['domain'] = [0.9, 1.0] 278 | 279 | 280 | 281 | if plot_file: 282 | plotly_plot(fig, filename=plot_file, auto_open=False, show_link=False) 283 | else: 284 | return fig 285 | 286 | def plot_empirical_vs_model_statistics( 287 | single_freq_observed, single_freq_sampled, 288 | pairwise_freq_observed, pairwise_freq_sampled, 289 | plot_out): 290 | 291 | L = single_freq_observed.shape[0] 292 | indices_upper_triangle_i, indices_upper_triangle_j = np.triu_indices(L, k=1) 293 | 294 | x_single = single_freq_observed.flatten().tolist() 295 | y_single = single_freq_sampled.flatten().tolist() 296 | pair_freq_observed = pairwise_freq_observed[ 297 | indices_upper_triangle_i, 298 | indices_upper_triangle_j, :, :].flatten().tolist() 299 | pair_freq_sampled = pairwise_freq_sampled[ 300 | indices_upper_triangle_i, 301 | 
indices_upper_triangle_j, :, :].flatten().tolist() 302 | cov_observed = [pairwise_freq_observed[i, j, a, b] - (single_freq_observed[i, a] * single_freq_observed[j, b]) 303 | for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)] 304 | cov_sampled = [pairwise_freq_sampled[i, j, a, b] - (single_freq_sampled[i, a] * single_freq_sampled[j, b]) 305 | for i in range(L - 1) for j in range(i + 1, L) for a in range(20) for b in range(20)] 306 | 307 | 308 | ## first trace: single amino acid frequencies 309 | trace_single_frequencies = go.Scattergl( 310 | x=x_single, 311 | y=y_single, 312 | mode='markers', 313 | name='single frequencies', 314 | text=["position: {0}
amino acid: {1}".format(i+1,io.AMINO_ACIDS[a]) for i in range(L) for a in range(20)], 315 | marker=dict(color='black'), 316 | opacity=0.1, 317 | showlegend=False 318 | ) 319 | pearson_corr_single = np.corrcoef(x_single, y_single)[0,1] 320 | 321 | ## second trace: pairwise amino acid frequencies 322 | parir_freq_annotation = ["position: {0}-{1}
amino acid: {2}-{3}".format( 323 | i+1, 324 | j+1, 325 | io.AMINO_ACIDS[a], 326 | io.AMINO_ACIDS[b]) for i in range(L-1) for j in range(i+1, L) for a in range(20) for b in range(20)] 327 | trace_pairwise_frequencies = go.Scattergl( 328 | x=pair_freq_observed, 329 | y=pair_freq_sampled, 330 | mode='markers', 331 | name='pairwise frequencies', 332 | text=parir_freq_annotation, 333 | marker=dict(color='black'), 334 | opacity=0.1, 335 | showlegend=False 336 | ) 337 | pearson_corr_pair = np.corrcoef(pair_freq_observed, pair_freq_sampled)[0, 1] 338 | 339 | ## third trace: covariances 340 | trace_cov = go.Scattergl( 341 | x=cov_observed, 342 | y=cov_sampled, 343 | mode='markers', 344 | name='covariances', 345 | text=parir_freq_annotation, 346 | marker=dict(color='black'), 347 | opacity=0.1, 348 | showlegend=False 349 | ) 350 | pearson_corr_cov = np.corrcoef(cov_observed, cov_sampled)[0, 1] 351 | 352 | 353 | #define diagonals 354 | diag_single = [np.min(x_single + y_single), np.max(x_single + y_single)] 355 | diag_pair = [np.min(pair_freq_observed + pair_freq_sampled), np.max(pair_freq_observed + pair_freq_sampled)] 356 | diag_cov = [np.min(cov_observed + cov_sampled), np.max(cov_observed+ cov_sampled)] 357 | 358 | 359 | diagonal_single = go.Scattergl( 360 | x=diag_single, 361 | y=diag_single, 362 | mode="lines", 363 | showlegend=False, 364 | marker=dict(color='rgb(153, 204, 255)') 365 | ) 366 | 367 | diagonal_pair = go.Scattergl( 368 | x=diag_pair, 369 | y=diag_pair, 370 | mode="lines", 371 | showlegend=False, 372 | marker=dict(color='rgb(153, 204, 255)') 373 | ) 374 | 375 | diagonal_cov = go.Scattergl( 376 | x=diag_cov, 377 | y=diag_cov, 378 | mode="lines", 379 | showlegend=False, 380 | marker=dict(color='rgb(153, 204, 255)') 381 | ) 382 | 383 | 384 | 385 | ## define subplots 386 | fig = tools.make_subplots( 387 | rows=1, 388 | cols=3, 389 | subplot_titles=["single site amino acid frequencies", "pairwise amino acid frequencies", "covariances"], 390 | horizontal_spacing = 0.05, 391 | print_grid=False 392 | ) 393 | 394 | ## add traces as subplots 395 | fig.append_trace(trace_single_frequencies, 1, 1) 396 | fig.append_trace(diagonal_single, 1, 1) 397 | fig.append_trace(trace_pairwise_frequencies, 1, 2) 398 | fig.append_trace(diagonal_pair, 1, 2) 399 | fig.append_trace(trace_cov, 1, 3) 400 | fig.append_trace(diagonal_cov, 1, 3) 401 | 402 | 403 | 404 | #incresae size of subplot titles 405 | fig.layout.annotations[0].font.size = 20 406 | fig.layout.annotations[1].font.size = 20 407 | fig.layout.annotations[2].font.size = 20 408 | 409 | 410 | # add text to plot: Pearson correlation coefficient 411 | annotation_single = go.layout.Annotation( 412 | dict( 413 | x=0.13,#0.02, 414 | y=0.04,#0.95, 415 | xanchor="left", 416 | xref='paper', 417 | yref='paper', 418 | text='Pearson r = ' + str(np.round(pearson_corr_single, decimals=3)), 419 | bgcolor = "white", 420 | showarrow=False 421 | ) 422 | ) 423 | 424 | annotation_pair = go.layout.Annotation( 425 | dict( 426 | x=0.48,#0.37, 427 | y=0.04,#0.95, 428 | xanchor="left", 429 | xref='paper', 430 | yref='paper', 431 | text='Pearson r = ' + str(np.round(pearson_corr_pair, decimals=3)), 432 | bgcolor="white", 433 | showarrow=False 434 | ) 435 | ) 436 | 437 | annotation_cov = go.layout.Annotation( 438 | dict( 439 | x=0.85,#0.71, 440 | y=0.04,#0.95, 441 | xanchor="left", 442 | xref='paper', 443 | yref='paper', 444 | text='Pearson r = ' + str(np.round(pearson_corr_cov, decimals=3)), 445 | bgcolor="white", 446 | showarrow=False 447 | ) 448 | ) 449 | 450 | 
fig.layout.annotations += (annotation_single, annotation_pair, annotation_cov) 451 | 452 | #define layout 453 | fig['layout'].update( 454 | font = dict(size=20), 455 | hovermode = 'closest', 456 | width=1500, 457 | height=500, 458 | margin=dict(t=40) 459 | 460 | ) 461 | 462 | 463 | #specify axis layout details 464 | fig['layout']['yaxis1'].update( 465 | title="statistics from MCMC sample", 466 | exponentformat="e", 467 | showexponent='all', 468 | scaleanchor="x1", 469 | scaleratio=1 470 | ) 471 | fig['layout']['yaxis2'].update( 472 | exponentformat="e", 473 | showexponent='all', 474 | scaleanchor="x2", 475 | scaleratio=1 476 | ) 477 | fig['layout']['yaxis3'].update( 478 | exponentformat="e", 479 | showexponent='all', 480 | scaleanchor="x3", 481 | scaleratio=1 482 | ) 483 | fig['layout']['xaxis1'].update( 484 | exponentformat="e", 485 | showexponent='all', 486 | scaleanchor="y1", 487 | scaleratio=1, 488 | showspikes=True 489 | ) 490 | fig['layout']['xaxis2'].update( 491 | title="statistics from natural sequences", 492 | exponentformat="e", 493 | showexponent='all', 494 | scaleanchor="y2", 495 | scaleratio=1 496 | ) 497 | fig['layout']['xaxis3'].update( 498 | exponentformat="e", 499 | showexponent='all', 500 | scaleanchor="y3", 501 | scaleratio=1 502 | ) 503 | 504 | fig['layout']['xaxis1']['range'] = [0, 1] 505 | fig['layout']['xaxis2']['range'] = [0, 1] 506 | fig['layout']['yaxis1']['range'] = [0, 1] 507 | fig['layout']['yaxis2']['range'] = [0, 1] 508 | 509 | 510 | 511 | plotly_plot(fig, filename=plot_out, auto_open=False, show_link=False, image_filename=plot_out.replace("html", "")) 512 | 513 | def plot_alignment(aa_counts_single, title, plot_file, freq=True): 514 | 515 | Neff = np.sum(aa_counts_single[0,:]) 516 | L = aa_counts_single.shape[0] 517 | 518 | #create plot 519 | data = [] 520 | 521 | if freq: 522 | aa_counts_single /= Neff 523 | 524 | #add bar for each amino acid for each position 525 | for aa in range(20): 526 | data.append( 527 | go.Bar( 528 | x= list(range(1,L+1)), 529 | y=aa_counts_single[:, aa].tolist(), 530 | showlegend=True, 531 | name=io.AMINO_ACIDS[aa] 532 | ) 533 | ) 534 | 535 | 536 | layout = go.Layout( 537 | barmode='stack', 538 | title=title, 539 | xaxis=dict(title="Alignment Position"), 540 | yaxis=dict( 541 | title="Amino Acid Distribution", 542 | exponentformat='e', 543 | showexponent='All' 544 | ), 545 | font=dict(size=18) 546 | ) 547 | 548 | plot = {'data': data, 'layout': layout} 549 | 550 | 551 | plotly_plot(plot, filename=plot_file, auto_open=False, link_text='') 552 | -------------------------------------------------------------------------------- /ccmpred/pseudocounts.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import ccmpred.counts 4 | import ccmpred.substitution_matrices 5 | 6 | 7 | class PseudoCounts(object): 8 | """Add pseudocounts to prevent vanishing amino acid frequencies""" 9 | 10 | def __init__(self, msa, weights): 11 | 12 | self.msa = msa 13 | self.N, self.L = self.msa.shape 14 | self.weights=weights 15 | self.neff = np.sum(weights) if self.weights is not None else self.N 16 | 17 | #with weights 18 | self.counts = ccmpred.counts.both_counts(self.msa, self.weights) 19 | self.freqs = None 20 | 21 | self.pseudocount_n_single = None 22 | self.pseudocount_n_pair = None 23 | self.pseudocount_type = None 24 | self.remove_gaps = None 25 | self.pseudocount_ratio_single = None 26 | self.pseudocount_ratio_pair = None 27 | 28 | #will be computed from Freq with pseudo-counts and Neff 29 
| self.Ni = None 30 | self.Nij = None 31 | 32 | 33 | def calculate_Ni(self, freqs_single=None): 34 | 35 | if freqs_single is not None: 36 | #counts may include pseudo-counts 37 | single_counts = freqs_single * self.neff 38 | else: 39 | single_counts, pair_counts = self.counts 40 | 41 | # reset gap counts 42 | single_counts[:, 20] = 0 43 | 44 | Ni = single_counts.sum(1) 45 | 46 | self.Ni = Ni 47 | 48 | def calculate_Nij(self, freqs_pair=None): 49 | 50 | if freqs_pair is not None: 51 | #counts may include pseudo-counts 52 | pair_counts = freqs_pair * self.neff 53 | else: 54 | single_counts, pair_counts = self.counts 55 | 56 | # reset gap counts 57 | pair_counts[:, :, :, 20] = 0 58 | pair_counts[:, :, 20, :] = 0 59 | 60 | # non_gapped counts 61 | Nij = pair_counts.sum(3).sum(2) 62 | 63 | self.Nij = Nij 64 | 65 | def calculate_global_aa_freq(self): 66 | 67 | single_counts, _ = self.counts 68 | 69 | #normalized with gaps 70 | single_freq = single_counts / self.neff 71 | 72 | #single freq counts normalized without gaps 73 | single_freq = self.degap(single_freq, True) 74 | 75 | 76 | return np.mean(single_freq[:, :20], axis=0)[np.newaxis, :][0] 77 | 78 | def calculate_frequencies(self, pseudocount_type, pseudocount_n_single=1, pseudocount_n_pair=None, remove_gaps=False): 79 | 80 | 81 | self.pseudocount_n_single = pseudocount_n_single 82 | self.pseudocount_n_pair = pseudocount_n_pair 83 | self.pseudocount_type = pseudocount_type 84 | self.remove_gaps = remove_gaps 85 | 86 | single_counts, pair_counts = self.counts 87 | 88 | if pseudocount_n_pair is None: 89 | pseudocount_n_pair = pseudocount_n_single 90 | 91 | 92 | self.pseudocount_ratio_single = pseudocount_n_single / (self.neff + pseudocount_n_single) 93 | self.pseudocount_ratio_pair = pseudocount_n_pair / (self.neff + pseudocount_n_pair) 94 | 95 | #frequencies are normalized WITH gaps 96 | single_freq = single_counts / self.neff 97 | pair_freq = pair_counts / self.neff 98 | 99 | if (remove_gaps): 100 | single_freq = self.degap(single_freq,True) 101 | pair_freq = self.degap(pair_freq, True) 102 | 103 | pcounts = getattr(self, pseudocount_type)(single_freq) 104 | 105 | single_freq_pc = (1 - self.pseudocount_ratio_single) * single_freq + self.pseudocount_ratio_single * pcounts 106 | pair_freq_pc = ((1 - self.pseudocount_ratio_pair) ** 2) * \ 107 | (pair_freq - single_freq[:, np.newaxis, :, np.newaxis] * single_freq[np.newaxis, :, np.newaxis, :]) + \ 108 | (single_freq_pc[:, np.newaxis, :, np.newaxis] * single_freq_pc[np.newaxis, :, np.newaxis, :]) 109 | 110 | self.freqs = single_freq_pc, pair_freq_pc 111 | 112 | #compute weighted non-gapped sequence counts 113 | self.calculate_Ni(single_freq_pc) 114 | self.calculate_Nij(pair_freq_pc) 115 | 116 | @staticmethod 117 | def degap(freq, keep_dims=False): 118 | if len(freq.shape) == 2 : 119 | out = freq[:, :20] / (1 - freq[:, 20])[:, np.newaxis] 120 | else: 121 | freq_sum = freq[:,:,:20, :20].sum(3).sum(2)[:, :, np.newaxis, np.newaxis] 122 | out = freq[:, :, :20, :20] / (freq_sum + 1e-10) 123 | 124 | if keep_dims: 125 | if len(freq.shape) == 2 : 126 | out2 = np.zeros((freq.shape[0], 21)) 127 | out2[:, :20] = out 128 | else: 129 | out2 = np.zeros((freq.shape[0], freq.shape[1], 21, 21)) 130 | out2[:, :, :20, :20] = out 131 | out = out2 132 | 133 | return out 134 | 135 | def uniform_pseudocounts(self, single_freq): 136 | uniform_pc = np.zeros_like(single_freq) 137 | uniform_pc.fill(1. 
/ single_freq.shape[1]) 138 | return uniform_pc 139 | 140 | def constant_pseudocounts(self, single_freq): 141 | return np.mean(single_freq, axis=0)[np.newaxis, :] 142 | 143 | def substitution_matrix_pseudocounts(self, single_freq, substitution_matrix=ccmpred.substitution_matrices.BLOSUM62): 144 | """ 145 | Substitution matrix pseudocounts 146 | 147 | $\tilde{q}(x_i = a) = \sum_{b=1}^{20} p(a | b) q_0(x_i = b)$ 148 | """ 149 | single_freq_degap = self.degap(single_freq) 150 | 151 | # $p(b) = \sum_{a=1}^{20} p(a, b)$ 152 | pb = np.sum(substitution_matrix, axis=0) 153 | 154 | # p(a | b) = p(a, b) / p(b) 155 | cond_prob = substitution_matrix / pb[np.newaxis, :] 156 | 157 | freqs_pc = np.zeros_like(single_freq) 158 | freqs_pc[:, :20] = np.sum(cond_prob[np.newaxis, :, :] * single_freq_degap[:, np.newaxis, :], axis=2) 159 | 160 | return freqs_pc 161 | 162 | def no_pseudocounts(self, single_freq): 163 | return single_freq 164 | -------------------------------------------------------------------------------- /ccmpred/raw/__init__.py: -------------------------------------------------------------------------------- 1 | from ccmpred.raw.ccmraw import parse_oldraw, parse_msgpack, parse, write_msgpack, write_oldraw, CCMRaw 2 | -------------------------------------------------------------------------------- /ccmpred/raw/ccmraw.py: -------------------------------------------------------------------------------- 1 | import msgpack 2 | import functools 3 | import numpy as np 4 | import re 5 | import json 6 | import gzip 7 | from six import string_types, StringIO 8 | 9 | 10 | META_PREFIX = "#>META> " 11 | 12 | 13 | class CCMRaw(object): 14 | """Storage class for CCMpred raw prediction""" 15 | def __init__(self, ncol, x_single, x_pair, meta): 16 | self.ncol = ncol 17 | self.x_single = x_single 18 | self.x_pair = x_pair 19 | self.meta = meta 20 | 21 | def __repr__(self): 22 | return "<CCMRaw ncol={0}>".format(self.ncol) 23 | 24 | 25 | def stream_or_file(mode='r'): 26 | """Decorator for making a function accept either a filename or file-like object as a first argument""" 27 | 28 | def inner(fn): 29 | @functools.wraps(fn) 30 | def streamify(f, *args, **kwargs): 31 | if isinstance(f, string_types): 32 | 33 | open_fn = gzip.open if f.endswith(".gz") else open 34 | 35 | try: 36 | fh = open_fn(f, mode) 37 | res = fn(fh, *args, **kwargs) 38 | finally: 39 | fh.close() 40 | 41 | return res 42 | else: 43 | return fn(f, *args, **kwargs) 44 | 45 | return streamify 46 | 47 | return inner 48 | 49 | 50 | _PARSERS = [] 51 | 52 | 53 | def parser(fn): 54 | _PARSERS.append(fn) 55 | return fn 56 | 57 | 58 | @parser 59 | @stream_or_file('rb') 60 | def parse_msgpack(f): 61 | """Parse a msgpack CCMpred prediction from a filename or file object""" 62 | x = msgpack.unpackb(f.read(), encoding="utf-8") 63 | 64 | assert(x['format'] == 'ccm-1') 65 | 66 | ncol = x['ncol'] 67 | x_single = np.array(x['x_single']).reshape((ncol, 20)) 68 | x_pair = np.zeros((ncol, ncol, 21, 21)) 69 | 70 | meta = x['meta'] if 'meta' in x else None 71 | 72 | for p in x['x_pair'].values(): 73 | i = p['i'] 74 | j = p['j'] 75 | mat = np.array(p['x']).reshape((21, 21)) 76 | x_pair[i, j, :, :] = mat 77 | x_pair[j, i, :, :] = mat.T 78 | 79 | return CCMRaw(ncol, x_single, x_pair, meta) 80 | 81 | 82 | @parser 83 | @stream_or_file('r') 84 | def parse_oldraw(f): 85 | """Read raw emission potentials from rawfile""" 86 | 87 | buf = StringIO() 88 | re_identifier = re.compile("^#\s*(\d+)\s+(\d+)\s*$") 89 | 90 | x_single = None 91 | x_pair = None 92 | i, j = None, None 93 | meta = None 94 | for
line_idx, line in enumerate(f): 95 | if line.startswith(META_PREFIX): 96 | meta = json.loads(line[len(META_PREFIX):].strip()) 97 | 98 | elif line.startswith("#"): 99 | 100 | buf.seek(0) 101 | 102 | if x_single is not None: 103 | x_pair[i, j, :, :] = np.loadtxt(buf) 104 | x_pair[j, i, :, :] = x_pair[i, j, :, :].T 105 | 106 | else: 107 | x_single = np.loadtxt(buf) 108 | 109 | ncol = x_single.shape[0] 110 | x_pair = np.zeros((ncol, ncol, 21, 21)) 111 | 112 | buf = StringIO() 113 | 114 | m = re_identifier.match(line) 115 | if m: 116 | i, j = int(m.group(1)), int(m.group(2)) 117 | 118 | else: 119 | raise Exception("Line {0} starts with # but doesn't match regex!".format(line_idx + 1)) 120 | 121 | else: 122 | buf.write(line) 123 | 124 | if x_single is not None and buf.tell(): 125 | buf.seek(0) 126 | x_pair[i, j, :, :] = np.loadtxt(buf) 127 | x_pair[j, i, :, :] = x_pair[i, j, :, :].T 128 | 129 | return CCMRaw(ncol, x_single, x_pair, meta) 130 | 131 | 132 | def parse(f): 133 | r = None 134 | for parser in _PARSERS: 135 | try: 136 | if hasattr(f, 'seek'): 137 | f.seek(0) 138 | 139 | r = parser(f) 140 | except Exception as e: 141 | pass 142 | 143 | if r is not None: 144 | continue 145 | return r 146 | 147 | 148 | @stream_or_file('wb') 149 | def write_msgpack(f, data): 150 | 151 | x_single = data.x_single.reshape(data.ncol * 20).tolist() 152 | x_pair = {} 153 | for i in range(data.ncol): 154 | for j in range(i + 1, data.ncol): 155 | x_pair["{0}/{1}".format(i, j)] = { 156 | "i": i, 157 | "j": j, 158 | "x": data.x_pair[i, j, :, :].reshape(21 * 21).tolist() 159 | } 160 | 161 | out = { 162 | "format": "ccm-1", 163 | "ncol": data.ncol, 164 | "x_single": x_single, 165 | "x_pair": x_pair 166 | } 167 | 168 | if data.meta: 169 | out['meta'] = data.meta 170 | 171 | f.write(msgpack.packb(out)) 172 | 173 | 174 | @stream_or_file('wb') 175 | def write_oldraw(f, data): 176 | np.savetxt(f, data.x_single, delimiter="\t") 177 | 178 | for i in range(data.ncol): 179 | for j in range(i + 1, data.ncol): 180 | f.write("# {0} {1}\n".format(i, j).encode("utf-8")) 181 | np.savetxt(f, data.x_pair[i, j], delimiter="\t") 182 | 183 | if data.meta: 184 | f.write(META_PREFIX.encode("utf-8") + json.dumps(data.meta).encode("utf-8") + b"\n") 185 | 186 | if __name__ == '__main__': 187 | # data = parse_oldraw("data/test.raw") 188 | data = parse_msgpack("data/test.braw") 189 | 190 | print("data:") 191 | print(data) 192 | 193 | print("data.x_single.shape:") 194 | print(data.x_single.shape) 195 | 196 | print("data.x_single:") 197 | print(data.x_single) 198 | 199 | print("data.x_pair.shape:") 200 | print(data.x_pair.shape) 201 | 202 | print("data.x_pair[3, 4]:") 203 | print(data.x_pair[3, 4]) 204 | -------------------------------------------------------------------------------- /ccmpred/raw/convert_msgpack.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Convert a msgpack potential file to flatfile format""" 3 | 4 | import ccmraw as cr 5 | 6 | 7 | def main(): 8 | import argparse 9 | parser = argparse.ArgumentParser(description=__doc__) 10 | parser.add_argument("in_msgpack", help="Input raw file in new msgpack format") 11 | parser.add_argument("out_flat", help="Output raw file in old flatfile format") 12 | 13 | opt = parser.parse_args() 14 | 15 | cr.write_oldraw(opt.out_flat, cr.parse_msgpack(opt.in_msgpack)) 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- 
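Both converter scripts are thin wrappers around the ccmraw readers and writers shown above. A hypothetical round-trip through the two formats, with placeholder file names:

```python
# Hypothetical round-trip using the functions defined in ccmraw.py above;
# the file names are placeholders.
import ccmpred.raw.ccmraw as cr

data = cr.parse_msgpack("example.braw.gz")     # .gz is handled by stream_or_file
print(data)                                    # <CCMRaw ncol=...>
print(data.x_single.shape, data.x_pair.shape)  # (ncol, 20), (ncol, ncol, 21, 21)

cr.write_oldraw("example.raw", data)           # old flatfile format
roundtrip = cr.parse_oldraw("example.raw")
assert roundtrip.ncol == data.ncol
```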
/ccmpred/raw/convert_raw.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Convert a raw potential file to msgpack format""" 3 | 4 | import ccmraw as cr 5 | 6 | 7 | def main(): 8 | import argparse 9 | parser = argparse.ArgumentParser(description=__doc__) 10 | parser.add_argument("in_raw", help="Input raw file in old raw format") 11 | parser.add_argument("out_msgpack", help="Output raw file in new msgpack format") 12 | 13 | opt = parser.parse_args() 14 | 15 | cr.write_msgpack(opt.out_msgpack, cr.parse_oldraw(opt.in_raw)) 16 | 17 | 18 | if __name__ == '__main__': 19 | main() 20 | -------------------------------------------------------------------------------- /ccmpred/regularization.py: -------------------------------------------------------------------------------- 1 | # coding: utf-8 2 | import numpy as np 3 | 4 | 5 | class L2(object): 6 | """L2 regularization on single and pair emission potentials""" 7 | 8 | def __init__(self, lambda_single, lambda_pair_factor, scaling, center_x_single): 9 | self.lambda_single = lambda_single 10 | self.lambda_pair = lambda_pair_factor * scaling 11 | self.lambda_pair_factor = lambda_pair_factor 12 | self.center_x_single = center_x_single 13 | 14 | 15 | def __call__(self, x_single, x_pair): 16 | x_ofs = x_single - self.center_x_single[:, :x_single.shape[1]] 17 | 18 | # log likelihood uses: 19 | # - lambda_single * sum_i sum_a (v_ia - center_x_single)^2 20 | # - lambda_pair / 2 * sum_i sum_j sum_a sum_b (w_ijab)^2 21 | # w_ijab == w_jiba --> potentials are symmetric 22 | 23 | # gradient computes as: 24 | # - 2 * lambda_single * (v_ia - center_x_single) 25 | # - lambda_pair * w_ijab 26 | 27 | g_single = 2 * self.lambda_single * x_ofs 28 | g_pair = self.lambda_pair * x_pair 29 | 30 | fx_reg = self.lambda_single * np.sum(x_ofs * x_ofs) + 0.5 * self.lambda_pair * np.sum(x_pair * x_pair) 31 | 32 | return fx_reg, g_single, g_pair 33 | 34 | def __repr__(self): 35 | return "L₂ regularization (λsingle={0} λpairfactor={1} λpair={2})".format(self.lambda_single, self.lambda_pair_factor, self.lambda_pair) 36 | 37 | -------------------------------------------------------------------------------- /ccmpred/sampling/__init__.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | import ccmpred.objfun.cd.cext 3 | import ccmpred.weighting 4 | import ccmpred.trees 5 | import ccmpred.sampling.cext 6 | import numpy as np 7 | import sys 8 | from ccmpred.io.alignment import AMINO_ACIDS 9 | from ccmpred.weighting.cext import count_ids, calculate_weights_simple 10 | import ccmpred.counts 11 | from ccmpred.pseudocounts import PseudoCounts 12 | 13 | def gibbs_sample_sequences(x, msa_sampled, gibbs_steps): 14 | return ccmpred.objfun.cd.cext.gibbs_sample_sequences(msa_sampled, x, gibbs_steps) 15 | 16 | def all_parents(tree): 17 | parents = {} 18 | for clade in tree.find_clades(order='level'): 19 | for child in clade: 20 | parents[child] = clade 21 | return parents 22 | 23 | def mutate_along_phylogeny(tree, seq0, mutation_rate, x): 24 | 25 | ncol = len(seq0) 26 | 27 | #assign ancestor sequence to root 28 | tree.clade.seq = seq0 29 | 30 | #get all parents 31 | parents = all_parents(tree) 32 | 33 | #iterate breadth first over tree and mutate sequences 34 | for clade in tree.find_clades(order="level"): 35 | if clade.name != "root": 36 | #print("parent name: {0} parent seq: {1}".format( parents[clade], parents[clade].seq)) 37 | nmut = int(clade.branch_length * mutation_rate 
* ncol) 38 | clade.seq = ccmpred.sampling.cext.mutate_sequence(parents[clade].seq, x, nmut, ncol) 39 | #print("clade name: {0} clade seq: {1}".format(clade.name, clade.seq)) 40 | #print("---") 41 | 42 | #get sequences of all leaf nodes 43 | msa_sampled = np.array([clade.seq for clade in tree.get_terminals()]) 44 | 45 | return msa_sampled 46 | 47 | def generate_mcmc_sample(x, ncol, msa, size=10000, burn_in=500, sample_type="original"): 48 | 49 | print("Start sampling {0} sequences according to the model, starting from '{1}' sequences, using burn-in={2}.".format( 50 | size, sample_type, burn_in)) 51 | sys.stdout.flush() 52 | 53 | if msa is not None: 54 | N = msa.shape[0] 55 | else: 56 | N = 1000 57 | 58 | # sample at most 1000 sequences per iteration 59 | sample_size_per_it = np.min([N, 1000]) 60 | 61 | #repeat sampling until 'size' sequences have been obtained 62 | repeat = int(np.ceil(size / sample_size_per_it)) 63 | samples = np.empty([repeat * sample_size_per_it, ncol], dtype="uint8") 64 | for i in range(repeat): 65 | 66 | if sample_type == "aln": 67 | 68 | #random selection of sequences (rows, not columns) from the original MSA 69 | sample_seq_id = np.random.choice(N, sample_size_per_it, replace=False) 70 | msa_sampled = msa[sample_seq_id] 71 | 72 | elif sample_type == "random": 73 | 74 | #generate random sequences of length L 75 | msa_sampled = np.ascontiguousarray( 76 | [np.random.choice(20, ncol, replace=True) for _ in range(sample_size_per_it)], dtype="uint8") 77 | 78 | elif sample_type == "random-gapped": 79 | 80 | #generate random sequences of length L 81 | msa_sampled = np.ascontiguousarray( 82 | [np.random.choice(20, ncol, replace=True) for _ in range(sample_size_per_it)], dtype="uint8") 83 | 84 | #find gaps in randomly selected original sequences 85 | sample_seq_id = np.random.choice(N, sample_size_per_it, replace=False) 86 | msa_sampled_orig = msa[sample_seq_id] 87 | gap_indices = np.where(msa_sampled_orig == AMINO_ACIDS.index('-')) 88 | 89 | #assign gap states to random sequences 90 | msa_sampled[gap_indices] = AMINO_ACIDS.index('-') 91 | 92 | 93 | # burn in phase to move away from initial sequences 94 | msa_sampled = ccmpred.sampling.gibbs_sample_sequences(x, msa_sampled, gibbs_steps=burn_in) 95 | 96 | # add newly sampled sequences 97 | samples[i * sample_size_per_it: (i + 1) * sample_size_per_it] = msa_sampled 98 | print("sampled alignment has {0} sequences...".format((i + 1) * sample_size_per_it)) 99 | sys.stdout.flush() 100 | 101 | #compute neff of the full set of sampled sequences 102 | neff = ccmpred.weighting.get_HHsuite_neff(samples) 103 | 104 | print("Sampled alignment has Neff {0:.6g}".format(neff)) 105 | 106 | return samples, neff 107 | 108 | def sample_with_mutation_rate(tree, nseq, seq0, x, mutation_rate): 109 | """ 110 | Sample an alignment by mutating the ancestral sequence seq0 along the branches of a phylogenetic tree. 111 | Parameters 112 | ---------- 113 | tree: Tree object 114 | nseq: int, number of leaf sequences to return 115 | seq0: 2dim array, ancestral sequence(s) at the root 116 | x: 1dim array, linear MRF potentials used for mutating sequences 117 | mutation_rate: float 118 | 119 | Returns 120 | ------- 121 | msa_sampled: 2dim array of sampled sequences; neff: float, effective number of sequences 122 | """ 123 | 124 | branch_lengths = tree.branch_lengths 125 | 126 | #how many substitutions per sequence will be performed 127 | nmut = [0]*(len(branch_lengths)-2) 128 | for i, bl in enumerate(branch_lengths[2:]): 129 | nmut[i] = bl * mutation_rate * seq0.shape[1] 130 | print("avg number of amino acid substitutions (parent -> child): {0}".format( 131 | np.round(np.mean(nmut), decimals=0))) 132 | 133 | 134 | # get the average number of amino acid substitutions from root --> leaf 135 | if tree.type == "binary" or tree.type == "star": 136 | number_splits = 1 137 | if tree.type == "binary": 138 | number_splits = np.log2(nseq)
139 | depth_per_clade = 1.0 / np.ceil(number_splits) 140 | print("avg number of amino acid substitutions (root -> leaf): {0}".format( 141 | np.round(1 / depth_per_clade * np.mean(nmut), decimals=0))) 142 | 143 | 144 | # sample sequences according to tree topology 145 | msa_sampled = mutate_along_phylogeny(tree.tree, seq0[0], mutation_rate, x) 146 | 147 | # randomly choose nseq sequences from sampled msa 148 | if msa_sampled.shape[0] > nseq: 149 | msa_sampled = msa_sampled[sorted(np.random.choice(msa_sampled.shape[0], size=nseq, replace=False))] 150 | 151 | # compute neff of sampled sequences 152 | neff = ccmpred.weighting.get_HHsuite_neff(msa_sampled) 153 | 154 | print("\nAlignment with {0} sequences was sampled with mutation rate {1} and has Neff {2:.6g}".format( 155 | nseq, mutation_rate, neff)) 156 | 157 | return msa_sampled, neff 158 | 159 | def sample_to_neff_increasingly(tree, nseq, target_neff, ncol, x, gibbs_steps, root_seq=None): 160 | 161 | branch_lengths = tree.branch_lengths 162 | 163 | print("\nSample alignment of {0} protein sequences with target Neff~{1:.6g}...\n".format( 164 | nseq, target_neff)) 165 | 166 | # keep adjusting the mutation rate until Neff is within 1% of the target 167 | mutation_rate = 1.0 168 | neff = -np.inf 169 | msa_sampled = np.empty((nseq, ncol), dtype="uint8") 170 | while np.abs(target_neff - neff) > 1e-2 * target_neff: 171 | 172 | if root_seq is None: 173 | # sample a new start sequence 174 | seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, gibbs_steps) 175 | print("Ancestor sequence (polyA --> {0} gibbs steps --> seq0):\n{1}".format(gibbs_steps, "".join( 176 | [AMINO_ACIDS[c] for c in seq0[0]]))) 177 | else: 178 | # start from the specified sequence 179 | seq0 = root_seq 180 | 181 | # how many substitutions per branch (parent -> child) will be performed 182 | nmut = [0] * (len(branch_lengths) - 2) 183 | for i, bl in enumerate(branch_lengths[2:]): 184 | nmut[i] = bl * mutation_rate * ncol 185 | print("avg number of amino acid substitutions (parent -> child): {0}".format( 186 | np.round(np.mean(nmut), decimals=0))) 187 | 188 | # get the average number of amino acid substitutions from root --> leaf 189 | if tree.type == "binary" or tree.type == "star": 190 | number_splits = 1 191 | if tree.type == "binary": 192 | number_splits = np.log2(nseq) 193 | depth_per_clade = 1.0 / np.ceil(number_splits) 194 | print("avg number of amino acid substitutions (root -> leaf): {0}".format( 195 | np.round(1 / depth_per_clade * np.mean(nmut), decimals=0))) 196 | 197 | # sample sequences according to tree topology 198 | msa_sampled = mutate_along_phylogeny(tree.tree, seq0[0], mutation_rate, x) 199 | 200 | # randomly choose nseq sequences from sampled msa 201 | if msa_sampled.shape[0] > nseq: 202 | msa_sampled = msa_sampled[sorted(np.random.choice(msa_sampled.shape[0], size=nseq, replace=False))] 203 | 204 | # compute neff of sampled sequences 205 | neff = ccmpred.weighting.get_HHsuite_neff(msa_sampled) 206 | print("Alignment with {0} sequences was sampled with mutation rate {1:.3g} and has Neff {2:.5g} (ΔNeff [%] = {3:.5g})\n".format( 207 | nseq, mutation_rate, neff, (target_neff - neff)/target_neff*100)) 208 | sys.stdout.flush() 209 | 210 | # increase mutation rate 211 | if target_neff > neff: 212 | mutation_rate += np.random.random() 213 | 214 | # decrease mutation rate 215 | if target_neff < neff: 216 | mutation_rate -= np.random.random() 217 | 218 | #reset mutation rate if it becomes negative or unreasonably large 219 | if mutation_rate < 0 or mutation_rate > 100: 220 | mutation_rate = 1 221 | 222 | return msa_sampled,
neff 223 | -------------------------------------------------------------------------------- /ccmpred/sampling/cext/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numpy.ctypeslib as npct 3 | import ctypes 4 | import os.path 5 | 6 | array_1d_float = npct.ndpointer(dtype=np.dtype('float64'), ndim=1, flags='CONTIGUOUS') 7 | array_1d_uint8 = npct.ndpointer(dtype=np.dtype('uint8'), ndim=1, flags='CONTIGUOUS') 8 | array_1d_uint32 = npct.ndpointer(dtype=np.dtype('uint32'), ndim=1, flags='CONTIGUOUS') 9 | array_1d_uint64 = npct.ndpointer(dtype=np.dtype('uint64'), ndim=1, flags='CONTIGUOUS') 10 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS') 11 | 12 | libtreecd = npct.load_library('libtreecd', os.path.join(os.path.dirname(__file__), '_build')) 13 | 14 | libtreecd.mutate_along_tree.restype = None 15 | libtreecd.mutate_along_tree.argtypes = [ 16 | array_1d_uint64, # uint64_t *n_children, 17 | array_1d_float, # flt *branch_lengths, 18 | array_1d_float, # flt *x, 19 | ctypes.c_uint64, # uint64_t nvert, 20 | array_2d_char, # uint8_t *seqs, 21 | ctypes.c_uint32, # uint32_t ncol, 22 | ctypes.c_double # flt mutation_rate 23 | ] 24 | 25 | libtreecd.mutate_sequence.restype = None 26 | libtreecd.mutate_sequence.argtypes = [ 27 | array_1d_uint8, # uint8_t *seq, 28 | array_1d_float, # flt *x, 29 | ctypes.c_uint16, # uint16_t nmut, 30 | ctypes.c_uint32 # uint32_t ncol, 31 | ] 32 | 33 | def mutate_sequence(parent_seq, x, nmut, ncol): 34 | seq = parent_seq.copy() 35 | libtreecd.mutate_sequence(seq, x, nmut, ncol) 36 | 37 | return seq 38 | 39 | def mutate_along_tree(msa_sampled, n_children, branch_lengths, x, nvert, seq0, mutation_rate): 40 | msa_sampled[:, :] = 0 41 | msa_sampled[:seq0.shape[0], :] = seq0 42 | libtreecd.mutate_along_tree(n_children, branch_lengths, x, nvert, msa_sampled, seq0.shape[1], mutation_rate) 43 | 44 | return msa_sampled 45 | -------------------------------------------------------------------------------- /ccmpred/sampling/cext/treecd.c: -------------------------------------------------------------------------------- 1 | #include <stdlib.h> 2 | #include <stdio.h> 3 | #include <string.h> 4 | #include <stdint.h> 5 | #include <omp.h> 6 | 7 | #include "treecd.h" 8 | #include "cd.h" 9 | #include "cdutil.h" 10 | 11 | /** 12 | * Mutate a sequence seq nmut times according to potentials in x 13 | * 14 | * @param[inout] seq The sequence to work on 15 | * @param[in] x The single and pairwise emission potentials for computing conditional probabilities 16 | * @param[in] nmut The number of substitutions to perform 17 | * @param[in] ncol The length of the sequence 18 | */ 19 | void mutate_sequence(uint8_t *seq, flt *x, uint16_t nmut, int ncol) { 20 | 21 | flt* pcond = fl_malloc(N_ALPHA); 22 | int i; 23 | 24 | for(int m = 0; m < nmut; m++) { 25 | 26 | //ignore gap positions for sampling 27 | do { 28 | i = pick_random_uniform(ncol - 1); 29 | } while(seq[i] == GAP); 30 | 31 | compute_conditional_probs(i, pcond, x, seq, ncol); 32 | 33 | seq[i] = pick_random_weighted(pcond, N_ALPHA - 1); 34 | // sample gaps as well (need to adjust E2 and X1 in cd.h but single potentials only have dim 20: 35 | // compute_conditional_probs_gaps(i, pcond, x, seq, ncol); 36 | // seq[i] = pick_random_weighted(pcond, N_ALPHA); 37 | } 38 | 39 | fl_free(pcond); 40 | } 41 | 42 | /** 43 | * Perform nmut full Gibbs sweeps over sequence seq according to potentials in x 44 | * 45 | * @param[inout] seq The sequence to work on 46 | * @param[in] x The single and pairwise emission potentials 
for computing conditional probabilities 47 | * @param[in] nmut The number of Gibbs sweeps to perform 48 | * @param[in] ncol The length of the sequence 49 | */ 50 | void mutate_sequence_gibbs(uint8_t *seq, flt *x, uint16_t nmut, int ncol) { 51 | 52 | flt* pcond = fl_malloc(N_ALPHA); 53 | 54 | //int array with the position indices 0..ncol-1 55 | unsigned int sequence_position_vector[ncol]; 56 | for (unsigned int p=0; p < ncol; p++) sequence_position_vector[p] = p; 57 | 58 | for(int m = 0; m < nmut; m++) { 59 | 60 | shuffle(sequence_position_vector, ncol); 61 | 62 | for (int i=0; i < ncol; i++){ 63 | compute_conditional_probs(sequence_position_vector[i], pcond, x, seq, ncol); 64 | seq[sequence_position_vector[i]] = pick_random_weighted(pcond, N_ALPHA - 1); 65 | } 66 | } 67 | 68 | fl_free(pcond); 69 | } 70 | 71 | 72 | 73 | void swap(void **a, void **b) { 74 | void *temp = *a; 75 | *a = *b; 76 | *b = temp; 77 | } 78 | 79 | /** 80 | * Mutate an ancestral sequence along a tree 81 | * 82 | * @param[in] n_children At index i, stores the number of child vertices for vertex i 83 | * @param[in] branch_lengths At index i, stores the length of the branch leading to vertex i 84 | * @param[in] x The single and pairwise emission potentials for computing conditional probabilities 85 | * @param[in] nvert The total number of vertices in the tree 86 | * @note The total number of leaves is not a parameter; it is computed internally from n_children 87 | * @param[inout] seqs The ancestral sequence at the beginning of the array. After this method returns, stores all leaf sequences. 88 | * @param[in] ncol The length of individual sequences 89 | * @param[in] mutation_rate Coefficient to tune the number of substitutions to make per evolutionary time unit 90 | */ 91 | void mutate_along_tree( 92 | uint64_t *n_children, 93 | flt *branch_lengths, 94 | flt *x, 95 | uint64_t nvert, 96 | uint8_t *seqs, 97 | uint32_t ncol, 98 | flt mutation_rate 99 | ) { 100 | 101 | seed_rng(); 102 | 103 | // Preprocessing: Count number of leaves and compute index of first children 104 | uint64_t *first_child_index = (uint64_t *)malloc(sizeof(uint64_t) * nvert); 105 | uint64_t fci = 1; 106 | uint64_t nleaves = 0; 107 | 108 | for(uint64_t i = 0; i < nvert; i++) { 109 | if(n_children[i] == 0) { nleaves++; } 110 | first_child_index[i] = fci; 111 | fci += n_children[i]; 112 | } 113 | 114 | // nc: number of children for vertex at index i of current BFS level 115 | uint64_t *nc_in = (uint64_t *)malloc(sizeof(uint64_t) * nleaves); 116 | uint64_t *nc_out = (uint64_t *)malloc(sizeof(uint64_t) * nleaves); 117 | 118 | // ni: index of vertex at index i of current BFS level 119 | uint64_t *ni_in = (uint64_t *)malloc(sizeof(uint64_t) * nleaves); 120 | uint64_t *ni_out = (uint64_t *)malloc(sizeof(uint64_t) * nleaves); 121 | 122 | // seqs: sequences at index i of current BFS level 123 | uint8_t *seqs_in = (uint8_t *)malloc(sizeof(uint8_t) * ncol * nleaves); 124 | uint8_t *seqs_out = (uint8_t *)malloc(sizeof(uint8_t) * ncol * nleaves); 125 | 126 | // bl: branch length at index i of current BFS level 127 | flt *bl = fl_malloc(nleaves); 128 | 129 | // fill initial level with root nodes and ancestral sequences 130 | uint64_t nn = n_children[0]; 131 | memcpy(nc_in, &n_children[1], sizeof(uint64_t) * nn); 132 | memcpy(seqs_in, seqs, sizeof(uint8_t) * ncol * nn); 133 | for(uint64_t i = 0; i < nn; i++) { 134 | ni_in[i] = i + 1; 135 | } 136 | 137 | // BFS over tree levels 138 | while(nn < nleaves) { 139 | 140 | // Phase 1: grow nc_out, ni_out, bl and seqs_out 141 | uint64_t pos = 0; 142 | for(uint64_t i = 0; i < nn; 
i++) { 143 | 144 | uint64_t nci = nc_in[i]; 145 | 146 | if(nci == 0) { 147 | // we have no children - copy the leaf node to keep it in next level 148 | nc_out[pos] = nc_in[i]; 149 | ni_out[pos] = ni_in[i]; 150 | bl[pos] = 0; 151 | memcpy(&seqs_out[pos * ncol], &seqs_in[i * ncol], sizeof(uint8_t) * ncol); 152 | 153 | pos++; 154 | 155 | } else { 156 | 157 | // we have one or more children - grow out arrays to make room for descendants 158 | // mutation to descendant sequences will be handled in phase 2 159 | for(uint64_t j = 0; j < nci; j++) { 160 | uint64_t inew = first_child_index[ni_in[i]] + j; 161 | 162 | nc_out[pos] = n_children[inew]; 163 | ni_out[pos] = inew; 164 | bl[pos] = branch_lengths[inew]; 165 | memcpy(&seqs_out[pos * ncol], &seqs_in[i * ncol], sizeof(uint8_t) * ncol); 166 | 167 | pos++; 168 | } 169 | 170 | } 171 | 172 | } 173 | 174 | // Phase 2: evolve seq according to bl 175 | #pragma omp parallel for 176 | for(uint64_t i = 0; i < pos; i++) { 177 | int nmut = bl[i] * mutation_rate * ncol; 178 | //printf("nn = %i, i = %i, nmut = %i, bl[i]=%f\n", nn, i, nmut, bl[i]); 179 | mutate_sequence(&seqs_out[i * ncol], x, nmut, ncol); 180 | } 181 | 182 | nn = pos; 183 | //printf("nn = %i.\n", nn); 184 | swap((void **)&nc_in, (void **)&nc_out); 185 | swap((void **)&ni_in, (void **)&ni_out); 186 | swap((void **)&seqs_in, (void **)&seqs_out); 187 | 188 | } 189 | 190 | memcpy(seqs, seqs_in, sizeof(uint8_t) * ncol * nleaves); 191 | 192 | free(first_child_index); 193 | free(nc_in); 194 | free(nc_out); 195 | free(ni_in); 196 | free(ni_out); 197 | free(seqs_in); 198 | free(seqs_out); 199 | fl_free(bl); 200 | } 201 | -------------------------------------------------------------------------------- /ccmpred/sampling/cext/treecd.h: -------------------------------------------------------------------------------- 1 | #ifndef TREECD_H 2 | #define TREECD_H 3 | 4 | #include <stdint.h> 5 | #include "cd.h" 6 | 7 | void mutate_along_tree( 8 | uint64_t *n_children, 9 | flt *branch_lengths, 10 | flt *x, 11 | uint64_t nvert, 12 | uint8_t *seqs, 13 | uint32_t ncol, 14 | flt mutation_rate 15 | ); 16 | 17 | #endif 18 | -------------------------------------------------------------------------------- /ccmpred/sanity_check.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def check_single_potentials(x_single, verbose=0, epsilon=1e-5): 5 | 6 | nr_pot_sum_not_zero = np.where(np.abs(x_single.sum(1)) > epsilon)[0] 7 | if len(nr_pot_sum_not_zero) > 0: 8 | print("Warning: {0} single potentials do not sum to 0 (eps={1}).".format(len(nr_pot_sum_not_zero), epsilon)) 9 | 10 | if verbose: 11 | for ind in nr_pot_sum_not_zero[:10]: 12 | print("e.g.: i={0:<2} has sum_a(v_ia)={1}".format(ind+1, np.sum(x_single[ind]))) 13 | 14 | return 0 15 | 16 | return 1 17 | 18 | def check_pair_potentials(x_pair, verbose=0, epsilon=1e-5): 19 | 20 | indices_triu = np.triu_indices(x_pair.shape[0], 1) 21 | nr_pot_sum_not_zero = np.where(np.abs(x_pair.sum(2).sum(2)[indices_triu]) > epsilon)[0] 22 | if len(nr_pot_sum_not_zero): 23 | print("Warning: {0}/{1} pair potentials do not sum to 0 (eps={2}).".format(len(nr_pot_sum_not_zero), len(indices_triu[0]), epsilon)) 24 | 25 | if verbose: 26 | for ind in nr_pot_sum_not_zero[:10]: 27 | i = indices_triu[0][ind] 28 | j = indices_triu[1][ind] 29 | print("e.g.: i={0:<2} j={1:<2} has sum_ab(w_ijab)={2}".format(i+1, j+1, np.sum(x_pair[i,j]))) 30 | 31 | return 0 32 | 33 | return 1 34 | 35 | 36 | def centering_potentials(x_single, x_pair): 37 | """ 38 | 
39 | Enforce the gauge choice by shifting the potentials into the zero-sum gauge, i.e. sum_a v_i(a) = 0 and sum_ab w_ij(a,b) = 0 over the 20 amino acid states. 40 | Constant shifts cancel in the model's normalization, so the modeled probabilities remain unchanged. 41 | :param x_single: single potentials v; only the 20 amino acid states are re-centered 42 | :param x_pair: pair potentials w; only the 20 x 20 amino acid states are re-centered 43 | :return: the re-centered x_single and x_pair 44 | """ 45 | 46 | means = np.mean(np.mean(x_pair[:, :, :20, :20], axis=2), axis=2) 47 | x_pair[:, :, :20, :20] -= means[:, :, np.newaxis, np.newaxis] 48 | 49 | means = np.mean(x_single[:, :20], axis=1) 50 | x_single[:, :20] -= means[:, np.newaxis] 51 | 52 | 53 | return x_single, x_pair 54 | -------------------------------------------------------------------------------- /ccmpred/scripts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ccmpred/scripts/__init__.py -------------------------------------------------------------------------------- /ccmpred/scripts/convert.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import ccmpred.io.alignment 5 | 6 | 7 | def main(): 8 | 9 | parser = argparse.ArgumentParser(description='Convert Fasta to Psicov format and vice versa.') 10 | 11 | 12 | parser.add_argument("infile", type=str, help="MSA input file") 13 | parser.add_argument("outfile", type=str, help="MSA output file") 14 | parser.add_argument("--msa-in-format", dest="msa_in_format", default="psicov", 15 | help="Input alignment format [default: '%(default)s']") 16 | parser.add_argument("--msa-out-format", dest="msa_out_format", default="fasta", 17 | help="Output alignment format [default: '%(default)s']") 18 | 19 | args = parser.parse_args() 20 | 21 | 22 | msa = ccmpred.io.alignment.read_msa(args.infile, args.msa_in_format) 23 | 24 | with open(args.outfile, "w") as f: 25 | ccmpred.io.alignment.write_msa(f, msa, 26 | ids=["seq_"+str(i) for i in range(msa.shape[0])], 27 | format=args.msa_out_format 28 | ) 29 | 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /ccmpred/scripts/plot_ccmpred.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | """ 4 | Usage: plot_ccmpred.py 5 | 6 | Various plotting functionalities 7 | """ 8 | 9 | import os 10 | import sys 11 | import argparse 12 | import ccmpred.raw as raw 13 | import ccmpred.weighting 14 | from ccmpred.pseudocounts import PseudoCounts 15 | import ccmpred.io as io 16 | import ccmpred.io.contactmatrix as io_cm 17 | import ccmpred.plotting as plot 18 | import ccmpred.gaps as gaps 19 | import pandas as pd 20 | import numpy as np 21 | 22 | def parse_args(): 23 | 24 | parser = argparse.ArgumentParser(description='Various Plotting Functionalities.') 25 | subparsers = parser.add_subparsers(title="Plot types", dest="plot_types") 26 | 27 | 28 | #parent parsers for common flags 29 | parent_parser_out = argparse.ArgumentParser(add_help=False) 30 | requiredNamed = parent_parser_out.add_argument_group('Required Output Arguments') 31 | requiredNamed.add_argument('-o', '--plot-file', dest='plot_file', type=str, required=True, 32 | help='Path to plot file') 33 | 34 | 35 | 36 | #parser for contact map 37 | parser_cmap = subparsers.add_parser('cmap', parents=[parent_parser_out], 38 | help="Specify options for plotting a Contact Map") 39 | 40 | cmap_in_req = parser_cmap.add_argument_group('Required Inputs') 41 | mutual_excl = cmap_in_req.add_mutually_exclusive_group(required=True) 42 | mutual_excl.add_argument('--mat-file', dest='mat_file', type=str, help='path to mat file') 43 | 
mutual_excl.add_argument('--braw-file', dest='braw_file', type=str, help='path to binary raw coupling file') 44 | 45 | cmap_in = parser_cmap.add_argument_group('Optional Inputs') 46 | cmap_in.add_argument('-p', '--pdb-file', dest='pdb_file', type=str, default=None, 47 | help='PDB file (renumbered starting from 1) for distance matrix.') 48 | cmap_in.add_argument('-a', '--alignment-file', dest='aln_file', type=str, default=None, 49 | help='path to alignment file') 50 | cmap_in.add_argument("--aln-format", dest="aln_format", default="fasta", 51 | help="File format for MSAs [default: \"%(default)s\"]") 52 | 53 | cmap_options = parser_cmap.add_argument_group('Further Settings for Contact Map Plot') 54 | cmap_options.add_argument('--seq-sep', dest='seqsep', type=int, default=6, help='Minimal sequence separation') 55 | cmap_options.add_argument('--contact-threshold', dest='contact_threshold', type=int, default=8, 56 | help='Contact definition as maximal C_beta distance between residue pairs.') 57 | cmap_options.add_argument("--apc", action="store_true", default=False, help="Apply average product correction") 58 | cmap_options.add_argument("--entropy-correction", dest='entropy_correction', action="store_true", default=False, help="Apply entropy correction") 59 | 60 | 61 | # parser for aa distribution plot 62 | parser_aa_dist = subparsers.add_parser('aa-dist', parents=[parent_parser_out], 63 | help="Specify options for plotting the amino acid distribution in an alignment") 64 | 65 | aadist_in_req = parser_aa_dist.add_argument_group('Required Inputs') 66 | aadist_in_req.add_argument('-a', '--alignment-file', dest='aln_file', type=str, required=True, 67 | help='path to alignment file') 68 | aadist_in_req.add_argument("--aln-format", dest="aln_format", default="psicov", 69 | help="File format for MSAs [default: \"%(default)s\"]") 70 | 71 | 72 | # parser for alignment statistics plot 73 | parser_aln_stats = subparsers.add_parser( 74 | 'aln-stats', parents=[parent_parser_out], 75 | help="Specify options for plotting the alignment statistics of two alignments against each other") 76 | 77 | alnstats_in_req = parser_aln_stats.add_argument_group('Required Inputs') 78 | alnstats_in_req.add_argument('-a', '--alignment-file', dest='aln_file', type=str, required=True, 79 | help='path to alignment file') 80 | alnstats_in_req.add_argument("--aln-format", dest="aln_format", default="psicov", 81 | help="File format for MSAs [default: \"%(default)s\"]") 82 | alnstats_in_req.add_argument('-s', '--sampled-alignment-file', dest='sample_aln_file', type=str, required=True, 83 | help='path to sampled alignment') 84 | 85 | parser_aln_stats.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int, 86 | help="Ignore alignment positions with > MAX_GAP_POS percent gaps. 
" 87 | "[default: %(default)s == no removal of positions]") 88 | 89 | 90 | 91 | args = parser.parse_args() 92 | 93 | if args.plot_types == "cmap": 94 | if args.entropy_correction and args.alignment_file is None: 95 | print("Alignment file (-a) must be specified to compute entropy correction!") 96 | 97 | if args.entropy_correction and args.braw_file is None: 98 | print("Binary Raw file (-b) must be specified to compute entropy correction!") 99 | 100 | return args 101 | 102 | def plot_contact_map(alignment_file, aln_format, braw_file, mat_file, pdb_file, plot_file, 103 | entropy_correction, apc, seqsep, contact_threshold): 104 | 105 | pseudocounts = None 106 | mat = None 107 | gaps_percentage_plot = None 108 | protein = None 109 | 110 | 111 | if entropy_correction and (alignment_file is None or braw_file is None): 112 | print("Entropy correction requires specification of alignment file and binary raw couplign file!") 113 | sys.exit(1) 114 | 115 | if alignment_file is not None: 116 | protein = os.path.basename(alignment_file).split(".")[0] 117 | alignment = io.read_msa(alignment_file, aln_format) 118 | 119 | # compute sequence weights 120 | weights = ccmpred.weighting.weights_simple(alignment, 0.8) 121 | 122 | # compute frequencies 123 | pseudocounts = PseudoCounts(alignment, weights) 124 | pseudocounts.calculate_frequencies( 125 | 'uniform_pseudocounts', 1, 1, remove_gaps=False 126 | ) 127 | 128 | gaps_percentage_plot = plot.plot_percentage_gaps_per_position(pseudocounts.counts[0], plot_file=None) 129 | 130 | if braw_file is not None: 131 | 132 | protein = os.path.basename(braw_file).split(".")[0] 133 | 134 | braw = raw.parse_msgpack(braw_file) 135 | meta_info = braw.meta 136 | 137 | # compute frobenius score from couplings 138 | mat = io_cm.frobenius_score(braw.x_pair) 139 | 140 | if entropy_correction: 141 | 142 | scaling_factor_eta, mat = io_cm.compute_local_correction( 143 | pseudocounts.freqs[0], 144 | braw.x_pair, 145 | meta_info['workflow'][0]['msafile']['neff'], 146 | meta_info['workflow'][0]['regularization']['lambda_pair'], 147 | mat, 148 | entropy=True 149 | ) 150 | elif apc: 151 | mat = io_cm.apc(mat) 152 | 153 | if mat_file is not None: 154 | 155 | protein = os.path.basename(mat_file).split(".")[0] 156 | 157 | mat, meta_info = io_cm.read_matrix(mat_file) 158 | 159 | if apc: 160 | mat = io_cm.apc(mat) 161 | 162 | L = len(mat) 163 | indices_upper_tri_i, indices_upper_tri_j = np.triu_indices(L, seqsep) 164 | 165 | plot_matrix = pd.DataFrame() 166 | plot_matrix['residue_i'] = indices_upper_tri_i + 1 167 | plot_matrix['residue_j'] = indices_upper_tri_j + 1 168 | plot_matrix['confidence'] = mat[indices_upper_tri_i, indices_upper_tri_j] 169 | 170 | if pdb_file is not None: 171 | # compute distance map from pdb file 172 | observed_distances = io.distance_map(pdb_file, L) 173 | plot_matrix['distance'] = observed_distances[indices_upper_tri_i, indices_upper_tri_j] 174 | plot_matrix['contact'] = ((plot_matrix.distance < contact_threshold) * 1).tolist() 175 | 176 | 177 | plot_title="Contact Map for protein {0}".format(protein) 178 | 179 | # Plot Contact Map 180 | plot.plot_contact_map_someScore_plotly(plot_matrix, plot_title, seqsep, gaps_percentage_plot, plot_file) 181 | 182 | def plot_aminoacid_distribution(alignment_file, aln_format, plot_file): 183 | 184 | protein = os.path.basename(alignment_file).split(".")[0] 185 | 186 | #read alignment 187 | try: 188 | alignment = io.read_msa(alignment_file, aln_format) 189 | except OSError as e: 190 | print("Problems reading alignment file {0}: 
{1}!".format(alignment_file, e)) 191 | sys.exit(0) 192 | 193 | N = alignment.shape[0] 194 | L = alignment.shape[1] 195 | diversity = np.sqrt(N) / L 196 | 197 | # compute sequence weights 198 | weights = ccmpred.weighting.weights_simple(alignment, 0.8, False) 199 | 200 | # compute frequencies 201 | pseudocounts = PseudoCounts(alignment, weights) 202 | pseudocounts.calculate_frequencies( 203 | 'uniform_pseudocounts', 1, 1, remove_gaps=False 204 | ) 205 | 206 | #plot 207 | plot.plot_alignment( 208 | pseudocounts.counts[0], 209 | "Amino Acid Distribution in Alignment for {0} (N={1}, L={2}, diversity={3})".format( 210 | protein, N, L, np.round(diversity, decimals=3)), plot_file 211 | ) 212 | 213 | def plot_alignment_statistics(alignment_file, sample_aln_file, aln_format, max_gap_pos, plot_file): 214 | 215 | 216 | #read alignment 217 | try: 218 | alignment = io.read_msa(alignment_file, aln_format) 219 | except OSError as e: 220 | print("Problems reading alignment file {0}: {1}!".format(alignment_file, e)) 221 | sys.exit(0) 222 | 223 | try: 224 | sampled_alignment = io.read_msa(sample_aln_file, aln_format) 225 | except OSError as e: 226 | print("Problems reading alignment file {0}: {1}!".format(sample_aln_file, e)) 227 | sys.exit(0) 228 | 229 | 230 | #Remove positions with > MAX_GAP_POS % gaps 231 | if max_gap_pos < 100: 232 | alignment, gapped_positions = gaps.remove_gapped_positions(alignment, max_gap_pos) 233 | non_gapped_positions = [i for i in range(sampled_alignment.shape[1]) if i not in gapped_positions] 234 | sampled_alignment = np.ascontiguousarray(sampled_alignment[:, non_gapped_positions]) 235 | 236 | # compute sequence weights for observed sequences 237 | weights = ccmpred.weighting.weights_simple(alignment, 0.8) 238 | 239 | # compute observed amino acid frequencies 240 | pseudocounts = PseudoCounts(alignment, weights) 241 | pseudocounts.calculate_frequencies( 242 | 'uniform_pseudocounts', 1, 1, remove_gaps=False 243 | ) 244 | single_freq_observed, pairwise_freq_observed = pseudocounts.freqs 245 | 246 | 247 | # compute sequence weights for sampled sequences (usually all sampled sequences obtain weight = 1 ) 248 | weights_sampled = ccmpred.weighting.weights_simple(sampled_alignment, 0.8) 249 | 250 | # compute sampled amino acid frequencies 251 | pseudocounts = PseudoCounts(sampled_alignment, weights_sampled) 252 | pseudocounts.calculate_frequencies( 253 | 'uniform_pseudocounts', 1, 1, remove_gaps=False 254 | ) 255 | single_freq_sampled, pairwise_freq_sampled = pseudocounts.freqs 256 | 257 | # degap the frequencies (ignore gap frequencies) 258 | single_freq_observed = pseudocounts.degap(single_freq_observed, False) 259 | single_freq_sampled = pseudocounts.degap(single_freq_sampled, False) 260 | pairwise_freq_observed = pseudocounts.degap(pairwise_freq_observed, False) 261 | pairwise_freq_sampled = pseudocounts.degap(pairwise_freq_sampled, False) 262 | 263 | # plot 264 | plot.plot_empirical_vs_model_statistics( 265 | single_freq_observed, single_freq_sampled, 266 | pairwise_freq_observed, pairwise_freq_sampled, 267 | plot_file) 268 | 269 | 270 | 271 | def main(): 272 | 273 | args = parse_args() 274 | 275 | if args.plot_types == "cmap": 276 | print("Write plot for contact map to {0}".format(args.plot_file)) 277 | 278 | plot_contact_map( 279 | args.aln_file, args.aln_format, 280 | args.braw_file, args.mat_file, args.pdb_file, args.plot_file, 281 | args.entropy_correction, args.apc, 282 | args.seqsep, args.contact_threshold 283 | ) 284 | 285 | if args.plot_types == "aa-dist": 286 | 
print("Write plot for amino acid distribution in alignment to {0}".format(args.plot_file)) 287 | 288 | plot_aminoacid_distribution( 289 | args.aln_file, args.aln_format, 290 | args.plot_file 291 | ) 292 | 293 | if args.plot_types == "aln-stats": 294 | print("Write plot for alignment statistics to {0}".format(args.plot_file)) 295 | 296 | plot_alignment_statistics( 297 | args.aln_file, args.sample_aln_file, args.aln_format, args.max_gap_pos, 298 | args.plot_file 299 | ) 300 | 301 | 302 | 303 | if __name__ == '__main__': 304 | main() -------------------------------------------------------------------------------- /ccmpred/scripts/replace_gaps.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import optparse 3 | 4 | import ccmpred.io.alignment 5 | import ccmpred.gaps 6 | 7 | 8 | def main(): 9 | parser = optparse.OptionParser(usage="%prog [options] msa_in_file msa_out_file") 10 | 11 | parser.add_option("--with-consensus", dest="replacement", action="store_const", const=ccmpred.gaps.remove_gaps_consensus, help="Remove gaps with consensus characters") 12 | parser.add_option("--with-col-freqs", dest="replacement", action="store_const", const=ccmpred.gaps.remove_gaps_col_freqs, help="Remove gaps with column character frequencies") 13 | parser.add_option("--msa-in-format", dest="msa_in_format", default="psicov", help="Input alignment format [default: '%default']") 14 | 15 | opt, args = parser.parse_args() 16 | 17 | if not opt.replacement: 18 | parser.error("Need to specify one of the --with-* options!") 19 | 20 | if not len(args) == 2: 21 | parser.error("Need exactly two positional arguments!") 22 | 23 | msa_in_file, msa_out_file = args 24 | 25 | msa = ccmpred.io.alignment.read_msa(msa_in_file, opt.msa_in_format) 26 | msa_nogaps = opt.replacement(msa) 27 | 28 | with open(msa_out_file, "w") as f: 29 | ccmpred.io.alignment.write_msa_psicov(f, msa_nogaps) 30 | 31 | 32 | if __name__ == '__main__': 33 | main() 34 | -------------------------------------------------------------------------------- /ccmpred/scripts/run_ccmgen.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | import argparse 4 | import os 5 | from ccmpred import CCMpred 6 | import ccmpred.logo 7 | import ccmpred.io.alignment 8 | import ccmpred.raw 9 | import ccmpred.weighting 10 | import ccmpred.sampling 11 | import ccmpred.gaps 12 | import ccmpred.trees 13 | import ccmpred.parameter_handling 14 | import numpy as np 15 | 16 | EPILOG = """ 17 | Generate a realistic synthetic multiple sequence alignment (MSA) of protein sequences 18 | complying constraints from a Markov Random Field model. 19 | 20 | In a first step, a Markov Random Field Model will have to be learned from a source protein MSA using 21 | e.g. CCMpredPy with the -b command. 22 | This learned model can then be passed to the CCMgen call as RAWFILE. 
23 | 24 | """ 25 | 26 | 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(epilog=EPILOG) 30 | 31 | parser.add_argument("rawfile", help="Raw coupling potential file as generated by the CCMpredPy -b option") 32 | parser.add_argument("outalnfile", help="Output alignment file for sampled sequences.") 33 | 34 | 35 | 36 | grp_opt = parser.add_argument_group("General Options") 37 | grp_opt.add_argument("--alnfile", dest="alnfile", metavar="ALN_FILE", type=str, 38 | help="Reference alignment file that is used to specify NEFF and NSEQ") 39 | grp_opt.add_argument("--num-sequences", dest="nseq", type=int, default=2**10, 40 | help="Set the number of sequences to generate to NSEQ " 41 | "(does not apply when a newick file is specified) [default: %(default)s]") 42 | grp_opt.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int, 43 | help="Ignore alignment positions with > MAX_GAP_POS percent gaps when reading ALN_FILE. " 44 | "[default: %(default)s == no removal of positions]") 45 | grp_opt.add_argument("--max-gap-seq", dest="max_gap_seq", default=100, type=int, 46 | help="Remove sequences with > MAX_GAP_SEQ percent gaps when reading ALN_FILE. " 47 | "[default: %(default)s == no removal of sequences]") 48 | grp_opt.add_argument("--aln-format", dest="aln_format", type=str, default="fasta", 49 | help="Specify format for alignment files [default: %(default)s]") 50 | grp_opt.add_argument("--num-threads", dest="num_threads", type=int, default=1, 51 | help="Specify the number of threads. [default: %(default)s]") 52 | 53 | 54 | 55 | 56 | grp_tr = parser.add_argument_group("Phylogenetic Tree Options") 57 | grp_tr_me = grp_tr.add_mutually_exclusive_group() 58 | grp_tr_me.add_argument("--tree-newick", dest="tree_file", type=str, 59 | help="Load tree from newick-formatted file") 60 | grp_tr_me.add_argument("--tree-binary", dest="tree_source", action="store_const", const="binary", 61 | help="Generate a binary tree with equally distributed branch lengths.") 62 | grp_tr_me.add_argument("--tree-star", dest="tree_source", action="store_const", const="star", 63 | help="Generate a tree where all leaf nodes are direct descendants of the root node.") 64 | grp_tr_me.add_argument("--mcmc-sampling", dest="mcmc", action="store_true", default=False, 65 | help="Generate MCMC sample without following tree topology.") 66 | 67 | 68 | 69 | grp_tr_opt = parser.add_argument_group("Tree Sampling Options") 70 | grp_tr_opt_me = grp_tr_opt.add_mutually_exclusive_group() 71 | grp_tr_opt_me.add_argument("--mutation-rate", dest="mutation_rate", type=float, 72 | help="Specify constant mutation rate") 73 | grp_tr_opt_me.add_argument("--mutation-rate-neff", dest="neff", nargs='?', type=float, const=0, default=None, 74 | help="Set the mutation rate to approximately hit a target number of effective sequences, Neff " 75 | "(calculated as in the HHsuite package (https://github.com/soedinglab/hh-suite)). " 76 | "Without specifying NEFF, the value will be determined from ALN_FILE.") 77 | 78 | 79 | grp_s0 = parser.add_argument_group("Initial Sequence Options") 80 | grp_s0_me = grp_s0.add_mutually_exclusive_group() 81 | grp_s0_me.add_argument("--seq0-mrf", dest="seq0_mrf", metavar="NMUT", type=int, default=10, 82 | help="Start out with an all-alanine sequence and use the MRF model to evolve " 83 | "the sequence for NMUT Gibbs steps. 
[default: NMUT=%(default)s]") 84 | grp_s0_me.add_argument("--seq0-file", dest="seq0_file", metavar="SEQ_FILE", type=str, 85 | help="Specify ancestor sequence in SEQ_FILE.") 86 | 87 | 88 | 89 | grp_mcmc = parser.add_argument_group("MCMC Sampling Options") 90 | grp_mcmc_me = grp_mcmc.add_mutually_exclusive_group() 91 | grp_mcmc_me.add_argument("--mcmc-sample-random-gapped", dest="mcmc_sample_type", action="store_const", const="random-gapped", 92 | default="random-gapped", 93 | help="Sample sequences starting from random sequences. Gap structure of randomly selected " 94 | "input sequences will be copied. Gap positions are not sampled. " 95 | "(requires --alnfile option) [default]") 96 | grp_mcmc_me.add_argument("--mcmc-sample-random", dest="mcmc_sample_type", action="store_const", const="random", 97 | help="Sample sequences starting from random sequences comprised of 20 amino acids.") 98 | grp_mcmc_me.add_argument("--mcmc-sample-aln", dest="mcmc_sample_type", action="store_const", const="aln", 99 | help="Sample sequences starting from original sequences (requires setting ALN_FILE).") 100 | grp_mcmc.add_argument("--mcmc-burn-in", dest="mcmc_burn_in", type=int, default=500, 101 | help="Number of Gibbs sampling steps to evolve a Markov chain before a sample is obtained.") 102 | 103 | 104 | 105 | 106 | opt = parser.parse_args() 107 | 108 | if not opt.mcmc: 109 | 110 | if not opt.tree_source and not opt.tree_file: 111 | parser.error("Need one of the --tree-* options or --mcmc-sampling!") 112 | 113 | if not opt.mutation_rate and opt.neff is None: 114 | parser.error("Need one of the --mutation-rate* options!") 115 | 116 | if not opt.mutation_rate and opt.neff == 0 and not opt.alnfile: 117 | parser.error("Need to specify Neff with either --mutation-rate-neff or via an alignment file (--alnfile)!") 118 | 119 | 120 | if opt.mcmc: 121 | if (opt.mcmc_sample_type == "aln" or opt.mcmc_sample_type == "random-gapped") and not opt.alnfile: 122 | parser.error("Need an alignment file (--alnfile) for use with " 123 | "--mcmc-sample-aln and --mcmc-sample-random-gapped!") 124 | 125 | return opt 126 | 127 | 128 | 129 | def main(): 130 | 131 | def read_root_sequence(seq0_file, aln_format, print_sequence=True): 132 | seq0 = ccmpred.io.alignment.read_msa(seq0_file, aln_format) 133 | seq_N, seq_L = seq0.shape 134 | 135 | if seq_L != ncol: 136 | print("Length of the ancestor sequence must match the dimension of the MRF model!") 137 | exit(1) 138 | 139 | if seq_N > 1: 140 | print("The fasta file contains more than one sequence; 
only the first sequence will be used as the root sequence.") 141 | print_sequence = True 142 | 143 | if print_sequence: 144 | print("Ancestor sequence:\n{0}".format("".join([ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]]))) 145 | 146 | return seq0 147 | 148 | # read command line options 149 | opt = parse_args() 150 | 151 | ccmpred.logo.logo(what_for="ccmgen") 152 | 153 | # set OMP environment variable for number of threads 154 | os.environ['OMP_NUM_THREADS'] = str(opt.num_threads) 155 | print("Using {0} threads for OMP parallelization.".format(os.environ["OMP_NUM_THREADS"])) 156 | 157 | # instantiate CCMpred 158 | ccm = CCMpred() 159 | 160 | # specify possible file paths 161 | ccm.set_initraw_file(opt.rawfile) 162 | 163 | 164 | # read alignment and remove gapped sequences and positions 165 | if opt.alnfile: 166 | ccm.set_alignment_file(opt.alnfile) 167 | ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq) 168 | 169 | 170 | #read potentials from binary raw file (possibly remove positions with many gaps) 171 | ccm.intialise_potentials() 172 | x = ccmpred.parameter_handling.structured_to_linear(ccm.x_single, ccm.x_pair, nogapstate=True, padding=False) 173 | ncol = ccm.x_single.shape[0] 174 | 175 | 176 | #if MCMC sampling is specified 177 | if opt.mcmc: 178 | msa_sampled, neff = ccmpred.sampling.generate_mcmc_sample( 179 | x, ncol, ccm.msa, size=opt.nseq, burn_in=opt.mcmc_burn_in, sample_type=opt.mcmc_sample_type) 180 | 181 | ids = ["seq_{0}".format(i) for i in range(msa_sampled.shape[0])] 182 | 183 | else: 184 | 185 | tree = ccmpred.trees.CCMTree() 186 | 187 | #prepare tree topology 188 | if opt.tree_file: 189 | 190 | tree.load_tree(opt.tree_file) 191 | nseq = tree.n_leaves 192 | 193 | else: 194 | 195 | if opt.alnfile: 196 | nseq = ccm.N 197 | else: 198 | nseq = opt.nseq 199 | tree.specify_tree(nseq, opt.tree_source) 200 | 201 | 202 | ids = tree.ids 203 | 204 | 205 | # sample alignment with specified mutation rate 206 | if opt.mutation_rate: 207 | seq0 = np.zeros((1, ncol), dtype="uint8") 208 | 209 | if opt.seq0_mrf and not opt.seq0_file: 210 | seq0 = ccmpred.trees.get_seq0_mrf(x, ncol, opt.seq0_mrf) 211 | print("Ancestor sequence (polyA --> {0} gibbs steps --> seq0):\n{1}".format( 212 | opt.seq0_mrf, "".join([ccmpred.io.alignment.AMINO_ACIDS[c] for c in seq0[0]]))) 213 | 214 | elif opt.seq0_file: 215 | seq0 = read_root_sequence(opt.seq0_file, opt.aln_format) 216 | 217 | msa_sampled, neff = ccmpred.sampling.sample_with_mutation_rate( 218 | tree, nseq, seq0, x, opt.mutation_rate) 219 | 220 | # sample an alignment that has approximately the specified Neff 221 | else: 222 | seq0 = None 223 | 224 | if opt.neff: 225 | # an explicitly given target Neff takes precedence over ALN_FILE 226 | neff = opt.neff 227 | else: 228 | neff = ccm.neff_entropy 229 | if opt.seq0_file: 230 | seq0 = read_root_sequence(opt.seq0_file, opt.aln_format) 231 | 232 | msa_sampled, neff = ccmpred.sampling.sample_to_neff_increasingly( 233 | tree, nseq, neff, ncol, x, opt.seq0_mrf, root_seq=seq0) 234 | 235 | 236 | 237 | # if gappy positions have been removed 238 | # insert columns with gaps at that position 239 | if ccm.max_gap_pos < 100: 240 | msa_sampled = ccmpred.gaps.backinsert_gapped_positions_aln( 241 | msa_sampled, ccm.gapped_positions 242 | ) 243 | 244 | 245 | print("\nWriting sampled alignment to {0}".format(opt.outalnfile)) 246 | with open(opt.outalnfile, "w") as f: 247 | descs = ["synthetic sequence generated with CCMgen" for _ in range(msa_sampled.shape[0])] 248 | ccmpred.io.alignment.write_msa(f, msa_sampled, ids, is_indices=True, format=opt.aln_format, descriptions=descs) 249 | 250
 | 251 | if __name__ == '__main__': 252 | main() 253 | -------------------------------------------------------------------------------- /ccmpred/scripts/run_ccmpred.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import argparse 3 | import sys 4 | import os 5 | 6 | from ccmpred import CCMpred 7 | import ccmpred.logo 8 | 9 | 10 | EPILOG = """ 11 | CCMpredPy is a fast Python implementation of a contact prediction method based on correlated mutations. 12 | From an alignment given as alnfile, it infers the parameters of a Potts model with 21 states (20 amino acids and gap). 13 | Either pseudo-likelihood maximization or contrastive divergence can be chosen as the inference algorithm. 14 | The L2 norms of the pairwise coupling potentials will be written to the output matfile. 15 | """ 16 | 17 | 18 | class StoreConstParametersAction(argparse.Action): 19 | def __init__(self, option_strings, dest, nargs=None, arg_default=None, default=None, **kwargs): 20 | self.arg_default = arg_default 21 | default = (default, arg_default) 22 | super(StoreConstParametersAction, self).__init__(option_strings, dest, nargs=nargs, default=default, **kwargs) 23 | 24 | def __call__(self, parser, namespace, values, option_string=None): 25 | if values is None or values == self.const: 26 | values = self.arg_default 27 | setattr(namespace, self.dest, (self.const, values)) 28 | 29 | 30 | def parse_args(): 31 | parser = argparse.ArgumentParser(description="Recover direct couplings from a multiple sequence alignment", epilog=EPILOG) 32 | 33 | parser.add_argument("alnfile", help="Input alignment file to use") 34 | 35 | grp_general = parser.add_argument_group("General Options") 36 | grp_general.add_argument("--num-threads", dest="num_threads", type=int, default=1, 37 | help="Specify the number of threads. [default: %(default)s]") 38 | grp_general.add_argument("--aln-format", dest="aln_format", default="fasta", 39 | help="File format for MSAs [default: \"%(default)s\"]") 40 | grp_general.add_argument("--no-logo", dest="logo", default=True, action="store_false", 41 | help="Disable showing the CCMpred logo [default: %(default)s]") 42 | 43 | 44 | grp_out = parser.add_argument_group("Output Options") 45 | grp_out.add_argument("-m", "--mat-file", dest="matfile", type=str, 46 | help="Write contact score matrix to file. [default: %(default)s]") 47 | grp_out.add_argument("-b", "--write-binary-raw", dest="out_binary_raw_file", type=str, 48 | help="Write single and pairwise potentials as binary MessagePack file. [default: %(default)s]") 49 | grp_out.add_argument("--plot-opt-progress", dest="plot_opt_progress", type=str, 50 | help="Continuously plot optimization progress as an interactive HTML. [default: %(default)s]") 51 | 52 | 53 | grp_in = parser.add_argument_group("Optional Input Options") 54 | grp_in.add_argument("-i", "--init-from-raw", dest="initrawfile", default=None, 55 | help="Init single and pair potentials from a binary raw file") 56 | grp_in.add_argument("--do-not-optimize", dest="optimize", action="store_false", default=True, 57 | help="Do not optimize potentials. 
Requires providing initial model parameters with -i.") 58 | 59 | 60 | 61 | grp_pll = parser.add_argument_group("Pseudo-Likelihood Options") 62 | grp_pll.add_argument("--ofn-pll", dest="objfun", action="store_const", const="pll", default="pll", 63 | help="Use pseudo-log-likelihood (pLL) [default]") 64 | grp_pll.add_argument("--lbfgs-ftol", dest="ftol", default=1e-4, type=float, 65 | help="LBFGS: convergence criterion ftol. [default: %(default)s]") 66 | grp_pll.add_argument("--lbfgs-max-linesearch", dest="max_linesearch", default=5, type=int, 67 | help="LBFGS: maximum number of linesearch steps. [default: %(default)s]") 68 | grp_pll.add_argument("--lbfgs-maxcor", dest="max_cor", default=5, type=int, 69 | help="LBFGS: maximum number of corrections for memory. [default: %(default)s]") 70 | 71 | 72 | grp_cd = parser.add_argument_group("(Persistent) Contrastive Divergence Options") 73 | grp_cd.add_argument("--ofn-cd", dest="objfun", action="store_const", const="cd", help="Use contrastive divergence (CD)") 74 | grp_cd.add_argument("--nr-markov-chains", dest="nr_seq_sample", type=int, default=500, help="Number of parallel " 75 | "Markov chains used for sampling at each iteration. [default: %(default)s]") 76 | grp_cd.add_argument("--gibbs_steps", dest="cd_gibbs_steps", type=int, default=1, 77 | help="Number of Gibbs steps used to evolve each Markov chain " 78 | "in each iteration of the optimization. [default: %(default)s]") 79 | grp_cd.add_argument("--persistent", dest="cd_persistent", action="store_true", default=False, help="Switch on " 80 | "persistent CD once the learning rate is small enough (< alpha_0 / 10). [default: %(default)s]") 81 | grp_cd.add_argument("--alpha0", dest="alpha0", default=1e-3, type=float, 82 | help="GD: Set initial learning rate. [default: %(default)s]") 83 | grp_cd.add_argument("--no-decay", dest="decay", action="store_false", default=True, 84 | help="GD: Do not use a decaying learning rate. " 85 | "Decay starts when the convergence criterion falls below the value of DECAY_START. " 86 | "[default: %(default)s]") 87 | grp_cd.add_argument("--decay-start", dest="decay_start", default=1e-1, type=float, 88 | help="GD: Start decay when the convergence criterion < DECAY_START. " 89 | "[default: %(default)s]") 90 | grp_cd.add_argument("--decay-rate", dest="decay_rate", default=5e-6, type=float, 91 | help="GD: Set rate of decay for learning rate. [default: %(default)s]") 92 | grp_cd.add_argument("--decay-type", dest="decay_type", default="sig", type=str, 93 | choices=['sig', 'sqrt', 'exp', 'lin'], 94 | help="GD: Decay type. [default: %(default)s]") 95 | 96 | 97 | grp_con = parser.add_argument_group("Convergence Settings") 98 | grp_con.add_argument("--maxit", dest="maxit", default=2000, type=int, 99 | help="Stop when MAXIT number of iterations is reached. [default: %(default)s]") 100 | grp_con.add_argument("--early-stopping", dest="early_stopping", default=False, action="store_true", 101 | help="Apply convergence criteria instead of only maxit. [default: %(default)s]") 102 | grp_con.add_argument("--epsilon", dest="epsilon", default=1e-5, type=float, 103 | help="Converged when relative change in f (or xnorm) in last CONVERGENCE_PREV iterations " 104 | "< EPSILON. [default: %(default)s]") 105 | grp_con.add_argument("--convergence_prev", dest="convergence_prev", default=5, type=int, 106 | help="Set CONVERGENCE_PREV parameter. 
[default: %(default)s]") 107 | 108 | 109 | 110 | grp_constraints = parser.add_argument_group("Use with Contraints (non-contacts will obtain zero couplings)") 111 | grp_constraints.add_argument("--pdb-file", dest="pdbfile", help="Input PDB file") 112 | grp_constraints.add_argument("--contact-threshold", dest="contact_threshold", type=int, default=8, 113 | help="Definition of residue pairs forming a contact wrt distance of their Cbeta atoms in " 114 | "angstrom. [default: %(default)s]") 115 | 116 | 117 | 118 | grp_corr = parser.add_argument_group("Corrections applied to Contact Score") 119 | grp_corr.add_argument("--apc", dest="apc_file", type=str, default=None, 120 | help="Path to contact matrix file corrected with average product correction (APC). " 121 | "[default: %(default)s] ") 122 | grp_corr.add_argument("--entropy-correction", dest="entropy_correction_file", type=str, default=None, 123 | help="Path to contact matrix file corrected with entropy correction. " 124 | "[default: %(default)s]") 125 | 126 | 127 | grp_wt = parser.add_argument_group("Sequence Weighting") 128 | grp_wt.add_argument("--wt-simple", dest="weight", action="store_const", const="simple", 129 | default="simple", help='Use simple weighting [default: %(default)s]') 130 | grp_wt.add_argument("--wt-uniform", dest="weight", action="store_const", const="uniform", 131 | help='Use uniform weighting') 132 | grp_wt.add_argument("--wt-cutoff", dest="wt_cutoff", type=float, default=0.8, 133 | help="Sequence identity threshold. [default: %(default)s]") 134 | 135 | 136 | grp_rg = parser.add_argument_group("Regularization") 137 | grp_rg.add_argument("--reg-lambda-single", dest="lambda_single", type=float, default=10, 138 | help='Regularization coefficient for single potentials (L2 regularization) ' 139 | '[default: %(default)s]') 140 | grp_rg.add_argument("--reg-lambda-pair-factor", dest="lambda_pair_factor", type=float, default=0.2, 141 | help='Regularization parameter for pair potentials (L2 regularization with ' 142 | 'lambda_pair = lambda_pair-factor * scaling) [default: %(default)s]') 143 | grp_rg.add_argument("--v-center", dest="single_prior", action="store_const", const="v-center", default="v-center", 144 | help="Use mu=v* in Gaussian prior for single emissions and initialization. [default: %(default)s]") 145 | grp_rg.add_argument("--v-zero", dest="single_prior", action="store_const", const="v-zero", 146 | help="Use mu=0 in Gaussian prior for single emissions and initialisation.") 147 | 148 | 149 | 150 | grp_gap = parser.add_argument_group("Gap Treatment") 151 | grp_gap.add_argument("--max-gap-pos", dest="max_gap_pos", default=100, type=int, 152 | help="Ignore alignment positions with > MAX_GAP_POS percent gaps. " 153 | "[default: %(default)s == no removal of positions]") 154 | grp_gap.add_argument("--max-gap-seq", dest="max_gap_seq", default=100, type=int, 155 | help="Remove sequences with > MAX_GAP_SEQ percent gaps. 
[default: %(default)s == no removal of sequences]") 156 | 157 | 158 | grp_pc = parser.add_argument_group("Pseudocounts") 159 | grp_pc.add_argument("--pc-uniform", dest="pseudocounts", action="store_const", const="uniform_pseudocounts", 160 | default="uniform_pseudocounts", 161 | help="Use uniform pseudocounts, e.g. 1/21 [default: %(default)s]") 162 | grp_pc.add_argument("--pc-submat", dest="pseudocounts", action="store_const", 163 | const="substitution_matrix_pseudocounts", help="Use substitution matrix pseudocounts") 164 | grp_pc.add_argument("--pc-constant", dest="pseudocounts", action="store_const", 165 | const="constant_pseudocounts", help="Use constant pseudocounts") 166 | grp_pc.add_argument("--pc-none", dest="pseudocounts", action="store_const", 167 | const="no_pseudocounts", help="Use no pseudocounts") 168 | grp_pc.add_argument("--pc-single-count", dest="pseudocount_single", default=1, type=int, 169 | help="Specify the number of pseudocounts for single amino acid frequencies [default: %(default)s]") 170 | grp_pc.add_argument("--pc-pair-count", dest="pseudocount_pair", default=1, type=int, 171 | help="Specify the number of pseudocounts for pairwise frequencies [default: %(default)s]") 172 | 173 | 174 | scores = parser.add_argument_group("Alternative Coevolution Scores") 175 | scores.add_argument("--compute-omes", dest="omes", action="store_true", default=False, 176 | help="Compute OMES scores as in Kass and Horovitz 2002. [default: %(default)s]") 177 | scores.add_argument("--omes-fodoraldrich", dest="omes_fodoraldrich", action="store_true", default=False, 178 | help="OMES option: according to Fodor & Aldrich 2004. [default: %(default)s]") 179 | scores.add_argument("--compute-mi", dest="mi", action="store_true", default=False, 180 | help="Compute mutual information (MI). [default: %(default)s]") 181 | scores.add_argument("--mi-normalized", dest="mi_normalized", action="store_true", default=False, 182 | help="MI option: Compute normalized MI according to Martin et al. 2005. [default: %(default)s]") 183 | scores.add_argument("--mi-pseudocounts", dest="mi_pseudocounts", action="store_true", default=False, 184 | help="MI option: Compute MI with pseudocounts. 
[default: %(default)s]") 185 | 186 | 187 | 188 | args = parser.parse_args() 189 | 190 | 191 | if not args.optimize and not args.initrawfile: 192 | parser.error("--do-not-optimize is only supported when -i (--init-from-raw) is specified!") 193 | 194 | return args 195 | 196 | 197 | def main(): 198 | 199 | # read command line options 200 | opt = parse_args() 201 | 202 | # print logo 203 | if opt.logo: 204 | ccmpred.logo.logo() 205 | 206 | # set OMP environment variable for number of threads 207 | os.environ['OMP_NUM_THREADS'] = str(opt.num_threads) 208 | print("Using {0} threads for OMP parallelization.".format(os.environ["OMP_NUM_THREADS"])) 209 | 210 | # instantiate CCMpred 211 | ccm = CCMpred() 212 | 213 | # specify possible file paths 214 | ccm.set_alignment_file(opt.alnfile) 215 | ccm.set_matfile(opt.matfile) 216 | ccm.set_pdb_file(opt.pdbfile) 217 | ccm.set_initraw_file(opt.initrawfile) 218 | 219 | # read alignment and possible remove gapped sequences and positions 220 | ccm.read_alignment(opt.aln_format, opt.max_gap_pos, opt.max_gap_seq) 221 | 222 | # compute sequence weights (in order to reduce sampling bias) 223 | ccm.compute_sequence_weights(opt.weight, opt.wt_cutoff) 224 | 225 | # compute amino acid counts and frequencies adding pseudo counts for non-observed amino acids 226 | ccm.compute_frequencies(opt.pseudocounts, opt.pseudocount_single, opt.pseudocount_pair) 227 | 228 | # read pdb file if CCMpred is setup as a constrained run 229 | if opt.pdbfile: 230 | ccm.read_pdb(opt.contact_threshold) 231 | 232 | 233 | # if alternative scores are specified: compute these and exit 234 | if opt.omes: 235 | ccm.compute_omes(opt.omes_fodoraldrich) 236 | ccm.write_matrix() 237 | sys.exit(0) 238 | 239 | if opt.mi: 240 | ccm.compute_mutual_info(opt.mi_normalized, opt.mi_pseudocounts) 241 | ccm.write_matrix() 242 | sys.exit(0) 243 | 244 | # setup L2 regularization 245 | ccm.specify_regularization(opt.lambda_single, opt.lambda_pair_factor, 246 | reg_type="L2", scaling="L", single_prior=opt.single_prior) 247 | 248 | # intialise single and pair potentials either: 249 | # - according to regularization priors 250 | # - from initrawfile (accounting for removal of many gapped positions, if applicable) 251 | ccm.intialise_potentials() 252 | 253 | 254 | # optimize objective function (pLL or CD/PCD) with optimization algorithm (LBFGS, CG, GD or ADAM) 255 | if opt.optimize: 256 | 257 | #initialize log object 258 | ccm.initiate_logging(opt.plot_opt_progress) 259 | 260 | #minimize objective function with corresponding optimization algorithm 261 | ccm.minimize(opt) 262 | else: 263 | print("\nDo not optimize but use model parameters provided by {0}\n".format(opt.initrawfile)) 264 | 265 | 266 | 267 | 268 | ### Post Processing 269 | 270 | 271 | #specify meta data, and write (corrected) contact matrices to files 272 | if opt.matfile: 273 | 274 | # Compute contact score (frobenius norm) by recentering potentials 275 | # TODO: other scores can be added ... 276 | ccm.compute_contact_matrix(recenter_potentials=True, frob=True) 277 | 278 | # compute corrected contact maps (removing entropy/phylogenetic biases) 279 | # TODO: other corrections can be added ... 
280 | ccm.compute_correction( 281 | apc_file=opt.apc_file, 282 | entropy_correction_file=opt.entropy_correction_file 283 | ) 284 | 285 | ccm.write_matrix() 286 | 287 | # write model parameters in binary format 288 | if opt.out_binary_raw_file: 289 | ccm.write_binary_raw(opt.out_binary_raw_file) 290 | 291 | 292 | exitcode = 0 293 | if opt.optimize: 294 | if ccm.algret['code'] < 0: 295 | exitcode = -ccm.algret['code'] 296 | sys.exit(exitcode) 297 | 298 | 299 | 300 | if __name__ == '__main__': 301 | main() 302 | -------------------------------------------------------------------------------- /ccmpred/substitution_matrices.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def matrianglify(data, size=20): 5 | """ Build a symmetric size x size matrix from row-major lower-triangle data """ 6 | mat = np.zeros((size, size)) 7 | mat[np.tril_indices(size)] = data 8 | mat.T[np.tril_indices(size)] = data  # mirror the lower triangle into the upper triangle 9 | 10 | return mat 11 | 12 | 13 | # BLOSUM62 matrix as P(a, b) 14 | # from HH-suite hhmatrices.cpp @ https://github.com/soedinglab/hh-suite/blob/5015f267a051f9b7ebffcec04b7f0596ad01bbbd/src/hhmatrices.cpp 15 | BLOSUM62 = matrianglify([ 16 | # A R N D C Q E G H I L K M F P S T W Y V 17 | 0.0215, 18 | 0.0023, 0.0178, 19 | 0.0019, 0.0020, 0.0141, 20 | 0.0022, 0.0016, 0.0037, 0.0213, 21 | 0.0016, 0.0004, 0.0004, 0.0004, 0.0119, 22 | 0.0019, 0.0025, 0.0015, 0.0016, 0.0003, 0.0073, 23 | 0.0030, 0.0027, 0.0022, 0.0049, 0.0004, 0.0035, 0.0161, 24 | 0.0058, 0.0017, 0.0029, 0.0025, 0.0008, 0.0014, 0.0019, 0.0378, 25 | 0.0011, 0.0012, 0.0014, 0.0010, 0.0002, 0.0010, 0.0014, 0.0010, 0.0093, 26 | 0.0032, 0.0012, 0.0010, 0.0012, 0.0011, 0.0009, 0.0012, 0.0014, 0.0006, 0.0184, 27 | 0.0044, 0.0024, 0.0014, 0.0015, 0.0016, 0.0016, 0.0020, 0.0021, 0.0010, 0.0114, 0.0371, 28 | 0.0033, 0.0062, 0.0024, 0.0024, 0.0005, 0.0031, 0.0041, 0.0025, 0.0012, 0.0016, 0.0025, 0.0161, 29 | 0.0013, 0.0008, 0.0005, 0.0005, 0.0004, 0.0007, 0.0007, 0.0007, 0.0004, 0.0025, 0.0049, 0.0009, 0.0040, 30 | 0.0016, 0.0009, 0.0008, 0.0008, 0.0005, 0.0005, 0.0009, 0.0012, 0.0008, 0.0030, 0.0054, 0.0009, 0.0012, 0.0183, 31 | 0.0022, 0.0010, 0.0009, 0.0012, 0.0004, 0.0008, 0.0014, 0.0014, 0.0005, 0.0010, 0.0014, 0.0016, 0.0004, 0.0005, 0.0191, 32 | 0.0063, 0.0023, 0.0031, 0.0028, 0.0010, 0.0019, 0.0030, 0.0038, 0.0011, 0.0017, 0.0024, 0.0031, 0.0009, 0.0012, 0.0017, 0.0126, 33 | 0.0037, 0.0018, 0.0022, 0.0019, 0.0009, 0.0014, 0.0020, 0.0022, 0.0007, 0.0027, 0.0033, 0.0023, 0.0010, 0.0012, 0.0014, 0.0047, 0.0125, 34 | 0.0004, 0.0003, 0.0002, 0.0002, 0.0001, 0.0002, 0.0003, 0.0004, 0.0002, 0.0004, 0.0007, 0.0003, 0.0002, 0.0008, 0.0001, 0.0003, 0.0003, 0.0065, 35 | 0.0013, 0.0009, 0.0007, 0.0006, 0.0003, 0.0007, 0.0009, 0.0008, 0.0015, 0.0014, 0.0022, 0.0010, 0.0006, 0.0042, 0.0005, 0.0010, 0.0009, 0.0009, 0.0102, 36 | 0.0051, 0.0016, 0.0012, 0.0013, 0.0014, 0.0012, 0.0017, 0.0018, 0.0006, 0.0120, 0.0095, 0.0019, 0.0023, 0.0026, 0.0012, 0.0024, 0.0036, 0.0004, 0.0015, 0.0196 37 | ]) 38 | -------------------------------------------------------------------------------- /ccmpred/trees.py: -------------------------------------------------------------------------------- 1 | import Bio.Phylo.BaseTree 2 | import Bio.Phylo 3 | import numpy as np 4 | import ccmpred.sampling 5 | 6 | 7 | class CCMTree(object): 8 | """This class represents an empty phylogenetic tree according to some specific topology""" 9 | 10 | def __init__(self): 11 | """Initialise all class attributes""" 12 | 13 | self.id0 = 
["root"] 14 | self.ids = None 15 | self.branch_lengths = None 16 | self.n_vertices = None 17 | self.n_leaves = None 18 | self.tree = None 19 | self.type = None 20 | 21 | def load_tree(self, tree_file): 22 | """ 23 | 24 | Parameters 25 | ---------- 26 | tree_file: str 27 | path to a newick type tree topology file 28 | 29 | Returns 30 | ------- 31 | bool 32 | True if successful, False otherwise. 33 | 34 | """ 35 | 36 | self.type = "newick" 37 | 38 | try: 39 | self.tree = Bio.Phylo.read(tree_file, "newick") 40 | except ValueError as e: 41 | print("Error while reading tree file {0} : {1}".format(tree_file, e)) 42 | return False 43 | except OSError as e: 44 | print("Error while reading tree file {0} : {1}".format(tree_file, e)) 45 | return False 46 | 47 | 48 | self.determine_tree_properties() 49 | 50 | 51 | def specify_tree(self, nseq, tree_source): 52 | """ 53 | Parameters 54 | ---------- 55 | nseq: int 56 | Specifies the number of leave nodes representing sequences 57 | tree_source: str 58 | specifies the tree topology [star|binary] 59 | 60 | Returns 61 | ------- 62 | bool 63 | True if successful, False otherwise. 64 | 65 | """ 66 | 67 | 68 | if tree_source == "binary": 69 | self.type = "binary" 70 | self.tree = create_binary_tree(nseq, root_name=self.id0[0]) 71 | elif tree_source == "star": 72 | self.type = "star" 73 | self.tree = create_star_tree(nseq, root_name=self.id0[0]) 74 | 75 | self.determine_tree_properties() 76 | 77 | return True 78 | 79 | 80 | def determine_tree_properties(self): 81 | 82 | tree_split = split_tree(self.tree, self.id0) 83 | tree_bfs = [c for c in bfs_iterator(tree_split.clade)] 84 | 85 | self.n_children = np.array([len(c.clades) for c in tree_bfs], dtype='uint64') 86 | self.branch_lengths = np.array([c.branch_length for c in tree_bfs], dtype=np.dtype('float64')) 87 | self.n_vertices = len(tree_bfs) 88 | self.n_leaves = len(tree_split.get_terminals()) 89 | self.ids = [l.name for l in tree_split.get_terminals()] 90 | 91 | depth_min, depth_max = get_child_depth_range(tree_split.clade) 92 | print( 93 | "Created {0} tree with {1} leaves, {2} nodes, avg branch length={3}, depth_min={4:.4e}, depth_max={5:.4e}\n".format( 94 | self.type, self.n_leaves, self.n_vertices, np.round(np.mean(self.branch_lengths[2:]), decimals=3), 95 | depth_min, depth_max)) 96 | 97 | 98 | 99 | def split_tree(tree, id0): 100 | """Reroot tree so that the clades in id0 are direct descendants of the root node""" 101 | 102 | id_to_node = dict((cl.name, cl) for cl in bfs_iterator(tree.clade)) 103 | 104 | new_tree = Bio.Phylo.BaseTree.Tree() 105 | new_tree.clade.clades = [id_to_node[i] for i in id0] 106 | 107 | for cl in new_tree.clade.clades: 108 | cl.branch_length = 0 109 | 110 | new_tree.clade.branch_length = 0 111 | 112 | return new_tree 113 | 114 | def bfs_iterator(clade): 115 | """Breadth-first iterator along a tree clade""" 116 | 117 | def inner(clade): 118 | for c in clade.clades: 119 | yield c 120 | 121 | for c in clade.clades: 122 | for ci in inner(c): 123 | yield ci 124 | 125 | yield clade 126 | 127 | for ci in inner(clade): 128 | yield ci 129 | 130 | def get_child_depth_range(clade): 131 | """Return the minimum and maximum child depth""" 132 | level = [(0, clade)] 133 | 134 | mn = float('inf') 135 | mx = float('-inf') 136 | while level: 137 | new_level = [] 138 | 139 | for d, parent in level: 140 | dc = d + parent.branch_length 141 | 142 | if parent.clades: 143 | for c in parent.clades: 144 | new_level.append((dc, c)) 145 | else: 146 | mn = min(mn, dc) 147 | mx = max(mx, dc) 148 | 149 | level 
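For orientation, the iterator and depth helpers above can be exercised on the small star topology built by create_star_tree further down in this file. A minimal sketch:

from ccmpred.trees import create_star_tree, bfs_iterator, get_child_depth_range

t = create_star_tree(4, depth=0.5)
print([c.name for c in bfs_iterator(t.clade)])  # ['root', 'C0', 'C1', 'C2', 'C3']
print(get_child_depth_range(t.clade))           # (0.5, 0.5): all leaves equally deep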
155 | 
156 | 
157 | def get_seq0_mrf(x, ncol, gibbs_steps):
158 |     """
159 |     Specify the root sequence of the tree, representing the common ancestor.
160 | 
161 |     A new sequence of length ncol is sampled by starting from a poly-A sequence
162 |     and running Gibbs sampling according to a Markov random field (MRF, a.k.a. Potts)
163 |     model specified by the parameters x.
164 | 
165 |     Parameters
166 |     ----------
167 |     x : ndarray
168 |         1D float array with the concatenated single and pair potentials specifying the MRF
169 |     ncol : int
170 |         protein/sequence length
171 |     gibbs_steps: int
172 |         number of Gibbs steps used in the Gibbs sampling procedure
173 |         (one Gibbs step corresponds to sampling a new amino acid for every position)
174 | 
175 |     Returns
176 |     -------
177 |     ndarray
178 |         1D integer array representing the newly sampled sequence
179 |     """
180 | 
181 |     # generate a poly-A alignment of a single sequence
182 |     seq0 = np.zeros((1, ncol), dtype="uint8")
183 | 
184 |     # gibbs sample a new sequence
185 |     seq0 = ccmpred.sampling.gibbs_sample_sequences(x, seq0, gibbs_steps)
186 | 
187 |     return seq0
188 | 
189 | 
190 | def create_binary_tree(nseqs, depth=1, root_name="root"):
191 |     """
192 |     Create a binary tree topology.
193 | 
194 |     The depth of the tree is given by depth and the number of leaf nodes by nseqs,
195 |     which should be a power of 2 (otherwise it is rounded up to the next power of 2).
196 | 
197 |     Parameters
198 |     ----------
199 |     nseqs : int
200 |         the number of leaf nodes that represent sequences
201 |     depth : int, optional (default=1)
202 |         the depth of the tree
203 |     root_name: str, optional (default="root")
204 |         name of the root sequence
205 | 
206 |     Returns
207 |     -------
208 |     Bio.Phylo.BaseTree.Tree
209 |         topology of a binary tree
210 |     """
211 | 
212 |     splits = np.ceil(np.log2(nseqs))
213 | 
214 |     depth_per_clade = float(depth) / splits
215 | 
216 |     def fill_tree_rec(parent, splits):
217 |         if splits == 0:
218 |             return
219 | 
220 |         c1 = Bio.Phylo.BaseTree.Clade(name=parent.name + "A", branch_length=depth_per_clade)
221 |         c2 = Bio.Phylo.BaseTree.Clade(name=parent.name + "B", branch_length=depth_per_clade)
222 | 
223 |         fill_tree_rec(c1, splits - 1)
224 |         fill_tree_rec(c2, splits - 1)
225 | 
226 |         parent.clades = [c1, c2]
227 | 
228 |     t = Bio.Phylo.BaseTree.Tree(rooted=False)
229 |     t.clade.name = root_name
230 |     t.clade.branch_length = 0
231 |     fill_tree_rec(t.clade, splits)
232 | 
233 |     return t
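Because the number of splits is rounded up, the tree can end up with more leaves than requested. A short sketch illustrating this, using only the functions in this file:

from ccmpred.trees import create_binary_tree, get_child_depth_range

# nseqs=6 is not a power of 2: splits = ceil(log2(6)) = 3, so the tree
# grows 2**3 = 8 leaves, each 3 branches of length depth/3 below the root
t = create_binary_tree(6, depth=1)
print(len(t.get_terminals()))          # 8, not 6
print(t.get_terminals()[0].name)       # 'rootAAA'
print(get_child_depth_range(t.clade))  # approximately (1.0, 1.0)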
234 | 
235 | 
236 | def create_star_tree(nseqs, depth=1, root_name="root"):
237 |     """
238 |     Create a star tree topology.
239 | 
240 |     All nseqs leaf nodes are attached directly to the root with branch length depth.
241 | 
242 |     Parameters
243 |     ----------
244 |     nseqs : int
245 |         the number of leaf nodes that represent sequences
246 |     depth : int, optional (default=1)
247 |         the depth of the tree
248 |     root_name: str, optional (default="root")
249 |         name of the root sequence
250 | 
251 |     Returns
252 |     -------
253 |     Bio.Phylo.BaseTree.Tree
254 |         topology of a star tree
255 |     """
256 | 
257 |     t = Bio.Phylo.BaseTree.Tree(rooted=False)
258 |     t.clade.name = root_name
259 |     t.clade.branch_length = 0
260 | 
261 |     t.clade.clades = [
262 |         Bio.Phylo.BaseTree.Clade(name="C{0}".format(i), branch_length=depth)
263 |         for i in range(nseqs)
264 |     ]
265 | 
266 |     return t
--------------------------------------------------------------------------------
/ccmpred/weighting/__init__.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from ccmpred.weighting.cext import count_ids, calculate_weights_simple
 3 | import ccmpred.counts
 4 | from ccmpred.pseudocounts import PseudoCounts
 5 | 
 6 | 
 7 | def get_HHsuite_neff(msa):
 8 |     """
 9 |     Compute the number of effective sequences as defined in the HH-suite manual:
10 | 
11 |     Neff is the exponential of the average per-column sequence entropy of the alignment.
12 |     Hence, Neff is bounded by 1 from below and 20 from above.
13 |     In practice, it is bounded by the exponentiated entropy of a column with
14 |     background amino acid distribution f_a:
15 |         Neff < exp(- sum_{a=1}^{20} f_a log f_a) approx 16
16 | 
17 |     Parameters
18 |     ----------
19 |     msa : ndarray
20 |         2D integer array (nrow x ncol) encoding the alignment
21 | 
22 |     Returns
23 |     -------
24 |     float
25 |         the number of effective sequences
26 |     """
27 | 
28 |     # per-column frequencies are computed including gaps, then gaps are dropped
29 |     single_counts = ccmpred.counts.single_counts(msa)
30 |     single_freqs = (single_counts + 1e-3) / np.sum(single_counts, axis=1)[:, np.newaxis]
31 | 
32 |     single_freqs = single_freqs[:, :20]
33 |     entropies = - np.sum(single_freqs * np.log2(single_freqs), axis=1)
34 | 
35 |     neff = 2 ** np.mean(entropies)
36 | 
37 |     return neff
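Two boundary cases make this definition concrete. A self-contained numerical check, independent of the C counting code:

import numpy as np

# a perfectly conserved column has entropy 0, so it contributes Neff = 2**0 = 1;
# a column with all 20 amino acids equally frequent has entropy log2(20):
freqs = np.full(20, 1.0 / 20)
H = -np.sum(freqs * np.log2(freqs))
print(H, 2 ** H)  # ~4.32 bits, Neff ~20.0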
38 | 
39 | 
40 | def weights_uniform(msa):
41 |     """Uniform weights"""
42 |     return np.ones((msa.shape[0],), dtype="float64")
43 | 
44 | 
45 | def weights_simple(msa, cutoff=0.8):
46 |     """Simple sequence reweighting as in the Morcos et al. 2011 DCA paper"""
47 | 
48 |     if cutoff >= 1:
49 |         return weights_uniform(msa)
50 | 
51 |     return calculate_weights_simple(msa, cutoff)
52 | 
53 | 
54 | WEIGHTING_TYPE = {
55 |     'simple': lambda msa, cutoff: weights_simple(msa, cutoff),
56 |     'uniform': lambda msa, cutoff: weights_uniform(msa)
57 | }
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/__init__.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import numpy.ctypeslib as npct
 3 | import ctypes
 4 | import os.path
 5 | 
 6 | array_1d_double = npct.ndpointer(dtype=np.dtype('double'), ndim=1, flags='CONTIGUOUS')
 7 | array_2d_char = npct.ndpointer(dtype=np.dtype('uint8'), ndim=2, flags='CONTIGUOUS')
 8 | array_2d_uint64 = npct.ndpointer(dtype=np.dtype('uint64'), ndim=2, flags='CONTIGUOUS')
 9 | 
10 | libweighting = npct.load_library('libweighting', os.path.join(os.path.dirname(__file__), '_build'))
11 | 
12 | libweighting.count_ids.restype = None
13 | libweighting.count_ids.argtypes = [
14 |     array_2d_char,    # *msa
15 |     array_2d_uint64,  # *n_ids
16 |     ctypes.c_uint64,  # nrow
17 |     ctypes.c_uint64,  # ncol
18 | ]
19 | 
20 | libweighting.calculate_weights_simple.restype = None
21 | libweighting.calculate_weights_simple.argtypes = [
22 |     array_2d_char,    # *msa
23 |     array_1d_double,  # *weights
24 |     ctypes.c_double,  # cutoff
25 |     ctypes.c_uint64,  # nrow
26 |     ctypes.c_uint64,  # ncol
27 | ]
28 | 
29 | 
30 | def count_ids(msa):
31 |     nrow = msa.shape[0]
32 |     ids = np.zeros((nrow, nrow), dtype="uint64")
33 |     libweighting.count_ids(msa, ids, *msa.shape)
34 | 
35 |     # the C code fills only the upper triangle; mirror it into a full matrix
36 |     return ids + ids.T - np.diag(ids.diagonal())
37 | 
38 | 
39 | def calculate_weights_simple(msa, cutoff):
40 |     nrow = msa.shape[0]
41 |     weights = np.zeros((nrow,), dtype='double')
42 |     libweighting.calculate_weights_simple(msa, weights, cutoff, *msa.shape)
43 | 
44 |     return weights
45 | 
46 | 
47 | if __name__ == '__main__':
48 |     msa = np.array(
49 |         [
50 |             [0, 1, 2],
51 |             [0, 3, 4],
52 |             [0, 3, 2],
53 |             [5, 6, 7]
54 |         ],
55 |         dtype=np.uint8
56 |     )
57 | 
58 |     print(msa)
59 |     print(count_ids(msa))
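The C code below flattens the loop over sequence pairs (i, j) with i <= j into a single index ij, so that OpenMP can parallelize one flat for loop, and recovers (i, j) with the closed-form inversion credited in the source comments. A pure-Python transcription of that inversion (integer arithmetic instead of floating-point floor/sqrt), handy for convincing yourself it is correct:

import math

def unflatten(ij, nrow):
    # mirrors the i/j recovery in weighting.c
    ii = nrow * (nrow + 1) // 2 - 1 - ij
    K = (math.isqrt(8 * ii + 1) - 1) // 2
    i = nrow - 1 - K
    j = ij - nrow * i + i * (i + 1) // 2
    return i, j

nrow = 5
pairs = [unflatten(ij, nrow) for ij in range(nrow * (nrow + 1) // 2)]
assert pairs == [(i, j) for i in range(nrow) for j in range(i, nrow)]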
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/weighting.c:
--------------------------------------------------------------------------------
  1 | #include <stdio.h>
  2 | #include <stdlib.h>
  3 | #include <string.h>
  4 | #include <math.h>
  5 | #include <stdint.h>
  6 | #include <omp.h>
  7 | 
  8 | #include "weighting.h"
  9 | 
 10 | /**
 11 |  * Count the number of sequence identities for all pairs of rows in an MSA
 12 |  *
 13 |  * @param[in] msa The MSA to work on
 14 |  * @param[out] ids The number of sequence identities per pair of rows
 15 |  * @param[in] nrow The number of rows in the MSA
 16 |  * @param[in] ncol The number of columns in the MSA
 17 |  */
 18 | void count_ids(
 19 |     const uint8_t *msa,
 20 |     uint64_t *ids,
 21 |     const uint64_t nrow,
 22 |     const uint64_t ncol
 23 | ) {
 24 |     uint64_t nij = nrow * (nrow + 1) / 2;
 25 | 
 26 |     omp_set_dynamic(0);
 27 | 
 28 |     #pragma omp parallel
 29 |     {
 30 |         uint64_t ij;
 31 | 
 32 |         #pragma omp for nowait private(ij)
 33 |         for(ij = 0; ij < nij; ij++) {
 34 | 
 35 |             // compute i and j from ij
 36 |             // http://stackoverflow.com/a/244550/1181102
 37 |             uint64_t i, j;
 38 |             {
 39 |                 uint64_t ii = nrow * (nrow + 1) / 2 - 1 - ij;
 40 |                 uint64_t K = floor((sqrt(8 * ii + 1) - 1) / 2);
 41 |                 i = nrow - 1 - K;
 42 |                 j = ij - nrow * i + i * (i + 1) / 2;
 43 |             }
 44 | 
 45 |             uint64_t my_ids = 0;
 46 |             for(uint64_t k = 0; k < ncol; k++) {
 47 |                 if(msa[i * ncol + k] == msa[j * ncol + k]) {
 48 |                     my_ids++;
 49 |                 }
 50 |             }
 51 | 
 52 |             ids[i * nrow + j] = my_ids;
 53 |         }
 54 |     }
 55 | }
 56 | 
 57 | 
 58 | /**
 59 |  * Compute simple sequence weights: w_i = 1 / (number of sequences with at
 60 |  * least cutoff * ncol identities to sequence i, including sequence i itself)
 61 |  */
 62 | void calculate_weights_simple(
 63 |     const uint8_t *msa,
 64 |     double *weights,
 65 |     double cutoff,
 66 |     const uint64_t nrow,
 67 |     const uint64_t ncol
 68 | ) {
 69 |     uint64_t nij = nrow * (nrow + 1) / 2;
 70 | 
 71 |     omp_set_dynamic(0);
 72 | 
 73 |     #pragma omp parallel
 74 |     {
 75 |         uint64_t ij;
 76 | 
 77 |         #pragma omp for nowait private(ij)
 78 |         for(ij = 0; ij < nij; ij++) {
 79 | 
 80 |             // compute i and j from ij
 81 |             // http://stackoverflow.com/a/244550/1181102
 82 |             uint64_t i, j;
 83 |             {
 84 |                 uint64_t ii = nrow * (nrow + 1) / 2 - 1 - ij;
 85 |                 uint64_t K = floor((sqrt(8 * ii + 1) - 1) / 2);
 86 |                 i = nrow - 1 - K;
 87 |                 j = ij - nrow * i + i * (i + 1) / 2;
 88 |             }
 89 | 
 90 |             uint64_t idthres = ceil(cutoff * ncol);
 91 | 
 92 |             uint64_t my_ids = 0;
 93 |             for(uint64_t k = 0; k < ncol; k++) {
 94 |                 if(msa[i * ncol + k] == msa[j * ncol + k]) {
 95 |                     my_ids++;
 96 |                 }
 97 |             }
 98 | 
 99 |             if(my_ids >= idthres) {
100 |                 #pragma omp atomic
101 |                 weights[i]++;
102 |                 #pragma omp atomic
103 |                 weights[j]++;
104 |             }
105 |         }
106 |     }
107 | 
108 |     // the diagonal pair (i, i) always passes the threshold and was counted
109 |     // twice above, so subtracting 1 leaves exactly one self-count
110 |     for(uint64_t i = 0; i < nrow; i++) {
111 |         weights[i] = 1.0 / (weights[i] - 1);
112 |     }
113 | }
--------------------------------------------------------------------------------
/ccmpred/weighting/cext/weighting.h:
--------------------------------------------------------------------------------
 1 | #ifndef WEIGHTING_H
 2 | #define WEIGHTING_H
 3 | 
 4 | #include <stdint.h>
 5 | #define GAP 20
 6 | 
 7 | void count_ids(
 8 |     const uint8_t *msa,
 9 |     uint64_t *ids,
10 |     const uint64_t nrow,
11 |     const uint64_t ncol
12 | );
13 | 
14 | void calculate_weights_simple(
15 |     const uint8_t *msa,
16 |     double *weights,
17 |     double cutoff,
18 |     const uint64_t nrow,
19 |     const uint64_t ncol
20 | );
21 | 
22 | 
23 | #endif
--------------------------------------------------------------------------------
/ci_support/1atzA.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ci_support/1atzA.braw.gz
--------------------------------------------------------------------------------
/ci_support/mrf_params.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/ci_support/mrf_params.braw.gz
--------------------------------------------------------------------------------
/ci_support/phylo.newick:
--------------------------------------------------------------------------------
1 | 
((((((0:0.16667,1:0.16667)6df929e2-2da7-4bf7-8648-c74e118a778d:0.16667,(2:0.16667,3:0.16667)577d47bf-2a72-4aa8-87d3-acd4231f7d54:0.16667)98786d16-1782-4fac-89a2-f52090da3639:0.16667,((4:0.16667,5:0.16667)87e05f7c-af92-4197-ac3a-3f10577b35d1:0.16667,(6:0.16667,7:0.16667)63567a61-133e-4784-abcb-515c013f10be:0.16667)8e05d0d4-521b-430b-ba71-d0220b1e8350:0.16667)4c7b4015-83bc-41c3-ba34-535ea40f7cde:0.16667,(((8:0.16667,9:0.16667)34d6ba60-ba0f-4464-86fa-5401c1682d9b:0.16667,(10:0.16667,11:0.16667)27f94871-c557-413c-b369-7d96e0534a70:0.16667)e7d7f0e9-8189-46b9-83b0-a14d12081fa1:0.16667,((12:0.16667,13:0.16667)c4f0c5ea-a173-4339-925c-8c3884be556d:0.16667,(14:0.16667,15:0.16667)6b05b82c-3a54-4afa-933f-91eb7255d79d:0.16667)5a9c9457-ea99-43dd-8c10-79a479aaf44a:0.16667)a1d6236e-7432-418e-8050-fc21edaa6bb8:0.16667)9bf01622-715d-454d-85a9-713ac626c7c0:0.16667,((((16:0.16667,17:0.16667)f289fcdf-7e24-4f45-aa0e-f5e4f82fcb29:0.16667,(18:0.16667,19:0.16667)3a35be31-4498-4cb6-a519-7d02726362be:0.16667)658a8453-4e75-4274-8a02-172832b23dd3:0.16667,((20:0.16667,21:0.16667)71a0d481-f62f-421e-b6c6-23ef55867a9c:0.16667,(22:0.16667,23:0.16667)95275b51-fdc9-48a1-a8cc-39c15390d4a7:0.16667)394eb006-4b24-449d-96ef-114be429a0ce:0.16667)ed1128f1-11a7-4803-8c6f-6c3b7bb6da0f:0.16667,(((24:0.16667,25:0.16667)5989f16f-0d2a-4f4a-ae34-ddc91afe5f93:0.16667,(26:0.16667,27:0.16667)a82cf7ed-3e95-4712-8d10-0bc26bdfb87c:0.16667)39012f67-76c6-4c81-9ba8-a53e15de5e5a:0.16667,((28:0.16667,29:0.16667)339aba95-a80b-4252-9424-a37acfa31d6d:0.16667,(30:0.16667,31:0.16667)a96d5b22-5276-4d78-aae0-bde31e0aeb55:0.16667)90768ce1-6a1c-41d3-974c-ebff07c6cc91:0.16667)0dcf1544-72f3-447b-8682-82d20306d281:0.16667)1900092e-1d1c-4d2b-96c8-befa5ddad166:0.16667)ae31cd88-a99f-4476-9876-49b525993d65:0.16667,(((((32:0.16667,33:0.16667)d13716f4-c0a6-481f-b055-574c1acb4e61:0.16667,(34:0.16667,35:0.16667)1ecd3fbd-5333-4000-a17a-d3006174406e:0.16667)99f6c263-a33d-412e-8fa5-e612b6692646:0.16667,((36:0.16667,37:0.16667)b860915d-c75c-4da5-b2e9-e0a41cb0b520:0.16667,(38:0.16667,39:0.16667)fd10aa23-344e-4b4f-a37a-f48548b5ae0c:0.16667)a6c60d0a-e005-43eb-9827-16900b414fcd:0.16667)9daac0ef-9d25-4947-a531-d74d27753432:0.16667,(((40:0.16667,41:0.16667)28cd4007-537f-4773-b60d-88c6ac350982:0.16667,(42:0.16667,43:0.16667)2d788dbe-aea8-4378-8ef8-493d65b1db12:0.16667)46b597a4-4330-4451-9bfc-1ca67958ebe8:0.16667,((44:0.16667,45:0.16667)170f012b-9e48-46ae-bee3-79dfe4b9835d:0.16667,(46:0.16667,47:0.16667)d5606d8a-9b75-45ad-88b7-6ea60c6a1b8c:0.16667)52ed9b25-2529-4f99-b2d6-e175624b3584:0.16667)cdbcdfd8-938a-40d2-9ed4-fe98b3a6def3:0.16667)5497cf05-a28e-4bf3-ba7f-74587192d143:0.16667,((((48:0.16667,49:0.16667)ec6cb031-6fe9-4458-904a-11b8637eb07f:0.16667,(50:0.16667,51:0.16667)d063be88-ce9d-430d-b390-287dc792b25c:0.16667)d5a8b09d-1aa6-4a27-af3a-2ec3e18418c3:0.16667,((52:0.16667,53:0.16667)eb8ba81c-eee0-469b-a1c8-f1f110396562:0.16667,(54:0.16667,55:0.16667)9fd2da20-caf7-4c13-81d1-6af9da49106f:0.16667)fb4ef037-3797-493a-a646-5fec4c5f05d4:0.16667)954f7def-41e0-47ec-acb5-e3a560f3fa16:0.16667,(((56:0.16667,57:0.16667)3e4cbb37-3896-4f0a-834d-02c05903b0a2:0.16667,(58:0.16667,59:0.16667)075a3d1e-8ff5-4748-8e27-cbbd1bdd1446:0.16667)c8d354fc-ee87-4164-8ba2-b9eb599e1826:0.16667,((60:0.16667,61:0.16667)e5891dff-7724-489b-80b0-57e9820b38e6:0.16667,(62:0.16667,63:0.16667)afef3bca-d495-4825-8519-1e13e5f78ff1:0.16667)c84d298f-682e-4409-b551-b209ab2105a3:0.16667)6e0f2d8f-a5ce-4336-b4c5-5e0cf3597df1:0.16667)efe3201f-2846-4657-b746-196ffbee17ff:0.16667)4d878a75-5691-42c1-b50d-abce5b9acf39:0.16667)root:0.0
0000; 2 | 
--------------------------------------------------------------------------------
/ci_support/random_start_sequence.py:
--------------------------------------------------------------------------------
 1 | import ccmpred.raw
 2 | from Bio.Seq import Seq
 3 | from Bio.SeqRecord import SeqRecord
 4 | from Bio import SeqIO
 5 | import random
 6 | import sys
 7 | 
 8 | 
 9 | def write_random_start_sequence(file_coupling: str, file_name: str):
10 |     """Sample a random root sequence matching the length of the given MRF model
11 |     and write it to file_name in FASTA format."""
12 | 
13 |     raw = ccmpred.raw.parse_msgpack(file_coupling)
14 |     field = raw.x_single
15 |     size_prot = field.shape[0]
16 | 
17 |     sequence = ''.join([random.choice('ACDEFGHIKLMNPQRSTVWY-') for x in range(size_prot)])
18 | 
19 |     print("Random sequence: %s" % sequence)
20 | 
21 |     record = SeqRecord(
22 |         Seq(sequence),
23 |         id="ID_0.1",
24 |         name="RandomSequences",
25 |         description="Random sequence for the root of a phylogenetic tree",
26 |     )
27 | 
28 |     SeqIO.write(record, file_name, "fasta")
29 | 
30 | 
31 | if __name__ == '__main__':
32 |     # Map command line arguments to function arguments.
33 |     write_random_start_sequence(*sys.argv[1:])
--------------------------------------------------------------------------------
/ci_support/run_tests.sh:
--------------------------------------------------------------------------------
 1 | set -e
 2 | 
 3 | ## New test: sample a random root sequence and evolve it along the 1atzA tree
 4 | python ci_support/random_start_sequence.py ci_support/1atzA.braw.gz ci_support/seq0_file.fasta
 5 | 
 6 | ccmgen --tree-newick ci_support/1atzA_rootname.tree --seq0-file ci_support/seq0_file.fasta --mutation-rate 1 --num-threads 1 ci_support/1atzA.braw.gz sequences.msa
 7 | 
 8 | ## Commented out to reduce test runtime
 9 | #ccmgen --tree-newick ci_support/1atzA_rootname.tree --seq0-file ci_support/seq0_file.fasta --mutation-rate-neff --num-threads 1 --alnfile ci_support/1atzA.fas ci_support/1atzA.braw.gz sequences.msa
10 | 
11 | ## Old tests: evolve along the fixed binary tree in phylo.newick
12 | ccmgen --tree-newick ci_support/phylo.newick --aln-format psicov --mutation-rate 1 --num-threads 1 ci_support/mrf_params.braw.gz sequences.msa
13 | ccmgen --tree-newick ci_support/phylo.newick --aln-format fasta --mutation-rate 1 --num-threads 1 ci_support/mrf_params.braw.gz sequences.msa
--------------------------------------------------------------------------------
/example/1atzA.alignment_statistics.mcmc_pcd_vs_original.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.alignment_statistics.mcmc_pcd_vs_original.png
--------------------------------------------------------------------------------
/example/1atzA.apc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.apc.png
--------------------------------------------------------------------------------
/example/1atzA.braw.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.braw.gz
--------------------------------------------------------------------------------
/example/1atzA.pcd.apc.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/soedinglab/CCMgen/4540896203260e810b847916390c4e465d04be6b/example/1atzA.pcd.apc.png
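The fixed topology in ci_support/phylo.newick is exactly the kind of input CCMTree.load_tree expects: a balanced binary tree whose 64 leaves are labelled 0 through 63, with every leaf branch of length 0.16667. A quick sanity check of the file, assuming Biopython is installed:

from Bio import Phylo

tree = Phylo.read("ci_support/phylo.newick", "newick")
leaves = tree.get_terminals()
print(len(leaves))                              # 64
print(sorted(int(l.name) for l in leaves)[:4])  # [0, 1, 2, 3]
print({c.branch_length for c in leaves})        # {0.16667}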
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, Extension, find_packages
 2 | 
 3 | 
 4 | def ext(name, sources=(), include_dirs=(), library_dirs=(), libraries=(),
 5 |         extra_compile_args=('-g', '-fopenmp', '-std=c99'), extra_link_args=('-g', '-fopenmp')):
 6 |     # tuple defaults avoid the shared-mutable-default pitfall; Extension gets lists
 7 |     return Extension(
 8 |         name,
 9 |         include_dirs=list(include_dirs),
10 |         library_dirs=list(library_dirs),
11 |         libraries=list(libraries),
12 |         sources=list(sources),
13 |         extra_compile_args=list(extra_compile_args),
14 |         extra_link_args=list(extra_link_args)
15 |     )
16 | 
17 | 
18 | setup(
19 |     name="ccmgen",
20 |     version="1.0.0",
21 |     description="Residue-residue contact prediction from correlated mutations predicted quickly and precisely",
22 |     license="AGPLv3",
23 |     author="Susann Vorberg, Stefan Seemayer, Johannes Soeding",
24 |     author_email="Susann.Vorberg@gmail.com",
25 |     url="https://github.com/soedinglab/ccmgen",
26 |     packages=find_packages(),
27 |     install_requires=['msgpack-python', 'numpy', 'plotly==3.0.0rc10', 'scipy', 'pandas', 'biopython', 'colorlover'],
28 |     ext_modules=[
29 |         ext(
30 |             'ccmpred.objfun.pll.cext.libpll',
31 |             sources=['ccmpred/objfun/pll/cext/pll.c']
32 |         ),
33 |         ext(
34 |             'ccmpred.objfun.cd.cext.libcd',
35 |             sources=[
36 |                 'ccmpred/objfun/cd/cext/cd.c',
37 |                 'ccmpred/objfun/cd/cext/cdutil.c'
38 |             ]
39 |         ),
40 |         ext(
41 |             'ccmpred.counts.libmsacounts',
42 |             sources=['ccmpred/counts/msacounts.c']
43 |         ),
44 |         ext(
45 |             'ccmpred.gaps.cext.libgaps',
46 |             sources=['ccmpred/gaps/cext/gaps.c'],
47 |             extra_compile_args=['-g', '-std=c99'],
48 |             extra_link_args=['-g'],
49 |         ),
50 |         ext(
51 |             'ccmpred.weighting.cext.libweighting',
52 |             sources=['ccmpred/weighting/cext/weighting.c']
53 |         ),
54 |         ext(
55 |             'ccmpred.sampling.cext.libtreecd',
56 |             include_dirs=['ccmpred/objfun/cd/cext'],
57 |             sources=[
58 |                 'ccmpred/objfun/cd/cext/cd.c',
59 |                 'ccmpred/objfun/cd/cext/cdutil.c',
60 |                 'ccmpred/sampling/cext/treecd.c',
61 |             ]
62 |         ),
63 |     ],
64 |     entry_points={
65 |         'console_scripts': [
66 |             'ccmpred=ccmpred.scripts.run_ccmpred:main',
67 |             'ccmgen=ccmpred.scripts.run_ccmgen:main',
68 |             'ccm_replace_gaps=ccmpred.scripts.replace_gaps:main',
69 |             'ccm_plot=ccmpred.scripts.plot_ccmpred:main',
70 |             'ccm_convert_aln=ccmpred.scripts.convert:main'
71 |         ]
72 |     }
73 | )
74 | 
--------------------------------------------------------------------------------
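For completeness: a standard "pip install ." from the repository root compiles the six C extensions declared above (most are built with -fopenmp, so a C compiler with OpenMP support is required) and installs the five console scripts ccmpred, ccmgen, ccm_replace_gaps, ccm_plot and ccm_convert_aln; "python setup.py build_ext --inplace" is the usual alternative for in-place development builds. These are plain setuptools conventions, not project-specific commands.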