├── .gitignore ├── LICENCE ├── Logo.png ├── MANIFEST.in ├── README.md ├── rsalor ├── __init__.py ├── msa.py ├── rsa │ ├── __init__.py │ ├── rsa_biopython.py │ ├── rsa_dssp.py │ ├── rsa_music.py │ └── rsa_solver.py ├── sequence │ ├── __init__.py │ ├── amino_acid.py │ ├── fasta_reader.py │ ├── mutation.py │ ├── pairwise_alignment.py │ └── sequence.py ├── structure │ ├── __init__.py │ ├── residue.py │ └── structure.py ├── utils │ ├── CSV.py │ ├── __init__.py │ ├── ali_to_fasta.py │ ├── logger.py │ └── utils.py └── weights │ ├── CMakeLists.txt │ ├── __init__.py │ ├── computeWeightsBackend.cpp │ ├── compute_weights.py │ ├── include │ └── msa.h │ └── msa.cpp ├── setup.py └── test_data ├── 6acv_A_29-94.fasta └── 6acv_A_29-94.pdb /.gitignore: -------------------------------------------------------------------------------- 1 | # Ignore cache build and config files 2 | __pycache__/ 3 | build/ 4 | rsalor.egg-info/ 5 | dist/ 6 | 7 | # Ignore experiments 8 | src/ 9 | tmp/ 10 | fig/ 11 | 0_* 12 | 13 | # Ignore compiled files 14 | *.so 15 | *.a 16 | *.o -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c), 2025, Matsvei Tsishyn 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/3BioCompBio/RSALOR/0fa6cdb14eab2b6c6c99bcb170d82b246d6231b0/Logo.png -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | # Include files required for building C++ extension 2 | include rsalor/weights/include/* 3 | recursive-include rsalor/weights *.cpp 4 | 5 | # Exclude files and directories that should not be in the package 6 | exclude Logo.png 7 | exclude fig/* 8 | exclude src/* 9 | exclude tmp/* 10 | exclude 0_* 11 | exclude conda-env.yml 12 | exclude test_data/* 13 | global-exclude *.py[cod] 14 | global-exclude __pycache__/* 15 | global-exclude *.so 16 | global-exclude *.a 17 | global-exclude *.o 18 | 19 | # Exclude build artifacts 20 | global-exclude rsalor/weights/build/* 21 | global-exclude *.egg-info/* -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # RSALOR 3 | 4 | [![PyPi Version](https://img.shields.io/pypi/v/rsalor.svg)](https://pypi.org/project/rsalor/) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) 5 |
6 | [RSALOR Logo] 7 |
8 | 9 | `rsalor` is a Python package that computes the `RSA*LOR` score for each missence mutation in a protein. It combines multiple computational steps into a fast and user-friendly tool. 10 | 11 | **Please cite**: 12 | - [Matsvei Tsishyn, Pauline Hermans, Fabrizio Pucci, Marianne Rooman (2025). Residue conservation and solvent accessibility are (almost) all you need for predicting mutational effects in proteins. Bioinformatics, btaf322](https://doi.org/10.1093/bioinformatics/btaf322). 13 | 14 | - [Pauline Hermans, Matsvei Tsishyn, Martin Schwersensky, Marianne Rooman, Fabrizio Pucci (2024). Exploring evolution to uncover insights into protein mutational stability. Molecular Biology and Evolution, 42(1), msae267](https://doi.org/10.1093/molbev/msae267). 15 | 16 | 17 | ## Installation and Usage 18 | 19 | Installation with `pip`: 20 | ```bash 21 | pip install rsalor 22 | ``` 23 | 24 | Make sure the first sequence in your MSA file is the target sequence to mutate. 25 | From directory `./test_data/` execute the following Python code: 26 | ```python 27 | # Import 28 | from rsalor import MSA 29 | 30 | # Log basic usage instructions and arguments of the package 31 | MSA.help() 32 | 33 | # Initialize MSA 34 | msa_path = "./6acv_A_29-94.fasta" 35 | pdb_path = "./6acv_A_29-94.pdb" 36 | chain = "A" 37 | msa = MSA(msa_path, pdb_path, chain, num_threads=8, verbose=True) 38 | 39 | # You can ignore structure and RSA by omitting the pdb_path argument 40 | #msa = MSA(msa_path, num_threads=8, verbose=True) 41 | 42 | # Get LOR and other scores for all mutations 43 | scores = msa.get_scores() # [{'mutation_fasta': 'S1A', 'mutation_pdb': 'SA1A', 'RSA': 61.54, 'LOR': 5.05, ...}, ...] 44 | 45 | # Or directly save scores to a CSV file 46 | msa.save_scores("./6acv_A_29-94_scores.csv", sep=";") 47 | ``` 48 | 49 | ## Requirements 50 | 51 | - Python 3.9 or later 52 | - Python packages `numpy` ans `biopython` (version 1.75 or later) 53 | - A C++ compiler that supports C++11 (such as GCC) 54 | 55 | ## Short description 56 | 57 | The `rsalor` package combines structural data (Relative Solvent Accessibility, RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins. 58 | 59 | It parses a Multiple Sequence Alignment (MSA), removes redundant sequences, and assigns a weight to each sequence based on sequence identity clustering. The package then computes the weighted Log Odd Ratio (LOR) and Log Ratio (LR) for each single missense mutation. Additionally, it calculates the Relative Solvent Accessibility (RSA) for each residue and combines the LOR/LR and RSA scores, as described in the reference paper. The package resolves discrepancies between the MSA's target sequence and the protein structure (e.g., missing residues in structure) by aligning the PDB structure with the MSA target sequence. 60 | 61 | The sign of RSALOR / LOR is defined such that the result of mutations from a highly represented amino acid to a less represented amino acid is positive, which generally corresponds to a decrease in protein stability or fitness. In other words, large positive values predict highly destabilizing / disruptive mutations, while values close to zero or negative predict positive or neutral mutations. 62 | 63 | ## Compile from source 64 | 65 | For performance reasons, `rsalor` uses a C++ backend to weight sequences in the MSA. The C++ code needs to be compiled to use it directly from source. To compile the code, follow these steps: 66 | ```bash 67 | git clone https://github.com/3BioCompBio/RSALOR # Clone the repository 68 | cd RSALOR/rsalor/weights/ # Navigate to the C++ code directory 69 | mkdir build # Create a build directory 70 | cd build # Enter the build directory 71 | cmake .. # Generate make files 72 | make # Compile the C++ code 73 | mv ./lib_computeWeightsBackend* ../ # Move the compiled file to the correct directory 74 | ``` 75 | -------------------------------------------------------------------------------- /rsalor/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.msa import MSA 2 | -------------------------------------------------------------------------------- /rsalor/msa.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from os import cpu_count 5 | from typing import Union, List, Dict, Literal, Callable 6 | import tempfile 7 | import numpy as np 8 | from rsalor.utils import time_str 9 | from rsalor.sequence import AminoAcid 10 | from rsalor.sequence import Mutation 11 | from rsalor.sequence import Sequence 12 | from rsalor.sequence import FastaReader, FastaStream 13 | from rsalor.sequence import PairwiseAlignment 14 | from rsalor.structure import Structure 15 | from rsalor.weights import compute_weights, read_weights, write_weights 16 | from rsalor.utils import CSV 17 | from rsalor.utils import Logger 18 | 19 | 20 | # Main ------------------------------------------------------------------------- 21 | class MSA: 22 | """Class MSA: Combines structural data (RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins. 23 | Main class of the RSALOR package. 24 | """ 25 | 26 | 27 | # Constants ---------------------------------------------------------------- 28 | ACCEPTED_EXTENTIONS = ["fasta", "a2m"] 29 | N_STATES = len(AminoAcid.ONE_2_ID) + 1 30 | GAP_ID = N_STATES - 1 31 | GAP_CHAR = AminoAcid.GAP_ONE 32 | ONE_2_ID = {aa_one: aa_id for aa_one, aa_id in AminoAcid.ONE_2_ID.items()} 33 | ONE_2_ID_GAP = {aa_one: aa_id for aa_one, aa_id in AminoAcid.ONE_2_ID.items()} 34 | ONE_2_ID_GAP[GAP_CHAR] = GAP_ID 35 | 36 | 37 | # Constructor -------------------------------------------------------------- 38 | def __init__( 39 | self, 40 | msa_path: str, 41 | pdb_path: Union[None, str]=None, 42 | chain: Union[None, str]=None, 43 | theta_regularization: float=0.01, 44 | n_regularization: float=0.0, 45 | count_target_sequence: bool=True, 46 | remove_redundant_sequences: bool=True, 47 | seqid_weights: Union[None, float]=0.80, 48 | min_seqid: Union[None, float]=0.35, 49 | num_threads: int=1, 50 | rsa_solver: Literal["biopython", "DSSP", "MuSiC"]="biopython", 51 | rsa_solver_path: Union[None, str]=None, 52 | trimmed_msa_path: Union[None, str]=None, 53 | allow_msa_overwrite: bool=False, 54 | weights_cache_path: Union[None, str]=None, 55 | rsa_cache_path: Union[None, str]=None, 56 | verbose: bool=False, 57 | disable_warnings: bool=False, 58 | name: Union[None, str]=None, 59 | ): 60 | """\nRSA*LOR: Combines structural data (RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins. 61 | 62 | ---------------------------------------------------------------------------- 63 | usage (Python): 64 | from rsalor import MSA # Import pip package 65 | msa = MSA('./msa1.fasta', './pdb1.pdb', 'A') # Initialize MSA object with an MSA file, a PDB file and corresponding chain in the PDB 66 | scores = msa.get_scores() # Compute RSA*LOR scores of all single-site missense mutations 67 | msa.save_scores("./msa1_scores.csv") # Save scores to a '.csv' file 68 | 69 | ---------------------------------------------------------------------------- 70 | Main arguments: 71 | msa_path (str) path to MSA '.fasta' or '.a2m' file 72 | 73 | Structure arguments: 74 | pdb_path (None | str, None) path to PDB '.pdb' file (leave empty to ignore structure) 75 | chain (None | str, None) chain in the PDB to consider 76 | 77 | LOR/LR arguments: 78 | theta_regularization (float, 0.01) regularization term for LOR/LR at amino acid frequencies level 79 | n_regularization (float, 0.0) regularization term for LOR/LR at amino acid counts level 80 | count_target_sequence (bool, True) count target (first) sequence of the MSA in frequencies 81 | remove_redundant_sequences (bool, True) pre-process MSA to remove redundent sequences 82 | seqid_weights (None | float, 0.80) seqid threshold to consider two sequences in the same cluster for weighting (set None to ignore) 83 | min_seqid (None | float, 0.35) sequences which seqid with target sequence is below will be discarded (set None to ignore) 84 | num_threads (int, 1) number of threads (CPUs) for weights evaluation (in the C++ backend) 85 | 86 | RSA arguments: 87 | rsa_solver ('biopython'/'DSSP'/'MuSiC') used solver to compute RSA (DSSP and MuSiC require the software to be installed) 88 | rsa_solver_path (None | str, None) path to DSSP/MuSiC executable to compute RSA (leave empty if software is in system PATH) 89 | 90 | Files management arguments: 91 | trimmed_msa_path (None | str, None) set to save the trimmed + non-redundent MSA file (leave empty to ignore) 92 | allow_msa_overwrite (bool, False) allow to overwrite initial MSA file with the trimmed + non-redundent MSA file 93 | 94 | Cache arguments: 95 | weights_cache_path (None | str, None) set to read (is file exists) or write (is files does not exists) weights (leave empty to ignore) 96 | rsa_cache_path (None | str, None) set to read (is file exists) or write (is files does not exists) rsa values (leave empty to ignore) 97 | 98 | Logging arguments: 99 | verbose (bool, False) log execution steps 100 | disable_warnings (bool, False) disable logging of Warnings 101 | name (None | str, None) name of the MSA object (for logging) 102 | """ 103 | 104 | # MSA path Guardians 105 | self.name = "" # Required for logs, so we set directly. 106 | self._verify_input_msa_path(msa_path) 107 | 108 | # Fill basic properties 109 | self.msa_path: str = msa_path 110 | self.msa_filename: str = os.path.basename(self.msa_path) 111 | self.name: str = name 112 | if self.name is None: 113 | for extention in self.ACCEPTED_EXTENTIONS: 114 | if self.msa_filename.endswith(f".{extention}"): 115 | self.name = self.msa_filename.removesuffix(f".{extention}") 116 | break 117 | self.pdb_path: str = pdb_path 118 | self.chain: str = chain 119 | self.rsa_solver: str = rsa_solver 120 | self.rsa_solver_path: str = rsa_solver_path 121 | self.rsa_cache_path: str = rsa_cache_path 122 | self.theta_regularization: float = theta_regularization 123 | self.n_regularization: float = n_regularization 124 | self.remove_redundant_sequences: bool = remove_redundant_sequences 125 | self.count_target_sequence: bool = count_target_sequence 126 | self.seqid_weights: Union[None, float] = seqid_weights 127 | self.min_seqid: Union[None, float] = min_seqid 128 | self.num_threads: int = num_threads 129 | self.weights_cache_path: str = weights_cache_path 130 | self.trimmed_msa_path: Union[None, str] = trimmed_msa_path 131 | self.allow_msa_overwrite: bool = allow_msa_overwrite 132 | self.verbose: bool = verbose 133 | self.disable_warnings: bool = disable_warnings 134 | self.logger = Logger(verbose, disable_warnings, step_prefix="RSALOR", warning_note=f" in {self}", error_note=f" in {self}") 135 | 136 | # Too much CPU warning 137 | num_cpu_total = cpu_count() 138 | if num_cpu_total is None or num_cpu_total < self.num_threads: 139 | self.logger.warning(f"num_threads={num_threads} exeeds total number of CPUs detected on current machine (num_cpu_total={num_cpu_total}).") 140 | 141 | # Init structure (if pdb_path is specified) 142 | self._init_structure() 143 | 144 | # Read sequences 145 | self._read_sequences() 146 | 147 | # Filter sequences that are too far from target sequence 148 | if min_seqid is not None: 149 | self._remove_far_seqid_sequences() 150 | 151 | # Align Structure and Sequence (if pdb_path is specified) 152 | self._align_structure_to_sequence() 153 | 154 | # Save trimmed MSA (if trimmed_msa_path is specified) 155 | if self.trimmed_msa_path is not None: 156 | self.logger.step("save trimmed MSA (without target sequence gaps and non-std AAs, without redundent sequences) to a file.") 157 | self.logger.log(f" * trimmed_msa_path: '{trimmed_msa_path}'") 158 | self._verify_trimmed_seq_path() 159 | self.write(trimmed_msa_path) 160 | 161 | # Assign weights 162 | self._init_weights() 163 | 164 | # Counts and Frequencies 165 | self._init_counts() 166 | 167 | 168 | # Constructor dependencies ------------------------------------------------- 169 | def _init_structure(self) -> None: 170 | """Parse PDB file and compute RSA (Relative Solvent Accessibility).""" 171 | 172 | # Case: pdb_path is None -> just log some warnings and continue 173 | if self.pdb_path is None: 174 | if self.chain is not None: 175 | warning_log = "pdb_path is not set, so structure and RSA are ignored." 176 | warning_log += f" However chain is set to '{self.chain}'." 177 | warning_log += f" Please specify pdb_path to consider structure and RSA." 178 | self.logger.warning(warning_log) 179 | if self.rsa_solver_path is not None: 180 | warning_log = "pdb_path is not set, so structure and RSA are ignored." 181 | warning_log += f" However rsa_solver_path is set to '{self.rsa_solver_path}'." 182 | warning_log += f" Please specify pdb_path to consider structure and RSA." 183 | self.logger.warning(warning_log) 184 | self.structure = None 185 | return None 186 | 187 | # Set Structure 188 | self.logger.step(f"parse PDB structure '{os.path.basename(self.pdb_path)}' (chain '{self.chain}') and compute RSA.") 189 | assert self.chain is not None, f"{self.error_prefix}: pdb_path='{self.pdb_path}' is set, so please set also the PDB chain to consider." 190 | self.structure = Structure( 191 | self.pdb_path, 192 | self.chain, 193 | rsa_solver=self.rsa_solver, 194 | rsa_solver_path=self.rsa_solver_path, 195 | rsa_cache_path=self.rsa_cache_path, 196 | verbose=self.verbose, 197 | ) 198 | 199 | # Non assigned RSA warnings 200 | self._verify_rsa_values() 201 | 202 | def _read_sequences(self) -> None: 203 | """Read sequences from MSA FASTA file.""" 204 | 205 | # Read MSA 206 | self.logger.step(f"read sequences from MSA file '{self.msa_filename}'.") 207 | 208 | # Inspect target sequence for gaps and non-standard AAs 209 | # Also set up alignment between MSA and trimmed MSA positions 210 | target_sequence = FastaReader.read_first_sequence(self.msa_path) 211 | self.fasta_to_fasta_trimmed: Dict[str, str] = {} 212 | self.fasta_trimmed_to_fasta: Dict[str, str] = {} 213 | tgt_seq_len = len(target_sequence) 214 | n_gaps = 0 215 | non_standard = [] 216 | keep_position: List[bool] = [] 217 | i_res_trimmed = 0 218 | for i_res, res in enumerate(target_sequence): 219 | if res in self.ONE_2_ID: # Standard AA -> keep 220 | fasta_res = str(i_res+1) 221 | fasta_trimmed_res = str(i_res_trimmed+1) 222 | self.fasta_to_fasta_trimmed[fasta_res] = fasta_trimmed_res 223 | self.fasta_trimmed_to_fasta[fasta_trimmed_res] = fasta_res 224 | i_res_trimmed += 1 225 | keep_position.append(True) 226 | elif res == self.GAP_CHAR: # Gap -> remove 227 | n_gaps += 1 228 | keep_position.append(False) 229 | else: # Other -> remove 230 | non_standard.append(res) 231 | keep_position.append(False) 232 | n_remove = n_gaps + len(non_standard) 233 | do_trimming = n_remove > 0 234 | n_keep = len(target_sequence) - n_remove 235 | if n_keep < 1: 236 | raise ValueError(f"{self.error_prefix}: target sequence does not contain any standard amino acid residues.") 237 | if do_trimming: 238 | self.logger.warning(f"target sequence contains some gaps or non-standard amino acids: MSA will be trimmed: {len(target_sequence)} -> {n_keep} (num trimmed positions: {n_remove}).") 239 | if n_gaps > 0: 240 | self.logger.warning(f"target sequence contains {n_gaps} gaps -> those positions will be trimmed.") 241 | if len(non_standard) > 0: 242 | non_std_str = "".join(non_standard) 243 | if len(non_std_str) > 10: 244 | non_std_str = non_std_str[0:7] + "..." 245 | self.logger.warning(f"target sequence contains {len(non_standard)} non-standard amino acids ('{non_std_str}') -> those positions will be trimmed.") 246 | 247 | # Read sequences from file 248 | self.sequences: List[Sequence] = [] 249 | fasta_stream = FastaStream(self.msa_path) # Caution with this one 250 | n_tot_sequences = 0 251 | # Keep redundant sequences 252 | if not self.remove_redundant_sequences: 253 | sequence = fasta_stream.get_next() 254 | while sequence is not None: 255 | self._verify_sequence_length(sequence, tgt_seq_len, n_tot_sequences) 256 | if do_trimming: 257 | sequence.trim(keep_position) 258 | if len(sequence) == 0: 259 | continue 260 | self.sequences.append(sequence) 261 | sequence = fasta_stream.get_next() 262 | n_tot_sequences += 1 263 | # Keep only non-redundant sequences 264 | # the filter is done during execution to optimize time and RAM (could help with huge MSAs) 265 | else: 266 | sequences_set = set() 267 | sequence = fasta_stream.get_next() 268 | while sequence is not None: 269 | self._verify_sequence_length(sequence, tgt_seq_len, n_tot_sequences) 270 | if do_trimming: 271 | sequence.trim(keep_position) 272 | if len(sequence) == 0: 273 | continue 274 | sequence_str = sequence.sequence 275 | if sequence_str not in sequences_set: 276 | self.sequences.append(sequence) 277 | sequences_set.add(sequence_str) 278 | sequence = fasta_stream.get_next() 279 | n_tot_sequences += 1 280 | self.logger.log(f" * remove redundant sequences : {n_tot_sequences} -> {len(self.sequences)}") 281 | fasta_stream.close() 282 | 283 | # Verify MSA consisency 284 | assert self.depth > 1, f"{self.error_prefix}: MSA contains no or only 1 sequence." 285 | assert self.length > 0, f"{self.error_prefix}: MSA target (first) sequence is of length 0." 286 | 287 | # Log 288 | self.logger.log(f" * MSA length (tgt seq length) : {len(self.target_sequence)}") 289 | self.logger.log(f" * MSA depth (num sequences) : {len(self.sequences)}") 290 | 291 | # Set target sequence name 292 | self.target_sequence.name += " (trimmed MSA)" 293 | 294 | def _remove_far_seqid_sequences(self) -> None: 295 | """Filter sequences that are too far from target sequence by sequence identity.""" 296 | 297 | # Guardian 298 | assert 0.0 <= self.min_seqid < 1.0, f"{self.error_prefix}: min_seqid={self.min_seqid} should be stricktly between 0 and 1." 299 | 300 | # Log 301 | self.logger.step(f"filter sequences that are too far from target sequence.") 302 | 303 | # Compute sequences to keep 304 | keep_sequences: List[Sequence] = [] 305 | target_sequence_str = self.sequences[0].sequence 306 | for current_sequence in self.sequences: 307 | current_sequence_str = current_sequence.sequence 308 | 309 | # Compute seqid with target sequence 310 | current_seqid = self._seqid_to_target(target_sequence_str, current_sequence_str) 311 | 312 | if current_seqid > self.min_seqid: 313 | keep_sequences.append(current_sequence) 314 | 315 | # Update MSA sequences 316 | l1, l2 = len(self.sequences), len(keep_sequences) 317 | self.sequences = keep_sequences 318 | 319 | # Log results 320 | self.logger.log(f" * filter: {l1} -> {l2} (min_seqid={self.min_seqid:.2f})") 321 | 322 | # Guardians 323 | if l2 == 0: 324 | error_log = f"{self.error_prefix}: remove_far_seqid_sequences(): no sequence left." 325 | error_log += f"\n - No sequences left in the MSA after removing sequences that are too far from target sequence (by sequence indentity)" 326 | error_log += f"\n - min_seqid={self.min_seqid}: please increase value or set to None." 327 | raise ValueError(error_log) 328 | 329 | def _seqid_to_target(self, seq1: str, seq2: str) -> float: 330 | """Computes sequence identity between two sequences in the MSA.""" 331 | gap = self.GAP_CHAR 332 | num_identical_residues = sum([int(aa1 == aa2) for aa1, aa2 in zip(seq1, seq2)]) 333 | num_aligned_residues = sum([int(aa != gap) for aa in seq2]) 334 | return num_identical_residues / num_aligned_residues 335 | 336 | def _align_structure_to_sequence(self) -> None: 337 | """Align residues position between PDB sequence and target sequence of the MSA.""" 338 | 339 | # Init 340 | self.str_seq_align: PairwiseAlignment 341 | self.pdb_to_fasta_trimmed: Dict[str, str] = {} 342 | self.fasta_trimmed_to_pdb: Dict[str, str] = {} 343 | self.rsa_array: List[Union[None, float]] = [None for _ in range(self.length)] 344 | self.rsa_factor_array: List[Union[None, float]] = [None for _ in range(self.length)] 345 | if self.structure is None: 346 | return None 347 | 348 | # Log 349 | self.logger.step("align Structure (from PDB) and Sequence (from MSA).") 350 | 351 | # Init alignment 352 | self.str_seq_align = PairwiseAlignment(self.structure.sequence, self.target_sequence) 353 | 354 | # Map positions 355 | i_pdb, i_fasta_trimmed = 0, 0 356 | n_no_rsa, n_no_residue = 0, 0 357 | for aa_pdb, aa_fasta_trimmed in zip(self.str_seq_align.align1, self.str_seq_align.align2): 358 | if aa_pdb != self.GAP_CHAR and aa_fasta_trimmed != self.GAP_CHAR: 359 | residue = self.structure.chain_residues[i_pdb] 360 | fasta_trimmed_id = str(i_fasta_trimmed+1) 361 | self.pdb_to_fasta_trimmed[residue.resid] = fasta_trimmed_id 362 | self.fasta_trimmed_to_pdb[fasta_trimmed_id] = residue.resid 363 | self.rsa_array[i_fasta_trimmed] = residue.rsa 364 | if residue.rsa is None: 365 | n_no_rsa += 1 366 | if aa_pdb != self.GAP_CHAR: 367 | i_pdb += 1 368 | if aa_fasta_trimmed != self.GAP_CHAR: 369 | if aa_pdb == self.GAP_CHAR: # Position in MSA but not is PDB 370 | n_no_residue += 1 371 | i_fasta_trimmed += 1 372 | 373 | # Log 374 | n_assigned = len([rsa for rsa in self.rsa_array if rsa is not None]) 375 | self.logger.log(f" * {n_assigned} / {len(self.rsa_array)} assigned RSA values for positions in trimmed MSA") 376 | 377 | # Set RSA factor 378 | self.set_rsa_factor() 379 | 380 | # Alignment Warnings 381 | if n_no_residue: 382 | self.logger.warning(f"{n_no_residue} / {len(self.rsa_array)} positions in trimmed MSA with no corresponding residues in PDB structure.") 383 | if n_no_rsa: 384 | self.logger.warning(f"{n_no_rsa} / {len(self.rsa_array)} positions in trimmed MSA corresponding to PDB residues without assigned RSA.") 385 | critical_alignment_warning = False 386 | if self.str_seq_align.mismatch > 0: 387 | critical_alignment_warning = True 388 | self.logger.warning(f"{self.str_seq_align.mismatch} / {len(self.rsa_array)} mismatch between trimmed MSA and PDB.", critical=True) 389 | if self.str_seq_align.internal_gap2 > 0: 390 | critical_alignment_warning = True 391 | self.logger.warning(f"{self.str_seq_align.internal_gap2} internal residues in the PDB do not correspond to a position in trimmed MSA.", critical=True) 392 | if critical_alignment_warning and not self.disable_warnings: 393 | self.str_seq_align.show(n_lines=80, only_critical_chunks=True) 394 | self.logger.warning("Please, make sure the first sequence in your MSA file is the target sequence to mutate.", critical=True) 395 | 396 | def set_rsa_factor(self, rsa_factor_function: Union[Callable[[float], float], None]=None) -> None: 397 | 398 | # Set default function 399 | if rsa_factor_function is None: 400 | rsa_factor_function = self.inverse_rsa 401 | 402 | # Log 403 | self.logger.step(f"set RSA factor (RSA -> w(RSA) with w='{rsa_factor_function.__name__}').") 404 | 405 | # Set RSA factor 406 | for i, rsa in enumerate(self.rsa_array): 407 | if rsa is not None: 408 | self.rsa_factor_array[i] = (1.0 - min(rsa, 100.0) / 100.0) 409 | 410 | def _init_weights(self) -> None: 411 | """Initialize weights for all sequences of the MSA (using C++ backend or from a cache file).""" 412 | 413 | # Case: keep all weights to 1 414 | if self.seqid_weights is None: 415 | # Put weight of first sequence to 0.0 manually to ignore it if required 416 | if not self.count_target_sequence: 417 | self.sequences[0].weight = 0.0 418 | return None 419 | 420 | # Read from cached file case 421 | if self.weights_cache_path is not None and os.path.isfile(self.weights_cache_path): 422 | self.logger.step("read weights from cached file.") 423 | self.logger.log(f" * weights_cache_path: '{self.weights_cache_path}'") 424 | weights = read_weights(self.weights_cache_path) 425 | if len(weights) != len(self.sequences): 426 | error_log = f"{self.error_prefix}: read_weights(weights_cache_path='{self.weights_cache_path}'): " 427 | error_log += f"\nnumber of parsed weights ({len(weights)}) does not match number of sequences ({len(self.sequences)}) in MSA." 428 | error_log += f"\n * Please remove current weights_cache file and re-run weights or set weights_cache_path to None." 429 | raise ValueError(error_log) 430 | 431 | # Re-compute case weights case 432 | else: 433 | self.logger.step("compute weights using C++ backend.") 434 | dt = (0.00000000015 * self.length * self.depth**2) / self.num_threads 435 | dt_str = time_str(dt) 436 | self.logger.log(f" * seqid (to compute clusters) : {self.seqid_weights}") 437 | self.logger.log(f" * expected computation-time : {dt_str} (with {self.num_threads} CPUs)") 438 | 439 | # Case when processed+trimmed MSA in saved 440 | if self.trimmed_msa_path is not None: 441 | weights = compute_weights( 442 | self.trimmed_msa_path, 443 | self.length, 444 | self.depth, 445 | self.seqid_weights, 446 | self.count_target_sequence, 447 | self.num_threads, 448 | self.verbose 449 | ) 450 | # Case when processed+trimmed MSA is not saved 451 | else: 452 | with tempfile.TemporaryDirectory() as tmp_dir: 453 | tmp_msa_path = os.path.join(tmp_dir, f"{self.name}.fasta") 454 | self.write(tmp_msa_path) 455 | weights = compute_weights( 456 | tmp_msa_path, 457 | self.length, 458 | self.depth, 459 | self.seqid_weights, 460 | self.count_target_sequence, 461 | self.num_threads, 462 | self.verbose 463 | ) 464 | 465 | # Verify coherence of computed weights 466 | if len(weights) != len(self.sequences): 467 | error_log = f"{self.error_prefix}: compute_weights(): " 468 | error_log += f"number of computed weights ({len(weights)}) does not match number of sequences ({len(self.sequences)}) in MSA." 469 | raise ValueError(error_log) 470 | 471 | # Assign weights 472 | for i, wi in enumerate(weights): 473 | self.sequences[i].weight = wi 474 | 475 | # Save weights in cache file if required 476 | if self.weights_cache_path is not None and not os.path.isfile(self.weights_cache_path): 477 | self.logger.step(f"save computed weights to file '{self.weights_cache_path}'.") 478 | self.logger.log(f" * weights_cache_path: '{self.weights_cache_path}'") 479 | write_weights(weights, self.weights_cache_path) 480 | 481 | def _init_counts(self) -> None: 482 | """Initialize residues counts and frequences from the MSA.""" 483 | 484 | # Log 485 | self.logger.step("initialize residues counts and frequencies.") 486 | 487 | # Set Neff 488 | self.Neff: float = sum([sequence.weight for sequence in self.sequences]) 489 | self.logger.log(f" * Neff (sum of weights): {self.Neff:.2f}") 490 | 491 | # Counts 492 | self.counts = np.zeros((self.length, self.N_STATES), float) 493 | for sequence in self.sequences: 494 | for l, aa in enumerate(sequence): 495 | aa_id = self.ONE_2_ID.get(aa, self.GAP_ID) 496 | self.counts[l, aa_id] += sequence.weight 497 | self.gap_counts = self.counts[:, self.GAP_ID] 498 | self.nongap_counts = self.Neff - self.gap_counts 499 | 500 | # Frequencies 501 | self.frequencies = self.counts / self.Neff 502 | self.gap_frequencies = self.frequencies[:, self.GAP_ID] 503 | self.nongap_frequencies = 1.0 - self.gap_frequencies 504 | 505 | # CI (Conservation Index) 506 | self.global_aa_frequencies = np.sum(self.frequencies, axis=0) / self.length 507 | self.CI = np.sqrt(0.5 * np.sum(((self.frequencies - self.global_aa_frequencies)[:, 0:20])**2, axis=1)) 508 | 509 | # Manage regularization and LOR/LR scores 510 | self.update_regularization(self.theta_regularization, self.n_regularization) 511 | 512 | def update_regularization(self, theta_regularization: float, n_regularization: float) -> "MSA": 513 | """Update regularization parameters and recompute regularized frequencies. 514 | 515 | Arguments: 516 | theta_regularization (float): Regularization at the level of frequencies (add theta to all positional frequencies and normalize) 517 | n_regularization (float): Regularization at the level of counts (add n to all positional counts and normalize) 518 | """ 519 | 520 | # Log 521 | self.logger.step("compute regularized frequencies.") 522 | self.logger.log(f" * theta_regularization : {theta_regularization}") 523 | self.logger.log(f" * n_regularization : {n_regularization}") 524 | 525 | # Regularization Guardians 526 | assert theta_regularization >= 0.0, f"{self.error_prefix}: theta_regularization={theta_regularization} should be positive." 527 | assert n_regularization >= 0.0, f"{self.error_prefix}: n_regularization={n_regularization} sould be positive." 528 | assert theta_regularization > 0.0 or n_regularization > 0.0, f"{self.error_prefix}: both theta_regularization and n_regularization can not be zero to avoid divering values." 529 | 530 | # Set regularization properties 531 | self.theta_regularization = theta_regularization 532 | self.n_regularization = n_regularization 533 | 534 | # Apply n_regularization 535 | self.frequencies_reg = (self.counts + self.n_regularization) / (self.Neff + (float(self.N_STATES) * self.n_regularization)) 536 | 537 | # Apply theta_regularization 538 | reg_term: float = self.theta_regularization / float(self.N_STATES) 539 | reg_factor: float = 1.0 - self.theta_regularization 540 | self.frequencies_reg = reg_factor * self.frequencies_reg + reg_term 541 | 542 | # Compute dependent values 543 | self.gap_frequencies_reg = self.frequencies_reg[:, self.GAP_ID] 544 | self.nongap_frequencies_reg = 1.0 - self.gap_frequencies_reg 545 | 546 | # Set LOR and LR 547 | self.LR = np.log(self.frequencies_reg) 548 | self.LOR = np.log(self.frequencies_reg / (1.0 - self.frequencies_reg)) 549 | 550 | return self 551 | 552 | 553 | # Base Properties ---------------------------------------------------------- 554 | @classmethod 555 | def help(cls) -> None: 556 | """Log main usage (help) of MSA class in the 'rsalor' package.""" 557 | print(cls.__init__.__doc__) 558 | 559 | def __str__(self) -> str: 560 | return f"MSA('{self.name}')" 561 | 562 | def __iter__(self): 563 | return iter(self.sequences) 564 | 565 | def __getitem__(self, id: int) -> str: 566 | return self.sequences[id] 567 | 568 | @property 569 | def target_sequence(self) -> Sequence: 570 | return self.sequences[0] 571 | 572 | @property 573 | def length(self) -> int: 574 | """Length of each sequence from the MSA.""" 575 | return len(self.target_sequence) 576 | 577 | @property 578 | def depth(self) -> int: 579 | """Number of sequences in the MSA.""" 580 | return len(self.sequences) 581 | 582 | @property 583 | def error_prefix(self) -> str: 584 | """Return error in MSA prefix.""" 585 | return f"\033[91mERROR\033[0m in {self}" 586 | 587 | @staticmethod 588 | def inverse_rsa(rsa_value: float) -> float: 589 | return 1.0 - min(rsa_value, 100.0) / 100.0 590 | 591 | # Scores (such as LOR) Properties ------------------------------------------ 592 | def get_frequency(self, residue_id: int, amino_acid_one_char: str, regularized: bool=True): 593 | """Get a given amino acid (regularized) frequency at a given position: 594 | 595 | NOTE: residue_id in FASTA convention (first position is 1) on the trimmed MSA 596 | 597 | Arguments: 598 | residue_id (int): position index in fasta convention (first residues is 1) 599 | amino_acid_one_char (str): amino acid one-letter-code or gap code '-' 600 | regularized (bool): set True for regularized frequencies 601 | """ 602 | if regularized: 603 | return self.frequencies_reg[residue_id - 1, self.ONE_2_ID_GAP[amino_acid_one_char]] 604 | else: 605 | return self.frequencies[residue_id - 1, self.ONE_2_ID_GAP[amino_acid_one_char]] 606 | 607 | def eval_mutations( 608 | self, 609 | mutations_list: List[str], 610 | mutations_reference: Literal["fasta_trimmed", "fasta", "pdb"]="fasta_trimmed", 611 | metric: Literal["LOR", "LR"]="LOR", 612 | use_rsa_factor: bool=False, 613 | disable_wt_warning: bool=False, 614 | ) -> List[float]: 615 | """Return list of LOR (log-add ratio) or LR (log ratio) for each mutation in mutations_list 616 | * for a mutation: LOR('H13K') = log(freq(H, 13) / 1 - freq(H, 13)) - log(freq(K, 13) / 1 - freq(K, 13)) 617 | * by default, position of the mutation is given in the fasta convention (first residue position is 1) on the trimmed MSA 618 | 619 | NOTE: mutation can be indicated in 3 different references: 620 | - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference 621 | - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs) 622 | - 'pdb': residues are numbered as in the PDB file 623 | 624 | Arguments: 625 | mutations_list (List[str]): list of mutations as strings 626 | mutations_reference (str): "fasta_trimmed", "fasta", "pdb" to specify which mutation convention to use 627 | metric (str): "LOR" or "LR" to specify which metric to compute 628 | use_rsa_factor (bool): set True to multiply the score by the RSA factor at this position 629 | disable_wt_warning (bool): set True to not throw WARNING is mutation wt-aa does not match aa in the target sequence 630 | """ 631 | 632 | # Set metric 633 | ALLOWED_METRICS = ["LOR", "LR"] 634 | assert metric in ALLOWED_METRICS, f"{self.error_prefix}.eval_mutations(): metric='{metric}' should be in {ALLOWED_METRICS}." 635 | if metric == "LOR": 636 | E_matrix = self.LOR 637 | else: 638 | E_matrix = self.LR 639 | 640 | # Uniformize mutations to 'fasta_trimmed' reference 641 | ALLOWED_MUTATIONS_TYPES = ["fasta_trimmed", "fasta", "pdb"] 642 | assert mutations_reference in ALLOWED_MUTATIONS_TYPES, f"{self.error_prefix}: mutations_reference='{mutations_reference}' sould be in {ALLOWED_MUTATIONS_TYPES}." 643 | if mutations_reference == "fasta" or mutations_reference == "pdb": 644 | residues_map = self.fasta_to_fasta_trimmed if mutations_reference == "fasta" else self.pdb_to_fasta_trimmed 645 | mutations_list_converted = [] 646 | for mutation in mutations_list: 647 | wt, resid, mt = mutation[0], mutation[1:-1], mutation[-1] 648 | if resid not in residues_map: 649 | error_log = f"{self.error_prefix}.eval_mutations():" 650 | error_log += f"\nMutation '{mutation}' can not be converted from '{mutations_reference}' reference to 'fasta_trimmed' reference." 651 | error_log += f"\n - residue '{resid}' may be outside of the range of the MSA" 652 | if mutations_reference == "pdb": 653 | error_log += f"\n - residue '{resid}' may be missing in the PDB structure" 654 | elif mutations_reference == "fasta": 655 | error_log += f"\n - residue '{resid}' may be a gap or a non-standard amino acid in the target sequence of initial MSA" 656 | raise ValueError(error_log) 657 | mutation_converted = wt + residues_map[resid] + mt 658 | mutations_list_converted.append(mutation_converted) 659 | mutations_list_reference = [Mutation(mut) for mut in mutations_list_converted] 660 | else: 661 | mutations_list_reference = [Mutation(mut) for mut in mutations_list] 662 | 663 | # Compute mutations 664 | dE_arr = [] 665 | for i, mutation in enumerate(mutations_list_reference): 666 | assert 1 <= mutation.position <= self.length, f"{self.error_prefix}.eval_mutations(): position of mutation='{mutation}' is out of range of target sequence of the MSA." 667 | if not disable_wt_warning: 668 | aa_target = self.target_sequence[mutation.position-1] 669 | aa_mutation = mutation.wt_aa.one 670 | # Trigger incorrect wt aa warning 671 | if aa_mutation != aa_target: 672 | mutation_description = f"'{mutation}'" 673 | if mutations_reference != "fasta_trimmed": 674 | mutation_description = f"{mutation_description} ('{mutations_list[i]}' in '{mutations_reference}' reference)" 675 | self.logger.warning(f"eval_mutations(): mutation {mutation_description}: wt-aa does not match target sequence aa '{aa_target}'.") 676 | dE = E_matrix[mutation.position-1, mutation.wt_aa.id] - E_matrix[mutation.position-1, mutation.mt_aa.id] 677 | dE_arr.append(dE) 678 | 679 | # Modulate by RSA factor 680 | if use_rsa_factor: 681 | for i, (mutation, dE) in enumerate(zip(mutations_list_reference, dE_arr)): 682 | rsa_factor = self.rsa_factor_array[mutation.position-1] 683 | if rsa_factor is None: 684 | dE_arr[i] = None 685 | else: 686 | dE_arr[i] = rsa_factor * dE 687 | 688 | return dE_arr 689 | 690 | def get_scores(self, round_digit: Union[None, int]=None, log_results: bool=False,) -> List[dict]: 691 | """Compute scores (gap_freq, wt_freq, mt_freq, RSA, LOR, RSA*LOR, ...) for each single-site mutation. 692 | 693 | NOTE: mutation are indicated in 3 different references: 694 | - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference 695 | - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs) 696 | - 'pdb': residues are numbered as in the PDB file 697 | 698 | output: List of dictionary with the scores: 699 | [{mutation_fasta: 'A13G', LOR: 0.4578, ...}, ...] 700 | """ 701 | 702 | # Log 703 | self.logger.step("compute scores for all single-site mutations.") 704 | 705 | # Compute scores for each single site mutation 706 | all_aas = AminoAcid.get_all() 707 | scores = [] 708 | for i, wt in enumerate(self.target_sequence.sequence): 709 | wt_i = AminoAcid.ONE_2_ID[wt] 710 | resid_fasta_trimmed = str(i+1) 711 | resid_fasta = self.fasta_trimmed_to_fasta[resid_fasta_trimmed] 712 | resid_pdb = self.fasta_trimmed_to_pdb.get(resid_fasta_trimmed, None) 713 | RSA = self.rsa_array[i] 714 | RSA_factor = self.rsa_factor_array[i] 715 | CI = self.CI[i] 716 | gap_freq = self.gap_frequencies[i] 717 | wt_freq = self.frequencies[i, wt_i] 718 | for mt_aa in all_aas: 719 | mt = mt_aa.one 720 | mt_i = mt_aa.id 721 | mutation_fasta_trimmed = wt + resid_fasta_trimmed + mt 722 | mutation_fasta = wt + resid_fasta + mt 723 | mutation_pdb = None 724 | if resid_pdb is not None: 725 | mutation_pdb = wt + resid_pdb + mt 726 | mt_freq = self.frequencies[i, mt_i] 727 | LOR = self.LOR[i, wt_i] - self.LOR[i, mt_i] 728 | LR = self.LR[i, wt_i] - self.LR[i, mt_i] 729 | RSALOR, RSALR = None, None 730 | if RSA_factor is not None: 731 | RSALOR = RSA_factor * LOR 732 | RSALR = RSA_factor * LR 733 | score = { 734 | "mutation_fasta": mutation_fasta, 735 | "mutation_fasta_trimmed": mutation_fasta_trimmed, 736 | "mutation_pdb": mutation_pdb, 737 | "gap_freq": gap_freq, 738 | "wt_freq": wt_freq, 739 | "mt_freq": mt_freq, 740 | "CI": CI, 741 | "RSA": RSA, 742 | "LOR": LOR, 743 | "LR": LR, 744 | "RSA*LOR": RSALOR, 745 | "RSA*LR": RSALR, 746 | } 747 | scores.append(score) 748 | 749 | # Round float values if required 750 | if round_digit is not None: 751 | for score in scores: 752 | for prop in ["gap_freq", "wt_freq", "mt_freq", "CI", "RSA", "LOR", "LR", "RSA*LOR", "RSA*LR"]: 753 | val = score[prop] 754 | if val is not None: 755 | score[prop] = round(val, round_digit) 756 | 757 | # Log 758 | if log_results: 759 | scores_csv = CSV(list(scores[0].keys()), name=self.name) 760 | scores_csv.add_entries(scores[0:40]) 761 | scores_csv.show(n_entries=40, max_colsize=23) 762 | 763 | return scores 764 | 765 | def save_scores( 766 | self, 767 | scores_path: str, 768 | round_digit: Union[None, int]=None, 769 | sep: str=";", 770 | missing_value: Union[None, str]="XXX", 771 | log_results: bool=False 772 | ) -> List[dict]: 773 | """Compute scores (gap_freq, wt_freq, mt_freq, RSA, LOR, RSA*LOR, ...) for each single-site mutation and save it to scores_path as a '.csv' file. 774 | 775 | NOTE: mutation are indicated in 3 different references: 776 | - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference 777 | - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs) 778 | - 'pdb': residues are numbered as in the PDB file 779 | 780 | output: List of dictionary with the scores: 781 | [{mutation_fasta: 'A13G', LOR: 0.4578, ...}, ...] 782 | """ 783 | 784 | # Compute scores 785 | scores = self.get_scores(round_digit) 786 | 787 | # Log 788 | self.logger.step("save scores to a file.") 789 | self.logger.log(f" * scores_path: '{scores_path}'") 790 | 791 | # Format in CSV 792 | scores_properties = list(scores[0].keys()) 793 | scores_csv = CSV(scores_properties, name=self.name) 794 | scores_csv.set_sep(sep) 795 | scores_csv.add_entries(scores) 796 | 797 | # Change None to missing_value 798 | if missing_value is not None: 799 | for entry in scores_csv: 800 | for prop in scores_properties: 801 | if entry[prop] is None: 802 | entry[prop] = missing_value 803 | 804 | # Log 805 | if log_results: 806 | scores_csv.show(n_entries=40, max_colsize=23) 807 | 808 | # Save and return 809 | if scores_path is not None: 810 | scores_csv.write(scores_path) 811 | return scores 812 | 813 | 814 | # IO Methods --------------------------------------------------------------- 815 | def write(self, msa_path: str) -> "MSA": 816 | """Save MSA to a FASTA MSA file.""" 817 | 818 | # Guardians 819 | msa_path = os.path.abspath(msa_path) 820 | assert os.path.isdir(os.path.dirname(msa_path)), f"{self.error_prefix}.write(): directory of msa_path='{msa_path}' does not exists." 821 | assert msa_path.endswith(".fasta"), f"{self.error_prefix}.write(): msa_path='{msa_path}' should end with '.fasta'." 822 | 823 | # Write 824 | with open(msa_path, "w") as fs: 825 | fs.write("".join([seq.to_fasta_string() for seq in self.sequences])) 826 | return self 827 | 828 | 829 | # Guardians Dependencies --------------------------------------------------- 830 | # Helpers to verify coherence of inputs and current state 831 | 832 | def _verify_input_msa_path(self, msa_path: str) -> None: 833 | """For correct format and existance of input msa_path.""" 834 | 835 | # Existance 836 | assert os.path.exists(msa_path), f"{self.error_prefix}: msa_path='{msa_path}' files does not exist." 837 | 838 | # Hint for '.ali' format 839 | if msa_path.endswith(".ali"): 840 | error_log = f"{self.error_prefix}: msa_path='{msa_path}' should be in FASTA format." 841 | error_log += f"\n * msa_path: '{msa_path}'" 842 | error_log += f"\n * input msa_path is expected to be a MSA file in FASTA ('.fasta') format." 843 | error_log += f"\n * Please convert the MSA to '.fasta' with python script: " 844 | error_log += "\nfrom rsalor.utils import ali_to_fasta" 845 | error_log += "\nali_to_fasta('./my_msa.ali', './my_msa.fasta')\n" 846 | raise ValueError(error_log) 847 | 848 | # ERROR for bad MSA extention 849 | if msa_path.split(".")[-1] not in self.ACCEPTED_EXTENTIONS: 850 | error_log = f"{self.error_prefix}: msa_path='{msa_path}' should be in FASTA format (with file extention in {self.ACCEPTED_EXTENTIONS})." 851 | raise ValueError(error_log) 852 | 853 | def _verify_sequence_length(self, sequence: Sequence, target_length: int, i: int) -> None: 854 | """For coherence of all sequences in the MSA.""" 855 | if len(sequence) != target_length: 856 | seq_str = sequence.sequence 857 | if len(seq_str) > 40: 858 | seq_str = seq_str[0:37] + "..." 859 | error_log = f"{self.error_prefix}._read_sequences(): msa_path='{self.msa_path}':" 860 | error_log += f"\n -> length of sequence [{i+1}] l={len(sequence)} ('{seq_str}') does not match length of target sequence l={target_length}." 861 | raise ValueError(error_log) 862 | 863 | def _verify_trimmed_seq_path(self) -> None: 864 | """For coherence of trimmed_msa_path and for safety to not overwrite initial input MSA.""" 865 | trimmed_msa_path = str(os.path.abspath(self.trimmed_msa_path)) 866 | assert os.path.isdir(os.path.dirname(trimmed_msa_path)), f"{self.error_prefix}: directory of trimmed_msa_path='{trimmed_msa_path}' does not exists." 867 | assert trimmed_msa_path.endswith(".fasta"), f"{self.error_prefix}: trimmed_msa_path='{trimmed_msa_path}' should end with '.fasta'." 868 | if os.path.normpath(self.msa_path) == os.path.normpath(trimmed_msa_path) and not self.allow_msa_overwrite: 869 | error_log = f"{self.error_prefix}: trimmed_msa_path='{trimmed_msa_path}' is same as input MSA path." 870 | error_log == "\nIf trimmed_msa_path is set, the trimmed MSA (without target sequence gaps and non-std AAs) will be saved to this path." 871 | error_log += "\nWARNING: This operation will overwrite initial input MSA file." 872 | error_log += "\nTo continue, set argument 'allow_msa_overwrite' to True." 873 | raise ValueError(error_log) 874 | 875 | def _verify_rsa_values(self) -> None: 876 | """Warnings for non-assigned RSA residues.""" 877 | norsa_std, norsa_non_std = 0, 0 878 | for residue in self.structure.chain_residues: 879 | if residue.rsa is None: 880 | if residue.amino_acid.is_standard(): 881 | norsa_std += 1 882 | else: 883 | norsa_non_std += 1 884 | norsa = norsa_std + norsa_non_std 885 | if norsa > 0: 886 | warning_log = f"{norsa} / {len(self.structure.chain_residues)} residues with no assigned RSA values ({norsa_std} std and {norsa_non_std} non-std) in PDB target chain '{self.chain}'." 887 | warning_log += "\n -> This can be caused by non-standard AAs or missing atoms." 888 | warning_log += "\n -> For optimal RSA estimations, we highly recommend to 'repair' the PDB and standardize AAs." 889 | self.logger.warning(warning_log) 890 | -------------------------------------------------------------------------------- /rsalor/rsa/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.rsa.rsa_solver import RSASolver 2 | from rsalor.rsa.rsa_biopython import RSABiopython 3 | from rsalor.rsa.rsa_dssp import RSADSSP 4 | from rsalor.rsa.rsa_music import RSAMuSiC 5 | -------------------------------------------------------------------------------- /rsalor/rsa/rsa_biopython.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import Dict 5 | from Bio.PDB import PDBParser 6 | from Bio.PDB.SASA import ShrakeRupley 7 | from rsalor.sequence import AminoAcid 8 | from rsalor.rsa.rsa_solver import RSASolver 9 | 10 | # RSAMuSiC --------------------------------------------------------------------- 11 | class RSABiopython(RSASolver): 12 | """ 13 | RSABiopython(): Solver for RSA (Relative Solvent Accessibility) from a '.pdb' file using python package biopython. 14 | Uses the “rolling ball” algorithm developed by Shrake & Rupley algorithm 15 | doc: https://biopython.org/docs/dev/api/Bio.PDB.SASA.html 16 | 17 | usage: 18 | rsa_map = RSABiopython().run('./my_pdb.pdb') 19 | """ 20 | 21 | # Constants ---------------------------------------------------------------- 22 | # Taken from https://pmc.ncbi.nlm.nih.gov/articles/PMC3836772/#pone-0080635-t001 23 | MAX_SURFACE_MAP = { 24 | "ALA": 1.29, 25 | "ARG": 2.74, 26 | "ASN": 1.95, 27 | "ASP": 1.93, 28 | "CYS": 1.67, 29 | "GLN": 2.23, 30 | "GLU": 2.25, 31 | "GLY": 1.04, 32 | "HIS": 2.24, 33 | "ILE": 1.97, 34 | "LEU": 2.01, 35 | "LYS": 2.36, 36 | "MET": 2.24, 37 | "PHE": 2.40, 38 | "PRO": 1.59, 39 | "SER": 1.55, 40 | "THR": 1.55, 41 | "TRP": 2.85, 42 | "TYR": 2.63, 43 | "VAL": 1.74, 44 | } 45 | MAX_SURFACE_DEFAULT = 2.01 # mean value 46 | 47 | # Methods ------------------------------------------------------------------ 48 | def __str__(self) -> str: 49 | return "RSASolver['biopython' (Shrake & Rupley algorithm)]" 50 | 51 | def execute_solver(self, pdb_path: str) -> Dict[str, float]: 52 | """Compute RSA by running biopython python package: Bio.PDB.SASA: ShrakeRupley 53 | doc: https://biopython.org/docs/dev/api/Bio.PDB.SASA.html 54 | 55 | args: 56 | pdb_path (str): path to PDB file 57 | 58 | output: 59 | {resid: str => RSA: float} (such as {'A13': 48.57, ...}) 60 | """ 61 | 62 | # Parse PDB file 63 | pdb_name = os.path.basename(pdb_path).removesuffix(".pdb") 64 | pdb_parser = PDBParser(QUIET=True) 65 | structure = pdb_parser.get_structure(pdb_name, pdb_path) 66 | 67 | # Compute ASA 68 | shrake_rupley = ShrakeRupley( 69 | #probe_radius=1.40, # radius of the probe in A. Default is 1.40, roughly the radius of a water molecule. 70 | #n_points=200, # resolution of the surface of each atom. Default is 100. A higher number of points results in more precise measurements, but slows down the calculation. 71 | #radii_dict=None, # user-provided dictionary of atomic radii to use in the calculation. Values will replace/complement those in the default ATOMIC_RADII dictionary. 72 | ) 73 | shrake_rupley.compute(structure, level="R") 74 | 75 | # Convert to RSA and format 76 | rsa_map: Dict[str, float] = {} 77 | for chain_obj in structure[0]: 78 | chain = chain_obj.id 79 | chain_structure = structure[0][chain] 80 | for residue in chain_structure: 81 | 82 | # Find 'resid' = {chain}{res_position} 83 | (res_insertion, res_id, res_alternate_location) = residue.id 84 | resid = f"{chain}{res_insertion}{res_id}".replace(" ", "") 85 | 86 | # Get AA 3-letter code and standardize if required 87 | aa_three = residue.resname 88 | aa_three = AminoAcid._NON_STANDARD_AAS.get(aa_three, aa_three) 89 | 90 | # Get RSA 91 | asa = residue.sasa 92 | if isinstance(asa, float): 93 | rsa_map[resid] = asa / self.get_max_surf(aa_three) 94 | return rsa_map 95 | 96 | @classmethod 97 | def get_max_surf(cls, aa_three: str) -> float: 98 | return cls.MAX_SURFACE_MAP.get(aa_three, cls.MAX_SURFACE_DEFAULT) 99 | -------------------------------------------------------------------------------- /rsalor/rsa/rsa_dssp.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | import sys 5 | from typing import Dict, Union 6 | import tempfile 7 | from contextlib import contextmanager 8 | from Bio.PDB import PDBParser 9 | from Bio.PDB.DSSP import DSSP 10 | from rsalor.utils import find_file 11 | from rsalor.rsa.rsa_solver import RSASolver 12 | 13 | # RSADSSP ---------------------------------------------------------------------- 14 | class RSADSSP(RSASolver): 15 | """ 16 | RSADSSP(): Solver for RSA (Relative Solvent Accessibility) from a '.pdb' file using DSSP software. 17 | 18 | usage: 19 | rsa_map = RSADSSP('./soft/DSSP/dssp').run('./my_pdb.pdb') 20 | """ 21 | 22 | # Constants ---------------------------------------------------------------- 23 | CANDIDATES_PATHS = ["mkdssp", "dssp"] 24 | HELPER_LOG = """------------------------------------------------------- 25 | RSA Solver: DSSP issue: 26 | In order to solve Relative Solvent Accessiblity (RSA), RSALOR package uses: 27 | Python package biopython -> interface with the DSSP algorithms (https://biopython.org/docs/1.75/api/Bio.PDB.DSSP.html). 28 | The DSSP software (free for academic use) has to be installed on your computer. 29 | Please install DSSP (https://swift.cmbi.umcn.nl/gv/dssp/) and specify the path to its executable or add it to system PATH. 30 | DSSP source code can be found here: https://github.com/cmbi/hssp 31 | 32 | NOTE: you can still use the RSALOR package without DSSP if you only want LOR values of the MSA without using RSA (just set pdb_path=None). 33 | -------------------------------------------------------""" 34 | 35 | # Methods ------------------------------------------------------------------ 36 | def __str__(self) -> str: 37 | return "RSASolver['DSSP']" 38 | 39 | def execute_solver(self, pdb_path: str) -> Dict[str, float]: 40 | """Compute RSA by running DSSP. 41 | 42 | args: 43 | pdb_path (str): path to PDB file 44 | 45 | output: 46 | {resid: str => RSA: float} (such as {'A13': 48.57, ...}) 47 | """ 48 | 49 | # Init DSSP path (check DSSP executable existance only if software is executed) 50 | self._init_dssp_path() 51 | 52 | # Run DSSP 53 | pdb_with_cryst1_line = self._inject_cryst1_line(pdb_path) # Manage CRYST1 line 54 | # Case: CRYST1 line is already present or this version of DSSP does not requires it 55 | if pdb_with_cryst1_line is None: 56 | rsa_map = self._run_dssp_backend(pdb_path) 57 | # Case: inject CRYST1 line and run DSSP on modified PDB 58 | else: 59 | with tempfile.NamedTemporaryFile(delete=True) as temp_file: 60 | tmp_pdb_path = temp_file.name 61 | with open(tmp_pdb_path, "w") as fs: 62 | fs.write(pdb_with_cryst1_line) 63 | rsa_map = self._run_dssp_backend(tmp_pdb_path) 64 | 65 | return rsa_map 66 | 67 | # Dependencies ------------------------------------------------------------- 68 | def _init_dssp_path(self) -> str: 69 | """Find an existing executable file for DSSP on the computer.""" 70 | if self.executable_path is not None: 71 | dssp_path_list = [self.executable_path] + self.CANDIDATES_PATHS 72 | else: 73 | dssp_path_list = self.CANDIDATES_PATHS 74 | dssp_path = find_file(dssp_path_list, is_software=True, name="DSSP", description=self.HELPER_LOG, verbose=self.verbose) 75 | self.dssp_path = dssp_path 76 | 77 | def _inject_cryst1_line(self, pdb_path: str) -> Union[None, str]: 78 | """Inject CRYST1 line in a PDB file if there is not one. 79 | -> If CRYST1 line is present, return None 80 | -> Else return a string of the PDB file with the CRYST1 line 81 | """ 82 | 83 | # No need to inject CRYST1 line with mkdssp 84 | if self.dssp_path.endswith("mkdssp"): 85 | return None 86 | 87 | # Constants 88 | CRYST1_HEADER = "CRYST1" 89 | ATOM_HEADER = "ATOM" 90 | DEFAULT_CRYST1_LINE = "CRYST1 1.000 1.000 1.000 90.00 90.00 90.00 P 1 1 \n" 91 | 92 | # Read lines 93 | new_lines = [] 94 | with open(pdb_path, "r") as fs: 95 | line = fs.readline() 96 | 97 | # Read lines to detect CRYST1 line 98 | while line: 99 | if line.startswith(CRYST1_HEADER): # Return None to specify that CRYST1 line is already here 100 | return None 101 | if line.startswith(ATOM_HEADER): 102 | new_lines.append(DEFAULT_CRYST1_LINE) 103 | new_lines.append(line) 104 | line = fs.readline() 105 | break 106 | new_lines.append(line) 107 | line = fs.readline() 108 | 109 | # After injecting CRYST1 line, continue following lines 110 | while line: 111 | new_lines.append(line) 112 | line = fs.readline() 113 | 114 | # Return pdb string with injected CRYST1 line 115 | return "".join(new_lines) 116 | 117 | def _run_dssp_backend(self, pdb_path: str) -> Dict[str, float]: 118 | """Run DSSP software with the BioPython interface.""" 119 | 120 | # Parse PDB with BioPython 121 | pdb_name = os.path.basename(pdb_path).removesuffix(".pdb") 122 | structure = PDBParser(QUIET=True).get_structure(pdb_name, pdb_path) 123 | model = structure[0] 124 | 125 | # Run DSSP 126 | if not self.verbose: # Run DSSP with WARNINGS desabled 127 | with suppress_stderr(): 128 | dssp = DSSP(model, pdb_path, dssp=self.dssp_path) 129 | else: # Run DSSP normally 130 | dssp = DSSP(model, pdb_path, dssp=self.dssp_path) 131 | 132 | # Parse Residues 133 | resid_set = set() 134 | residues_keys = list(dssp.keys()) 135 | rsa_map: Dict[str, float] = {} 136 | for res_key in residues_keys: 137 | chain, (res_insertion, res_id, res_alternate_location) = res_key 138 | resid = f"{chain}{res_insertion}{res_id}".replace(" ", "") 139 | if resid not in resid_set: 140 | res_data = dssp[res_key] 141 | resid_set.add(resid) 142 | rsa = res_data[3] 143 | if isinstance(rsa, float): 144 | rsa = round(rsa * 100.0, 4) 145 | rsa_map[resid] = rsa 146 | 147 | # Return 148 | return rsa_map 149 | 150 | # Just to delete WARNINGS from DSSP and BioPython ------------------------------ 151 | # Because BioPython and DSSP does not provide a disable WARNINGS option ... 152 | @contextmanager 153 | def suppress_stderr(): 154 | """Redirect standard error to null (with some magic)""" 155 | original_stderr = sys.stderr 156 | sys.stderr = open(os.devnull, 'w') 157 | try: 158 | yield 159 | finally: 160 | sys.stderr.close() 161 | sys.stderr = original_stderr -------------------------------------------------------------------------------- /rsalor/rsa/rsa_music.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import Dict 5 | import tempfile 6 | import subprocess 7 | from rsalor.utils import find_file 8 | from rsalor.rsa.rsa_solver import RSASolver 9 | 10 | # RSAMuSiC --------------------------------------------------------------------- 11 | class RSAMuSiC(RSASolver): 12 | """ 13 | RSAMuSiC(): Solver for RSA (Relative Solvent Accessibility) from a '.pdb' file using MuSiC software. 14 | 15 | usage: 16 | rsa_map = RSAMuSiC('./soft/MuSiC/music').run('./my_pdb.pdb') 17 | """ 18 | 19 | # Constants ---------------------------------------------------------------- 20 | 21 | # Saved condidate paths to simplify execution on different machines 22 | CANDIDATES_PATHS = [ 23 | "music", "music_retro", # MuSiC executables 24 | "/home/Softs/MuSiC-4.1/music", # Nautilus and Santorin 25 | ] 26 | 27 | # Helper: hint to solve problem form the user: install software please 28 | HELPER_LOG = """------------------------------------------------------- 29 | RSA Solver: MuSiC issue: 30 | In order to solve Relative Solvent Accessiblity (RSA), RSALOR package uses MuSiC software. 31 | MuSiC is our in house protein structure software (https://soft.dezyme.com/). 32 | Please install MuSiC and specify the path to its executable or add it to system PATH. 33 | 34 | Alternatively, if you do not have access to MuSiC, set rsa_solver='DSSP' and install DSSP (free for academic uses) 35 | DSSP: https://swift.cmbi.umcn.nl/gv/dssp/ 36 | 37 | NOTE: you can still use the RSALOR package without MuSiC if you only want LOR values of the MSA without using RSA (just set pdb_path=None). 38 | -------------------------------------------------------""" 39 | 40 | # Methods ------------------------------------------------------------------ 41 | def __str__(self) -> str: 42 | return "RSASolver['MuSiC']" 43 | 44 | def execute_solver(self, pdb_path: str) -> Dict[str, float]: 45 | """Compute RSA by running MuSiC: 'music -cat' 46 | 47 | args: 48 | pdb_path (str): path to PDB file 49 | 50 | output: 51 | {resid: str => RSA: float} (such as {'A13': 48.57, ...}) 52 | """ 53 | 54 | # Init MuSiC path (check MuSiC executable existance only if software is executed) 55 | self._init_music_path() 56 | 57 | # Using temporary directory 58 | with tempfile.TemporaryDirectory() as tmp_dir: 59 | 60 | # Init 61 | name = os.path.basename(pdb_path).removesuffix(".pdb") 62 | pdb_dir = os.path.abspath(os.path.dirname(pdb_path)) 63 | path_in_path = os.path.join(tmp_dir, "path.in") 64 | cat_path = os.path.join(tmp_dir, f"{name}.cat") 65 | log_path = os.path.join(tmp_dir, f"log_{name}.txt") 66 | 67 | # Generate path.in file 68 | path_in = "\n".join([ 69 | f"DATA {os.path.join(os.path.dirname(self.music_path), 'MuSiC/Data/')}", 70 | f"PDB {pdb_dir}/", 71 | f"OUTPUT {tmp_dir}/", 72 | f"CAT {tmp_dir}/\n" 73 | ]) 74 | with open(path_in_path, "w") as fs: 75 | fs.write(path_in) 76 | 77 | # Adapt run to MuSiC 4.0 or 4.1 assuming version is specified in MuSiC folder name 78 | music_last_folder_name = os.path.basename(os.path.dirname(self.music_path)) 79 | sidechain_parameter = "FULLATOM" if "4.1" in music_last_folder_name else "" 80 | 81 | # Run MuSiC cat 82 | music_cmd = f"{self.music_path} -cat {name} {sidechain_parameter} {name} -init {path_in_path} -log {name}" 83 | if self.verbose: 84 | print(f" * run MuSiC command: '{music_cmd}'") 85 | process = subprocess.run(music_cmd.split(), shell=False, capture_output=True, text=True) 86 | 87 | # Check execution errors 88 | if process.returncode != 0: 89 | self._log_music_execution_error(process, log_path, music_cmd) 90 | raise ValueError(f"ERROR in {self}.execute_solver(): execution of MuSiC failed.") 91 | if not os.path.isfile(cat_path): 92 | self._log_music_execution_error(process, log_path, music_cmd) 93 | raise ValueError(f"ERROR in {self}.execute_solver(): execution of MuSiC succeeded but no output '.cat' file is generated at '{cat_path}'.") 94 | 95 | # Log output 96 | #if self.verbose: 97 | # print("\nMuSiC logs: ") 98 | # music_lines = result.stdout.split("\n") 99 | # music_lines = [line for line in music_lines if len(line) > 0] 100 | # print("\n".join(music_lines) + "\n") 101 | 102 | # Parse MuSiC -cat output 103 | try: 104 | rsa_map = self._parse_cat(cat_path) 105 | except: 106 | raise ValueError(f"ERROR in {self}.execute_solver(): failed to parse RSA from generated file '{cat_path}'.") 107 | assert len(rsa_map) > 0, f"ERROR in {self}.execute_solver(): no valid RSA data found in file '{cat_path}'." 108 | 109 | # Return 110 | return rsa_map 111 | 112 | # Dependencies ------------------------------------------------------------- 113 | def _init_music_path(self) -> str: 114 | """Find an existing executable file for MuSiC on the computer.""" 115 | if self.executable_path is not None: 116 | music_path_list = [self.executable_path] + self.CANDIDATES_PATHS 117 | else: 118 | music_path_list = self.CANDIDATES_PATHS 119 | music_path = find_file(music_path_list, is_software=True, name="MuSiC", description=self.HELPER_LOG, verbose=self.verbose) 120 | self.music_path = music_path 121 | 122 | def _parse_cat(self, file_path: str) -> Dict[str, float]: 123 | """Parse music '.cat' file and return RSA mapping: {resid: str => RSA: float}.""" 124 | 125 | # Guardians 126 | assert os.path.isfile(file_path), f"ERROR in {self}._parse_cat(): file_path='{file_path}' does not exists." 127 | assert file_path.endswith(".cat"), f"ERROR in {self}._parse_cat(): file_path='{file_path}' should end with '.cat'." 128 | 129 | # Read cat file 130 | with open(file_path, "r") as fs: 131 | 132 | # Skip lines before #RESIDUES section 133 | line = fs.readline() 134 | while line and not line.startswith("#RESIDUES"): 135 | if line.startswith("#RESIDUES"): break 136 | line = fs.readline() 137 | line = fs.readline() 138 | 139 | # Read #RESIDUES section 140 | rsa_map: Dict[str, float] = {} 141 | while line: 142 | 143 | # Break after #RESIDUES section ends 144 | if line.startswith("#"): break 145 | 146 | # Parse values from line 147 | resid = line[0:6].replace(" ", "") 148 | rsa = float(line[30:40]) 149 | 150 | # Save values 151 | rsa_map[resid] = rsa 152 | line = fs.readline() 153 | 154 | # Sanity chech and return 155 | return rsa_map 156 | 157 | def _log_music_execution_error(self, music_run_process, log_path: str, music_cmd: str) -> None: 158 | print("\nERROR in MuSiC execution.") 159 | print(" * MuSiC command: ") 160 | print(f" $ {music_cmd}") 161 | print(" * Standard Output (stdout): ") 162 | print(music_run_process.stdout) 163 | print(" * Error Output (stderr): ") 164 | print(music_run_process.stderr) 165 | print(" * Log file content: ") 166 | if os.path.isfile(log_path): 167 | with open(log_path, "r") as fs: 168 | log_lines = "\n".join(list(fs.readlines())) 169 | print(log_lines) -------------------------------------------------------------------------------- /rsalor/rsa/rsa_solver.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from abc import ABC, abstractmethod 5 | from typing import Dict, Union 6 | 7 | 8 | # Abstract RSASolver class ----------------------------------------------------- 9 | class RSASolver(ABC): 10 | """ 11 | Abstract container class for RSASolver: to compute RSA (Relative Solvent Accessibility) from a PDB file. 12 | 13 | usage: 14 | rsa_map = RSASolver('./soft/software_executable').run('./my_pdb.pdb') 15 | """ 16 | 17 | # Constants ---------------------------------------------------------------- 18 | COMMENT_CHAR = "#" 19 | 20 | # Constructor -------------------------------------------------------------- 21 | def __init__( 22 | self, 23 | executable_path: Union[None, str]=None, 24 | verbose: bool=False, 25 | ): 26 | self.executable_path = executable_path 27 | self.verbose = verbose 28 | 29 | # Methods ------------------------------------------------------------------ 30 | def __str__(self) -> str: 31 | return "RSASolver['AbstractSolver']" 32 | 33 | def run( 34 | self, 35 | pdb_path: str, 36 | rsa_cache_path: Union[None, str]=None, 37 | ) -> Dict[str, float]: 38 | """Compute RSA by running the solver or using the caced file. 39 | 40 | args: 41 | pdb_path (str): path to PDB file 42 | rsa_cache_path (str): path to/from where save/read RSA (if file exists, solver execution will be skipped and output directly read from file) 43 | 44 | output: 45 | {resid: str => RSA: float} (such as {'A13': 48.57, ...}) 46 | """ 47 | 48 | # Parse RSA if cache file exists 49 | if rsa_cache_path is not None and os.path.isfile(rsa_cache_path): 50 | if self.verbose: 51 | print(f" * read RSA values from rsa_cache_path '{rsa_cache_path}'") 52 | rsa_map = self.read(rsa_cache_path) 53 | return rsa_map 54 | 55 | # PDB file Guardians 56 | assert os.path.isfile(pdb_path), f"ERROR in {self}.execute_solver(): pdb_path='{pdb_path}' file does not exists." 57 | assert pdb_path.endswith(".pdb"), f"ERROR in {self}.execute_solver(): pdb_path='{pdb_path}' should be a '.pdb' file." 58 | 59 | # Run solver 60 | if self.verbose: 61 | print(f" * execute RSA solver: {self}") 62 | rsa_map = self.execute_solver(pdb_path) 63 | 64 | # Write RSA map 65 | if rsa_cache_path is not None and not os.path.isfile(rsa_cache_path): 66 | if self.verbose: 67 | print(f" * save RSA values in rsa_cache_path '{rsa_cache_path}'") 68 | self.write(rsa_cache_path, rsa_map) 69 | 70 | # Return 71 | return rsa_map 72 | 73 | @abstractmethod 74 | def execute_solver(self, pdb_path: str) -> Dict[str, float]: 75 | """Compute RSA by running the solver. 76 | 77 | args: 78 | pdb_path (str): path to PDB file 79 | 80 | output: 81 | {resid: str => RSA: float} (such as {'A13': 48.57, ...}) 82 | """ 83 | pass 84 | 85 | def read(self, file_path: str) -> Dict[str, float]: 86 | """Read rsa_map file and return RSA mapping: {resid: str => RSA: float}.""" 87 | 88 | # Guardians 89 | assert os.path.isfile(file_path), f"ERROR in {self}.read(): file file_path='{file_path}' does not exist." 90 | 91 | # Parse and return 92 | rsa_map: Dict[str, float] = {} 93 | with open(file_path, "r") as fs: 94 | lines = [line.split() for line in fs.readlines() if len(line) > 2 and line[0] != self.COMMENT_CHAR] 95 | for line in lines: 96 | resid, rsa = line[0], line[1] 97 | rsa_map[resid] = float(rsa) 98 | 99 | # Guardian and return 100 | assert len(rsa_map) > 0, f"ERROR in {self}.read(): No RSA data found in file_path='{file_path}'." 101 | return rsa_map 102 | 103 | def write(self, file_path: str, rsa_map: Dict[str, float]) -> "RSASolver": 104 | """Write rsa_map to a file.""" 105 | 106 | # Guardians 107 | file_path = os.path.abspath(file_path) 108 | assert os.path.isdir(os.path.dirname(file_path)), f"ERROR in {self}.write(): directory of file_path='{file_path}' does not exists." 109 | 110 | # Stringity 111 | rsa_map_str = "\n".join(f"{resid} {rsa}" for resid, rsa in rsa_map.items()) 112 | 113 | # Write 114 | with open(file_path, "w") as fs: 115 | fs.write(rsa_map_str) 116 | 117 | return self -------------------------------------------------------------------------------- /rsalor/sequence/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.sequence.amino_acid import AminoAcid 2 | from rsalor.sequence.mutation import Mutation 3 | from rsalor.sequence.sequence import Sequence 4 | from rsalor.sequence.fasta_reader import FastaReader, FastaStream 5 | from rsalor.sequence.pairwise_alignment import PairwiseAlignment 6 | -------------------------------------------------------------------------------- /rsalor/sequence/amino_acid.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | from typing import List, Dict 4 | 5 | # AminoAcid -------------------------------------------------------------------- 6 | class AminoAcid: 7 | """Container class for the 20 standard proteogenic amino acids. 8 | * Manages mapping between amino acids: id, one-letter-code and three-letter-code 9 | * Manages non-standard amino acids three-letters-codes and their corresponding standard AA 10 | 11 | usage: 12 | 13 | alanine_id: int = AminoAcid.ONE_2_ID['A'] \n 14 | one_letter_code_3: str = AminoAcid.ID_2_ONE[3] \n 15 | ala: AminoAcid= AminoAcid('A') \n 16 | non_standard_metionine: AminoAcid = AminoAcid.parse_three('MSE') 17 | """ 18 | 19 | # Static Properties -------------------------------------------------------- 20 | 21 | # Standard Amino Acids metadata 22 | _AA_LIST = [ 23 | ( 0, "A", "ALA", "alanine"), 24 | ( 1, "C", "CYS", "cysteine"), 25 | ( 2, "D", "ASP", "aspartate"), 26 | ( 3, "E", "GLU", "glutamate"), 27 | ( 4, "F", "PHE", "phenylalanine"), 28 | ( 5, "G", "GLY", "glycine"), 29 | ( 6, "H", "HIS", "histidine"), 30 | ( 7, "I", "ILE", "isoleucine"), 31 | ( 8, "K", "LYS", "lysine"), 32 | ( 9, "L", "LEU", "leucine"), 33 | (10, "M", "MET", "methionine"), 34 | (11, "N", "ASN", "asparagine"), 35 | (12, "P", "PRO", "proline"), 36 | (13, "Q", "GLN", "glutamine"), 37 | (14, "R", "ARG", "arginine"), 38 | (15, "S", "SER", "serine"), 39 | (16, "T", "THR", "thréonine"), 40 | (17, "V", "VAL", "valine"), 41 | (18, "W", "TRP", "tryptophane"), 42 | (19, "Y", "TYR", "tyrosine"), 43 | ] 44 | 45 | # Gap and Unknown Amino Acid properties 46 | GAP_ID = 20 47 | GAP_ONE = "-" 48 | GAP_THREE = "GAP" 49 | GAP_NAME = "gap" 50 | UNK_ID = -1 51 | UNK_ONE = "X" 52 | UNK_THREE = "XXX" 53 | UNK_NAME = "unknown" 54 | 55 | # Non-Standard Amino Acids three-letter-codes mapped to closest Standard Amino Acids 56 | _NON_STANDARD_AAS = { 57 | # WARNING: We do not represent here ambiguous mappings like: "GLX" => "GLN" or "GLU" 58 | "4HT": "TRP", "CLG": "LYS", "HSE": "SER", "BIF": "PHE", "B3D": "ASP", "BB8": "PHE", "3MY": "TYR", "SNK": "HIS", 59 | "3CF": "PHE", "A5N": "ASN", "LED": "LEU", "TOX": "TRP", "CR5": "GLY", "ILM": "ILE", "0A9": "PHE", "DAS": "ASP", 60 | "NYS": "CYS", "73P": "LYS", "MSO": "MET", "IYR": "TYR", "PR9": "PRO", "R4K": "TRP", "L5P": "LYS", "31Q": "CYS", 61 | "OCY": "CYS", "BH2": "ASP", "XSN": "ASN", "SXE": "SER", "GMA": "GLU", "SEP": "SER", "CYD": "CYS", "YPZ": "TYR", 62 | "GPL": "LYS", "RVX": "SER", "YCM": "CYS", "SEL": "SER", "DNE": "LEU", "LEN": "LEU", "4FB": "PRO", "4OU": "PHE", 63 | "LGY": "LYS", "TTQ": "TRP", "DBB": "THR", "LBZ": "LYS", "QX7": "ALA", "H14": "PHE", "CIR": "ARG", "73O": "TYR", 64 | "EI4": "ARG", "LVN": "VAL", "SRZ": "SER", "55I": "PHE", "UF0": "SER", "YHA": "LYS", "QM8": "ALA", "TQQ": "TRP", 65 | "QIL": "ILE", "Q75": "MET", "11Q": "PRO", "A8E": "VAL", "DHV": "VAL", "3BY": "PRO", "2ZC": "SER", "T9E": "THR", 66 | "CSZ": "CYS", "5CS": "CYS", "KPI": "LYS", "0AH": "SER", "HSK": "HIS", "TH6": "THR", "ARO": "ARG", "E9V": "HIS", 67 | "UXQ": "PHE", "MHL": "LEU", "CAS": "CYS", "8RE": "LYS", "LLP": "LYS", "PTH": "TYR", "ORQ": "ARG", "73N": "ARG", 68 | "BTK": "LYS", "HVA": "VAL", "LMQ": "GLN", "FME": "MET", "XX1": "LYS", "I7F": "SER", "4N9": "PRO", "TYJ": "TYR", 69 | "BOR": "ARG", "HL2": "LEU", "73C": "SER", "0CS": "ALA", "AGM": "ARG", "CYW": "CYS", "ASL": "ASP", "I3D": "TRP", 70 | "NPH": "CYS", "JKH": "PRO", "QMB": "ALA", "XCN": "CYS", "PHI": "PHE", "NAL": "ALA", "LYZ": "LYS", "6M6": "CYS", 71 | "VAD": "VAL", "EXL": "TRP", "WFP": "PHE", "823": "ASN", "CLH": "LYS", "C6C": "CYS", "DCY": "CYS", "DPP": "ALA", 72 | "KHB": "LYS", "DNW": "ALA", "BUC": "CYS", "CSU": "CYS", "H5M": "PRO", "RXL": "VAL", "FOE": "CYS", "GHP": "GLY", 73 | "2KP": "LYS", "OMX": "TYR", "ZCL": "PHE", "MGG": "ARG", "DLS": "LYS", "30V": "CYS", "02K": "ALA", "DA2": "ARG", 74 | "TYY": "TYR", "HRG": "ARG", "PHL": "PHE", "PRJ": "PRO", "M2L": "LYS", "SUN": "SER", "TSY": "CYS", "PF5": "PHE", 75 | "4CF": "PHE", "1OP": "TYR", "CSB": "CYS", "POM": "PRO", "ELY": "LYS", "TRQ": "TRP", "BP5": "ALA", "5VV": "ASN", 76 | "6DN": "LYS", "MIS": "SER", "MLZ": "LYS", "EME": "GLU", "4J5": "ARG", "MPQ": "GLY", "LLO": "LYS", "FQA": "LYS", 77 | "PR7": "PRO", "NLW": "LEU", "OMY": "TYR", "5CT": "LYS", "PRK": "LYS", "DPQ": "TYR", "N0A": "PHE", "3QN": "LYS", 78 | "K5H": "CYS", "HNC": "CYS", "TYO": "TYR", "Q3P": "LYS", "BWV": "ARG", "4L0": "PRO", "ZAL": "ALA", "IAM": "ALA", 79 | "AGQ": "TYR", "07O": "CYS", "PCA": "GLN", "2MR": "ARG", "TRN": "TRP", "4AR": "ARG", "HLY": "LYS", "DHI": "HIS", 80 | "J2F": "TYR", "C3Y": "CYS", "GL3": "GLY", "BTR": "TRP", "OYL": "HIS", "IGL": "GLY", "2GX": "PHE", "8LJ": "PRO", 81 | "AYA": "ALA", "XYC": "ALA", "CY1": "CYS", "CGU": "GLU", "PM3": "PHE", "03Y": "CYS", "CE7": "ASN", "HSL": "SER", 82 | "BXT": "SER", "MHU": "PHE", "HOX": "PHE", "5GM": "ILE", "DVA": "VAL", "CYR": "CYS", "YOF": "TYR", "DDZ": "ALA", 83 | "4PQ": "TRP", "ECC": "GLN", "GHG": "GLN", "IPG": "GLY", "PPN": "PHE", "L3O": "LEU", "AEA": "CYS", "7N8": "PHE", 84 | "AHO": "ALA", "TBG": "VAL", "BFD": "ASP", "HPE": "PHE", "5MW": "LYS", "U2X": "TYR", "N10": "SER", "TGH": "TRP", 85 | "51T": "TYR", "DDE": "HIS", "DBZ": "ALA", "FF9": "LYS", "HTN": "ASN", "NVA": "VAL", "HS9": "HIS", "ACB": "ASP", 86 | "9KP": "LYS", "FTR": "TRP", "ALS": "ALA", "DYJ": "PRO", "RPI": "ARG", "FTY": "TYR", "TQZ": "CYS", "FVA": "VAL", 87 | "CS4": "CYS", "QVA": "CYS", "XPR": "PRO", "0QL": "CYS", "TCQ": "TYR", "OXX": "ASP", "ZZJ": "ALA", "LDH": "LYS", 88 | "3CT": "TYR", "H7V": "ALA", "4N7": "PRO", "PYA": "ALA", "WVL": "VAL", "DMK": "ASP", "EFC": "CYS", "0BN": "PHE", 89 | "MHO": "MET", "ECX": "CYS", "ESB": "TYR", "KGC": "LYS", "3WX": "PRO", "MBQ": "TYR", "ILX": "ILE", "DSG": "ASN", 90 | "P2Q": "TYR", "LSO": "LYS", "6CW": "TRP", "SDP": "SER", "MP8": "PRO", "HTR": "TRP", "B3S": "SER", "TYB": "TYR", 91 | "PAQ": "TYR", "HS8": "HIS", "RX9": "ILE", "DHA": "SER", "CHP": "GLY", "MMO": "ARG", "FCL": "PHE", "05O": "TYR", 92 | "ICY": "CYS", "DIV": "VAL", "N65": "LYS", "Q78": "PHE", "KCR": "LYS", "TY8": "TYR", "GVL": "SER", "MLL": "LEU", 93 | "DNP": "ALA", "5XU": "ALA", "O7D": "TRP", "NFA": "PHE", "DBY": "TYR", "QCS": "CYS", "ZYK": "PRO", "IIL": "ILE", 94 | "ABA": "ALA", "4AW": "TRP", "BSE": "SER", "LLY": "LYS", "4D4": "ARG", "MNL": "LEU", "FGL": "GLY", "SET": "SER", 95 | "MYN": "ARG", "C4R": "CYS", "CZZ": "CYS", "CZS": "ALA", "Y1V": "LEU", "CWR": "SER", "NBQ": "TYR", "KYQ": "LYS", 96 | "2TY": "TYR", "1PA": "PHE", "6V1": "CYS", "FGP": "SER", "BB9": "CYS", "AGT": "CYS", "CYG": "CYS", "VI3": "CYS", 97 | "PH6": "PRO", "NZH": "HIS", "DAB": "ALA", "B2A": "ALA", "6WK": "CYS", "PR4": "PRO", "7O5": "ALA", "OHS": "ASP", 98 | "3YM": "TYR", "Z3E": "THR", "NC1": "SER", "CAF": "CYS", "BPE": "CYS", "BB7": "CYS", "RE0": "TRP", "TSQ": "PHE", 99 | "4CY": "MET", "G5G": "LEU", "TDD": "LEU", "KCX": "LYS", "0AR": "ARG", "HSV": "HIS", "2ML": "LEU", "4PH": "PHE", 100 | "V44": "CYS", "IAS": "ASP", "FH7": "LYS", "PTM": "TYR", "SAR": "GLY", "SVX": "SER", "MEN": "ASN", "CS1": "CYS", 101 | "HOO": "HIS", "NYB": "CYS", "HMR": "ARG", "05N": "PRO", "V61": "PHE", "41H": "PHE", "BMT": "THR", "4HL": "TYR", 102 | "I2M": "ILE", "4N8": "PRO", "2RX": "SER", "CS3": "CYS", "MEA": "PHE", "B2F": "PHE", "CYF": "CYS", "GNC": "GLN", 103 | "4HJ": "SER", "CSJ": "CYS", "2SO": "HIS", "Q2E": "TRP", "CXM": "MET", "4WQ": "ALA", "5OW": "LYS", "TRX": "TRP", 104 | "B3Y": "TYR", "DAH": "PHE", "5PG": "GLY", "ESC": "MET", "DTY": "TYR", "CGA": "GLU", "TFW": "TRP", "SMF": "PHE", 105 | "S1H": "SER", "SAC": "SER", "QCI": "GLN", "CMT": "CYS", "TY2": "TYR", "0A8": "CYS", "OMH": "SER", "QPA": "CYS", 106 | "MK8": "LEU", "DLE": "LEU", "T0I": "TYR", "ALT": "ALA", "3X9": "CYS", "5CW": "TRP", "9E7": "LYS", "MGN": "GLN", 107 | "PBF": "PHE", "AEI": "THR", "TYI": "TYR", "SNN": "ASN", "74P": "LYS", "OHI": "HIS", "KST": "LYS", "SBL": "SER", 108 | "JJJ": "CYS", "JJL": "CYS", "2RA": "ALA", "DIL": "ILE", "02Y": "ALA", "CYJ": "LYS", "2HF": "HIS", "FC0": "PHE", 109 | "NLN": "LEU", "XW1": "ALA", "QMM": "GLN", "TOQ": "TRP", "WPA": "PHE", "TIH": "ALA", "NLB": "LEU", "BG1": "SER", 110 | "PTR": "TYR", "0WZ": "TYR", "ZYJ": "PRO", "SNC": "CYS", "BBC": "CYS", "B3E": "GLU", "4GJ": "CYS", "MSA": "GLY", 111 | "TPO": "THR", "HIQ": "HIS", "PHA": "PHE", "THC": "THR", "JJK": "CYS", "API": "LYS", "TY5": "TYR", "LPD": "PRO", 112 | "MND": "ASN", "PRV": "GLY", "M3L": "LYS", "HR7": "ARG", "86N": "GLU", "DSN": "SER", "5R5": "SER", "IC0": "GLY", 113 | "ARM": "ARG", "4AK": "LYS", "HT7": "TRP", "E9M": "TRP", "4DP": "TRP", "IML": "ILE", "BCS": "CYS", "7OZ": "ALA", 114 | "2MT": "PRO", "GLZ": "GLY", "0E5": "THR", "U3X": "PHE", "HYP": "PRO", "M0H": "CYS", "7XC": "PHE", "AZK": "LYS", 115 | "AHB": "ASN", "NCB": "ALA", "ASA": "ASP", "TPL": "TRP", "0TD": "ASP", "HTI": "CYS", "LRK": "LYS", "ME0": "MET", 116 | "143": "CYS", "FY2": "TYR", "1TY": "TYR", "QPH": "PHE", "F2F": "PHE", "3PX": "PRO", "PLJ": "PRO", "N9P": "ALA", 117 | "3ZH": "HIS", "C5C": "CYS", "PFF": "PHE", "NEP": "HIS", "CSA": "CYS", "4J4": "CYS", "O7G": "VAL", "TTS": "TYR", 118 | "KFP": "LYS", "FZN": "LYS", "TYN": "TYR", "AA4": "ALA", "LYX": "LYS", "HP9": "PHE", "TH5": "THR", "D2T": "ASP", 119 | "MED": "MET", "TRW": "TRP", "HLU": "LEU", "CSO": "CYS", "23F": "PHE", "PG9": "GLY", "EJA": "CYS", "RE3": "TRP", 120 | "66D": "ILE", "4OG": "TRP", "MSE": "MET", "MDF": "TYR", "DBU": "THR", "SEN": "SER", "Y57": "LYS", "XA6": "PHE", 121 | "M2S": "MET", "FLT": "TYR", "GME": "GLU", "LE1": "VAL", "FY3": "TYR", "OZW": "PHE", "FP9": "PRO", "FHL": "LYS", 122 | "MLE": "LEU", "DAR": "ARG", "BHD": "ASP", "LA2": "LYS", "SLZ": "LYS", "CSX": "CYS", "OCS": "CYS", "DMH": "ASN", 123 | "2CO": "CYS", "NLE": "LEU", "LME": "GLU", "HIC": "HIS", "ZBZ": "CYS", "MYK": "LYS", "2JG": "SER", "ORN": "ALA", 124 | "YTF": "GLN", "1AC": "ALA", "OLD": "HIS", "B2I": "ILE", "HZP": "PRO", "4AF": "PHE", "OMT": "MET", "CSP": "CYS", 125 | "APK": "LYS", "DPR": "PRO", "CY0": "CYS", "5T3": "LYS", "CY3": "CYS", "3GL": "GLU", "4II": "PHE", "0AK": "ASP", 126 | "ALC": "ALA", "LP6": "LYS", "HIP": "HIS", "60F": "CYS", "CML": "CYS", "CYQ": "CYS", "NA8": "ALA", "MH6": "SER", 127 | "GFT": "SER", "WLU": "LEU", "AZH": "ALA", "KBE": "LYS", "LCK": "LYS", "LAY": "LEU", "0LF": "PRO", "KKD": "ASP", 128 | "K7K": "SER", "CSR": "CYS", "B3K": "LYS", "OSE": "SER", "F2Y": "TYR", "NMM": "ARG", "P1L": "CYS", "PRS": "PRO", 129 | "OBS": "LYS", "ZDJ": "TYR", "BYR": "TYR", "HY3": "PRO", "ASB": "ASP", "NLY": "GLY", "0A1": "TYR", "DPL": "PRO", 130 | "SCS": "CYS", "I4G": "GLY", "6CV": "ALA", "HIA": "HIS", "LYN": "LYS", "54C": "TRP", "FGA": "GLU", "B27": "THR", 131 | "TYE": "TYR", "DTH": "THR", "PSH": "HIS", "EXA": "LYS", "BLE": "LEU", "P9S": "CYS", "23P": "ALA", "1TQ": "TRP", 132 | "RVJ": "ALA", "ALO": "THR", "FL6": "ASP", "4LZ": "TYR", "TMD": "THR", "FHO": "LYS", "0FL": "ALA", "AN6": "LEU", 133 | "4OV": "SER", "432": "SER", "SCH": "CYS", "DGL": "GLU", "2TL": "THR", "TPQ": "TYR", "3AH": "HIS", "CSD": "CYS", 134 | "PR3": "CYS", "IZO": "MET", "DV9": "GLU", "41Q": "ASN", "DI7": "TYR", "34E": "VAL", "MHS": "HIS", "GGL": "GLU", 135 | "ALY": "LYS", "O6H": "TRP", "8JB": "CYS", "SVV": "SER", "KOR": "MET", "PYX": "CYS", "6CL": "LYS", "WRP": "TRP", 136 | "SCY": "CYS", "G1X": "TYR", "2KK": "LYS", "TYQ": "TYR", "MIR": "SER", "ALN": "ALA", "CMH": "CYS", "KPY": "LYS", 137 | "SVZ": "SER", "NMC": "GLY", "RGL": "ARG", "SME": "MET", "DAL": "ALA", "DTR": "TRP", "PEC": "CYS", "SGB": "SER", 138 | "NLO": "LEU", "AHP": "ALA", "SLL": "LYS", "TRF": "TRP", "CME": "CYS", "SEE": "SER", "MME": "MET", "DYA": "ASP", 139 | "33X": "ALA", "LYF": "LYS", "CZ2": "CYS", "TRO": "TRP", "DPN": "PHE", "IB9": "TYR", "POK": "ARG", "LET": "LYS", 140 | "CCS": "CYS", "DGN": "GLN", "NIY": "TYR", "E9C": "TYR", "SEB": "SER", "AIB": "ALA", "OAS": "SER", "V7T": "LYS", 141 | "K5L": "SER", "TYS": "TYR", "FIO": "ARG", "B2V": "VAL", "GLJ": "GLU", "JLP": "LYS", "MVA": "VAL", "0Y8": "PRO", 142 | "OTH": "THR", "00C": "CYS", "0EA": "TYR", "F7W": "TRP", "LEI": "VAL", "UMA": "ALA", "OLT": "THR", "4KY": "PRO", 143 | "MCS": "CYS", "TNQ": "TRP", "HIX": "ALA", "C1X": "LYS", "PAT": "TRP", "T8L": "THR", "DM0": "LYS", "CG6": "CYS", 144 | "KPF": "LYS", "DYS": "CYS", "BB6": "CYS", "LAL": "ALA", "DLY": "LYS", "DJD": "PHE", "LTU": "TRP", "TYT": "TYR", 145 | "VPV": "LYS", "D11": "THR", "LEF": "LEU", "1X6": "SER", "ML3": "LYS", "MAA": "ALA", "7ID": "ASP", "AAR": "ARG", 146 | "NZC": "THR", "R1A": "CYS", "CGV": "CYS", "D3P": "GLY", "TIS": "SER", "LYR": "LYS", "4IN": "TRP", "CY4": "CYS", 147 | "0AF": "TRP", "TLY": "LYS", "SVA": "SER", "4HH": "SER", "HQA": "ALA", "PHD": "ASP", "KYN": "TRP", "4FW": "TRP", 148 | "VHF": "GLU", "CTH": "THR", "B3X": "ASN", "MTY": "TYR", "MLY": "LYS", "SMC": "CYS", "TS9": "ILE", "PXU": "PRO", 149 | "DSE": "SER", "P3Q": "TYR", "BCX": "CYS", "FAK": "LYS", "SVY": "SER", "CSS": "CYS", "FDL": "LYS", "2LT": "TYR", 150 | "N80": "PRO", "B3A": "ALA", "LYO": "LYS", "VR0": "ARG", "YTH": "THR", 151 | } 152 | 153 | # Mappings to aa metadata 154 | _ID_MAP: Dict[int, tuple] = {aa[0]: aa for aa in _AA_LIST} 155 | _ONE_MAP: Dict[str, tuple] = {aa[1]: aa for aa in _AA_LIST} 156 | _THREE_MAP: Dict[str, tuple] = {aa[2]: aa for aa in _AA_LIST} 157 | #_AA_MAP = _ID_MAP | _ONE_MAP | _THREE_MAP 158 | 159 | # Translation tables 160 | THREE_2_ONE: Dict[str, str] = {aa[2]: aa[1] for aa in _AA_LIST} 161 | ONE_2_THREE: Dict[str, str] = {aa[1]: aa[2] for aa in _AA_LIST} 162 | ONE_2_ID: Dict[str, int] = {aa[1]: aa[0] for aa in _AA_LIST} 163 | ID_2_ONE: Dict[int, str] = {aa[0]: aa[1] for aa in _AA_LIST} 164 | 165 | # Construcors -------------------------------------------------------------- 166 | def __init__(self, aa_one: str) -> "AminoAcid": 167 | """Only accepts standard Amino Acid one-letter-codes.""" 168 | assert aa_one in self._ONE_MAP, f"ERROR in AminoAcid('{aa_one}'): invalid amino acid one-letter-code." 169 | aa_metadata = self._ONE_MAP[aa_one] 170 | self.id: int = aa_metadata[0] 171 | self.one: str = aa_metadata[1] 172 | self.three: str = aa_metadata[2] 173 | self.three_standard: str = aa_metadata[2] 174 | self.name: str = aa_metadata[3] 175 | 176 | @classmethod 177 | def parse_three(cls, aa_three: int) -> "AminoAcid": 178 | """Parse an AminoAcid from its three-letter-code (can handle non-standard AAs and mapping to corresponding standard AA).""" 179 | 180 | # Standard case 181 | aa_one = cls.THREE_2_ONE.get(aa_three, None) 182 | if aa_one is not None: 183 | return AminoAcid(aa_one) 184 | 185 | # Non-standard case 186 | aa_three_standard = cls._NON_STANDARD_AAS.get(aa_three, None) 187 | if aa_three_standard is not None: 188 | aa_one = cls.THREE_2_ONE[aa_three_standard] 189 | aa = AminoAcid(aa_one) 190 | aa.three = aa_three 191 | return aa 192 | 193 | # Unknown case 194 | return cls.get_unknown() 195 | 196 | @classmethod 197 | def get_all(cls) -> List["AminoAcid"]: 198 | """Get the list of all 20 standard AminoAcids.""" 199 | return [AminoAcid(aa_metadata[1]) for aa_metadata in cls._AA_LIST] 200 | 201 | @classmethod 202 | def get_unknown(cls) -> "AminoAcid": 203 | """Return an unknown AminoAcid.""" 204 | aa = AminoAcid("A") 205 | aa.id = AminoAcid.UNK_ID 206 | aa.one = AminoAcid.UNK_ONE 207 | aa.three = AminoAcid.UNK_THREE 208 | aa.three_standard = None 209 | aa.name = AminoAcid.UNK_NAME 210 | return aa 211 | 212 | @classmethod 213 | def get_gap(cls) -> "AminoAcid": 214 | """Return a gap 'AminoAcid'.""" 215 | aa = AminoAcid("A") 216 | aa.id = AminoAcid.GAP_ID 217 | aa.one = AminoAcid.GAP_ONE 218 | aa.three = AminoAcid.GAP_THREE 219 | aa.three_standard = None 220 | aa.name = AminoAcid.GAP_NAME 221 | return aa 222 | 223 | # Base properties ---------------------------------------------------------- 224 | def __str__(self) -> str: 225 | if self.is_standard: 226 | return f"AminoAcid('{self.one}', '{self.three}', id={self.id})" 227 | else: 228 | return f"AminoAcid('{self.one}', '{self.three}' (std='{self.three_standard}'), id={self.id})" 229 | 230 | def is_gap(self) -> bool: 231 | self.id == AminoAcid.GAP_ID 232 | 233 | def is_unknown(self) -> bool: 234 | return self.id == AminoAcid.UNK_ID 235 | 236 | def is_aminoacid(self) -> bool: 237 | return not self.is_gap() 238 | 239 | def is_standard(self) -> bool: 240 | return self.three == self.three_standard 241 | 242 | # Class methods ------------------------------------------------------------ 243 | @classmethod 244 | def id_exists(cls, id: int) -> bool: 245 | """Return if 'id' corresponds to the id of a standard Amino Acid.""" 246 | return id in cls._ID_MAP 247 | 248 | @classmethod 249 | def one_exists(cls, aa_one: str) -> bool: 250 | """Return if 'aa_one' corresponds to the one-letter-code of a standard Amino Acid.""" 251 | return aa_one in cls._ONE_MAP 252 | 253 | @classmethod 254 | def three_exists(cls, aa_three: str) -> bool: 255 | """Return if 'aa_three' corresponds to the three-letter-code of a standard Amino Acid.""" 256 | return aa_three in cls._THREE_MAP 257 | -------------------------------------------------------------------------------- /rsalor/sequence/fasta_reader.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import List, Union 5 | from rsalor.sequence import Sequence 6 | 7 | 8 | # FastaReader ------------------------------------------------------------------ 9 | class FastaReader: 10 | """High level FASTA file reader.""" 11 | 12 | @classmethod 13 | def read_first_sequence(cls, fasta_path: str) -> Sequence: 14 | """Read first sequence from a FASTA file.""" 15 | fasta_stream = FastaStream(fasta_path) 16 | sequence = fasta_stream.get_next() 17 | fasta_stream.close() 18 | return sequence 19 | 20 | @classmethod 21 | def read_sequences(cls, fasta_path: str) -> List[Sequence]: 22 | """Read all sequences from a FASTA file.""" 23 | fasta_stream = FastaStream(fasta_path) 24 | sequences = fasta_stream.get_all() 25 | fasta_stream.close() 26 | return sequences 27 | 28 | @classmethod 29 | def count_sequences(cls, fasta_path: str) -> int: 30 | """Count the number of sequences in a FASTA file (just count the '>').""" 31 | 32 | # Guardians 33 | assert os.path.isfile(fasta_path), f"ERROR in FastaReader.count_sequences(): fasta_path='{fasta_path}' does not exists." 34 | assert fasta_path.split(".")[-1] in FastaStream.ACCEPTED_EXTENTIONS, f"ERROR in FastaReader.count_sequences(): fasta_path='{fasta_path}' should end with {FastaStream.ACCEPTED_EXTENTIONS}." 35 | 36 | # Count 37 | HEADER_START_CHAR = Sequence.HEADER_START_CHAR 38 | n = 0 39 | with open(fasta_path) as fs: 40 | line = fs.readline() 41 | while line: 42 | if line.startswith(HEADER_START_CHAR): 43 | n += 1 44 | line = fs.readline() 45 | return n 46 | 47 | 48 | # FastaStream ------------------------------------------------------------------ 49 | class FastaStream: 50 | """Low level class to stream sequences from a FASTA file (one by one to avoid loading the whole file in RAM). 51 | 52 | WARNING: Please use with caution and do not forget to '.close()'. 53 | 54 | usage: 55 | 56 | fasta_stream = FastaStream('./fasta/msa1.fasta') \n 57 | sequence1 = fasta_stream.get_next() \n 58 | sequence2 = fasta_stream.get_next() \n 59 | fasta_stream.close() 60 | """ 61 | 62 | # Constants ---------------------------------------------------------------- 63 | ACCEPTED_EXTENTIONS = ["fasta", "a2m"] 64 | HEADER_START_CHAR = Sequence.HEADER_START_CHAR 65 | 66 | # Constructor -------------------------------------------------------------- 67 | def __init__(self, fasta_path: str): 68 | 69 | # Guardians 70 | assert os.path.isfile(fasta_path), f"ERROR in FastaStream(): fasta_path='{fasta_path}' does not exists." 71 | assert fasta_path.split(".")[-1] in self.ACCEPTED_EXTENTIONS, f"ERROR in FastaStream(): fasta_path='{fasta_path}' should end with {self.ACCEPTED_EXTENTIONS}." 72 | 73 | # Init 74 | self.fasta_path = fasta_path 75 | self.file = open(fasta_path, "r") 76 | self.current_id = -1 77 | self.current_line = self._next_line() 78 | 79 | # First sequence sanity check 80 | assert self.current_line is not None, f"ERROR in FastaStream(): no sequences found in file '{fasta_path}'." 81 | assert self._is_current_line_header(), f"ERROR in FastaStream(): first line of file '{fasta_path}' sould be a fasta header (thus start with '{self.HEADER_START_CHAR}').\nline='{self.current_line}'" 82 | 83 | @property 84 | def is_open(self) -> bool: 85 | """Return if current file/stream is still open""" 86 | return self.current_line is not None 87 | 88 | # Methods ------------------------------------------------------------------ 89 | def close(self) -> None: 90 | """Close file.""" 91 | self.file.close() 92 | self.current_line = None 93 | 94 | def get_next(self) -> Union[None, Sequence]: 95 | """Get next Fasta sequence.""" 96 | if self.current_line is None: 97 | return None 98 | self.current_id += 1 99 | header = self.current_line.removesuffix("\n") 100 | seq_arr = [] 101 | self.current_line = self._next_line() 102 | while self.current_line: 103 | if self._is_current_line_header(): 104 | break 105 | seq_arr.append(self.current_line.removesuffix("\n")) 106 | self.current_line = self._next_line() 107 | 108 | seq = "".join(seq_arr) 109 | return Sequence(header, seq) 110 | 111 | def get_all(self) -> List[Sequence]: 112 | """Get all remaining Fasta sequences.""" 113 | fasta_sequence_list = [] 114 | fasta_sequence = self.get_next() 115 | while fasta_sequence is not None: 116 | fasta_sequence_list.append(fasta_sequence) 117 | fasta_sequence = self.get_next() 118 | return fasta_sequence_list 119 | 120 | # Dependencies ------------------------------------------------------------- 121 | def _next_line(self) -> Union[None, str]: 122 | line = self.file.readline() 123 | if line == "": 124 | self.close() 125 | return None 126 | return line 127 | 128 | def _is_current_line_header(self) -> bool: 129 | return self.current_line.startswith(Sequence.HEADER_START_CHAR) 130 | -------------------------------------------------------------------------------- /rsalor/sequence/mutation.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | from rsalor.utils import is_convertable_to 4 | from rsalor.sequence import AminoAcid 5 | 6 | # Mutation --------------------------------------------------------------------- 7 | class Mutation: 8 | """Container class for a single missence/synonymous mutation on a protein (FASTA) sequence. 9 | 10 | NOTE: Use FASTA residue position convention: so resdue position is an integer and starts at 1. 11 | NOTE: Trivial mutations are accepter (like 'A14A'). 12 | 13 | usage: 14 | mutation: Mutation = Mutation('A14G') 15 | """ 16 | 17 | # Constructor -------------------------------------------------------------- 18 | def __init__(self, mutation_str: str): 19 | 20 | # Unpack and guardians 21 | assert len(mutation_str) >= 3, f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': should be of length 3 or more." 22 | wt_aa, position, mt_aa = mutation_str[0], mutation_str[1:-1], mutation_str[-1] 23 | assert AminoAcid.one_exists(wt_aa), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': wild-type amino acid is incorrect." 24 | assert AminoAcid.one_exists(mt_aa), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': mutant amino acid is incorrect." 25 | assert is_convertable_to(position, int), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': position must be a stricktly positive integer." 26 | position = int(position) 27 | assert position > 0, f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': position must be a stricktly positive integer." 28 | 29 | # Set 30 | self.wt_aa: AminoAcid = AminoAcid(wt_aa) 31 | self.position: int = position 32 | self.mt_aa: AminoAcid = AminoAcid(mt_aa) 33 | 34 | # Methods ------------------------------------------------------------------ 35 | def __str__(self) -> str: 36 | return f"{self.wt_aa.one}{self.position}{self.mt_aa.one}" 37 | 38 | def __int__(self) -> int: 39 | """Return unique integer code for each mutation.""" 40 | return self.position*10000 + self.wt_aa.id*100 + self.mt_aa.id 41 | 42 | def is_trivial(self) -> bool: 43 | """Return if mutation is trivial (like 'A14A').""" 44 | return self.wt_aa == self.mt_aa 45 | -------------------------------------------------------------------------------- /rsalor/sequence/pairwise_alignment.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import List, Tuple, Dict, Union 5 | from Bio.Align import PairwiseAligner 6 | from rsalor.sequence import Sequence 7 | 8 | # PairwiseAlignment ------------------------------------------------------------ 9 | class PairwiseAlignment: 10 | """Class to perform pairwise alignments based on sequence identity. 11 | The aim is to reconsile slightly different inputs of the same protein sequence but with possibly small incoherences like missing residues. 12 | Like the SEQRES and ATOM lines of a PDB. 13 | Or a sequence extracted from a PDB and a sequence from an MSA(for instance, MSA or PDB could cover a different range of the sequence). 14 | 15 | NOTE: Put sequence which is expected to contain the least gaps first 16 | 17 | usage: 18 | 19 | seq1 = FastaSequence("msa_seq", "HHALYDYEARTK") \n 20 | seq2 = FastaSequence("pdb_seq", "ALYDYEART") \n 21 | align = PairwiseAlignment(seq1, seq2) \n 22 | align.show() \n 23 | seq1_to_seq2_id_mapping = align.get_mapping() 24 | """ 25 | 26 | # Constants ---------------------------------------------------------------- 27 | GAP_CHAR = "-" 28 | MATCH_CHAR = "|" 29 | MISMATCH_CHAR = "x" 30 | 31 | # Constructor -------------------------------------------------------------- 32 | def __init__( 33 | self, 34 | sequence1: Sequence, 35 | sequence2: Sequence, 36 | match_score: float=1.0, 37 | mismatch_score: float=-3.0, 38 | open_gap_score: float=-2.5, 39 | extend_gap_score: float=-2.0, 40 | tail_gap_score: float=-2.0, 41 | query_insertion_multiplier: float=3.0, 42 | ): 43 | 44 | # Length Guardians 45 | if len(sequence1) == 0 or len(sequence2) == 0: 46 | print(f" * sequence1: {sequence1}") 47 | print(f" * sequence2: {sequence2}") 48 | raise ValueError("ERROR in PairwiseAlignment(): input target or query sequence can not be of length 0.") 49 | 50 | # Init base properties 51 | self.sequence1 = sequence1 52 | self.sequence2 = sequence2 53 | self.len1 = len(sequence1) 54 | self.len2 = len(sequence2) 55 | self.len_min = min(self.len1, self.len2) 56 | self.len_max = max(self.len1, self.len2) 57 | self.len_ratio = self.len_min / self.len_max 58 | 59 | # Init aligner 60 | self.aligner = PairwiseAligner() 61 | self.aligner.mode = 'global' 62 | self.aligner.match_score = match_score 63 | self.aligner.mismatch_score = mismatch_score 64 | self.aligner.target_internal_open_gap_score = open_gap_score 65 | self.aligner.target_internal_extend_gap_score = extend_gap_score 66 | self.aligner.target_right_open_gap_score = tail_gap_score 67 | self.aligner.target_right_extend_gap_score = tail_gap_score 68 | self.aligner.target_left_open_gap_score = tail_gap_score 69 | self.aligner.target_left_extend_gap_score = tail_gap_score 70 | self.aligner.query_internal_open_gap_score = open_gap_score * query_insertion_multiplier 71 | self.aligner.query_internal_extend_gap_score = extend_gap_score * query_insertion_multiplier 72 | self.aligner.query_left_open_gap_score = tail_gap_score 73 | self.aligner.query_left_extend_gap_score = tail_gap_score 74 | self.aligner.query_right_open_gap_score = tail_gap_score 75 | self.aligner.query_right_extend_gap_score = tail_gap_score 76 | 77 | # Align 78 | alignments = self.aligner.align(self.sequence1.sequence, self.sequence2.sequence) 79 | try: # For Biopython versions 1.80 and later 80 | self.align1: str = alignments[0][0] 81 | self.align2: str = alignments[0][1] 82 | except: # For legacy Biopython versions 83 | alignment_str_list = str(alignments[0]).split() 84 | self.align1: str = alignment_str_list[0] 85 | self.align2: str = alignment_str_list[2] 86 | self.score: float = alignments.score 87 | 88 | # Alignment properties 89 | self.match: int = 0 90 | self.gap: int = 0 91 | self.mismatch: int = 0 92 | comparator_list = [] 93 | for aa1, aa2 in zip(self.align1, self.align2): 94 | if aa1 == self.GAP_CHAR or aa2 == self.GAP_CHAR: 95 | self.gap += 1 96 | comparator_list.append(self.GAP_CHAR) 97 | elif aa1 == aa2: 98 | self.match += 1 99 | comparator_list.append(self.MATCH_CHAR) 100 | else: 101 | self.mismatch += 1 102 | comparator_list.append(self.MISMATCH_CHAR) 103 | self.comparator = "".join(comparator_list) 104 | self.match_ratio: float = self.match / len(self) 105 | self.gap_ratio: float = self.gap / len(self) 106 | self.mismatch_ratio: float = self.mismatch / len(self) 107 | 108 | # Failed alignment error 109 | if self.match == 0: 110 | print("PairwiseAlignment(): failed to align sequences: zero matching positions is the alignemnt.") 111 | print(f" * sequence1: {sequence1}") 112 | print(f" * sequence2: {sequence2}") 113 | raise ValueError("ERROR in PairwiseAlignment(): alignment failed.") 114 | 115 | # Count gap types 116 | self.left_gap, self.right_gap = _count_tail_characters(self.comparator, self.GAP_CHAR) 117 | self.tail_gap: int = self.left_gap + self.right_gap 118 | self.internal_gap: int = self.gap - self.tail_gap 119 | 120 | # Count gap types by sequence 121 | self.gap1 = len(self) - self.len1 122 | self.gap2 = len(self) - self.len2 123 | self.left_gap1, self.right_gap1 = _count_tail_characters(self.align1, self.GAP_CHAR) 124 | self.tail_gap1: int = self.left_gap1 + self.right_gap1 125 | self.internal_gap1: int = self.gap1 - self.tail_gap1 126 | self.left_gap2, self.right_gap2 = _count_tail_characters(self.align2, self.GAP_CHAR) 127 | self.tail_gap2: int = self.left_gap2 + self.right_gap2 128 | self.internal_gap2: int = self.gap2 - self.tail_gap2 129 | 130 | # Some final measures 131 | self.sequence_identity: float = self.match / (self.match + self.mismatch) # excluding gapped positions 132 | self.coverage1: float = (self.match + self.mismatch) / self.len1 133 | self.coverage2: float = (self.match + self.mismatch) / self.len2 134 | self.coverage: float = (self.match + self.mismatch) / len(self) 135 | 136 | # Base Properties ---------------------------------------------------------- 137 | def __len__(self) -> int: 138 | return len(self.align1) 139 | 140 | def __str__(self) -> str: 141 | return f"PairwiseAlignment('{self.sequence1.name}' vs. '{self.sequence2.name}', l={len(self)}, ({self.match} |, {self.gap} -, {self.mismatch} x))" 142 | 143 | def show(self, n_lines: int=120, only_critical_chunks: bool=False) -> "PairwiseAlignment": 144 | """Show the complete alignemnt.""" 145 | assert n_lines > 0, f"ERROR in {self}.show(): n_lines={n_lines} should be > 0." 146 | print(self) 147 | l = len(self) 148 | i = 0 149 | while i < l: 150 | range_line = f"{i+1} - {min(i+n_lines, l)}" 151 | ali1_line = self.align1[i:i+n_lines] 152 | comp_line = self.comparator[i:i+n_lines] 153 | ali2_line = self.align2[i:i+n_lines] 154 | if only_critical_chunks: 155 | comp_line = comp_line.replace(self.MISMATCH_CHAR, f"\033[91m{self.MISMATCH_CHAR}\033[0m") 156 | ali2_line = ali2_line.replace(self.GAP_CHAR, f"\033[91m{self.GAP_CHAR}\033[0m") 157 | if not only_critical_chunks or self.MISMATCH_CHAR in comp_line or self.GAP_CHAR in ali2_line: 158 | print(range_line) 159 | print(ali1_line) 160 | print(comp_line) 161 | print(ali2_line) 162 | i += n_lines 163 | return self 164 | 165 | # Methods ------------------------------------------------------------------ 166 | def write(self, save_path: str) -> "PairwiseAlignment": 167 | """Save alignment to a '.fasta' file.""" 168 | save_path = os.path.abspath(save_path) 169 | assert save_path.endswith(".fasta"), f"ERROR in {self}.write(): save_path='{save_path}' sould be a '.fasta' file." 170 | assert os.path.isdir(os.path.dirname(save_path)), f"ERROR in {self}.write(): directory of save_path='{save_path}' does not exists." 171 | align_str = f">{self.sequence1.name}\n{self.align1}\n>{self.sequence2.name}\n{self.align2}\n" 172 | with open(save_path, "w") as fs: 173 | fs.write(align_str) 174 | return self 175 | 176 | def get_mapping( 177 | self, 178 | ids1: Union[None, List[Union[str, int]]]=None, 179 | ids2: Union[None, List[Union[str, int]]]=None, 180 | reversed: bool=False, 181 | ) -> Dict[Union[str, int], Union[str, int]]: 182 | """ 183 | Return mapping of the aligment between mathing residues from seq1 to seq2. 184 | * By default the ids are just consecutive integers starting at 1. 185 | 186 | args: 187 | ids1: overwrite ids for seq1 (default is [1, 2, 3, ...]) 188 | ids2: overwrite ids for seq2 (default is [1, 2, 3, ...]) 189 | revered: if True, give mapping from seq2 to seq1 190 | """ 191 | 192 | # Init ids 193 | if ids1 is None: 194 | ids1 = list(range(1, len(self.sequence1.sequence)+1)) 195 | else: 196 | assert len(ids1) == len(self.sequence1), f"ERROR in {self}.get_mapping(): length of ids1={len(ids1)} does not match length of sequence1={len(self.sequence1)}." 197 | if ids2 is None: 198 | ids2 = list(range(1, len(self.sequence2.sequence)+1)) 199 | else: 200 | assert len(ids2) == len(self.sequence2), f"ERROR in {self}.get_mapping(): length of ids2={len(ids2)} does not match length of sequence2={len(self.sequence2)}." 201 | 202 | # Manage reversed 203 | align1, align2 = self.align1, self.align2 204 | if reversed: 205 | align1, align2 = align2, align1 206 | ids1, ids2 = ids2, ids1 207 | 208 | # Generate mapping 209 | mapping = {} 210 | i1, i2 = 0, 0 211 | for aa1, aa2 in zip(align1, align2): 212 | if aa1 != self.GAP_CHAR: 213 | i1 += 1 214 | if aa2 != self.GAP_CHAR: 215 | i2 += 1 216 | if aa1 != self.GAP_CHAR and aa2 != self.GAP_CHAR: 217 | mapping[ids1[i1-1]] = ids2[i2-1] 218 | return mapping 219 | 220 | @classmethod 221 | def get_gaps_ranges(cls, align: str, tail_gaps: bool=True) -> List[Tuple[int, int]]: 222 | """Return gaps ranges of alignment string.""" 223 | 224 | # Detect gaps 225 | gaps_ranges = [] 226 | is_previous_gap = False 227 | for i, aa in enumerate(align): 228 | is_current_gap = aa == cls.GAP_CHAR 229 | # Open gap range 230 | if not is_previous_gap and is_current_gap: 231 | current_gap_rang = [i] 232 | # Close gap range 233 | elif is_previous_gap and not is_current_gap: 234 | current_gap_rang.append(i) 235 | gaps_ranges.append(current_gap_rang) 236 | is_previous_gap = is_current_gap 237 | 238 | # Mange right tail gap 239 | if align[-1] == cls.GAP_CHAR: 240 | current_gap_rang.append(len(align)) 241 | gaps_ranges.append(current_gap_rang) 242 | 243 | # Remove tail gaps if required 244 | if not tail_gaps: 245 | if align[0] == cls.GAP_CHAR: 246 | gaps_ranges = gaps_ranges[1:] 247 | if align[-1] == cls.GAP_CHAR: 248 | gaps_ranges = gaps_ranges[:-1] 249 | 250 | return gaps_ranges 251 | 252 | # Dependencies ----------------------------------------------------------------- 253 | def _count_tail_characters(input_sequence: str, count_char: str) -> Tuple[int, int]: 254 | c1, c2 = 0, 0 255 | # Left tail 256 | for char in input_sequence: 257 | if char == count_char: 258 | c1 += 1 259 | else: 260 | break 261 | # Right tail 262 | for char in input_sequence[::-1]: 263 | if char == count_char: 264 | c2 += 1 265 | else: 266 | break 267 | return c1, c2 268 | -------------------------------------------------------------------------------- /rsalor/sequence/sequence.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import List, Union 5 | from rsalor.sequence import AminoAcid 6 | from rsalor.sequence import Mutation 7 | 8 | # Sequence --------------------------------------------------------------------- 9 | class Sequence: 10 | """Container class for a single sequence (name, sequence and weight). 11 | 12 | usage: 13 | 14 | seq: Sequence = Sequence('seq1', 'MQIFVKTLTGKTI--T') \n 15 | seq_name: str = seq.name \n 16 | seq_str: str = seq.sequence \n 17 | seq.write('./fasta/seq1.fasta') 18 | """ 19 | 20 | # Constants ---------------------------------------------------------------- 21 | HEADER_START_CHAR = ">" 22 | GAP_CHAR = AminoAcid.GAP_ONE 23 | AMINO_ACIDS_IDENTITY_MAP = {aa.one: aa.one for aa in AminoAcid.get_all()} | {aa.one.lower(): aa.one.lower() for aa in AminoAcid.get_all()} 24 | 25 | # Constructor -------------------------------------------------------------- 26 | def __init__(self, name: str, sequence: str, weight: float=1.0, to_upper: bool=True, convert_special_characters: bool=True): 27 | """Constructor for a (protein) Sequence object. 28 | name (str) name of the sequence 29 | sequence (str) amino acid sequence as a string 30 | weight (float=1.0) weight of the sequence (in an MSA) 31 | to_upper (bool=True) if True, convert all lower case amino acids to upper cases (such as in '.a2m' format) 32 | convert_special_characters (bool=True) if True, convert all non-standard characters (like '.' or '_') to a gap '-' (such as in '.a2m' format) 33 | """ 34 | if name.startswith(self.HEADER_START_CHAR): 35 | name = name.removeprefix(self.HEADER_START_CHAR) 36 | if to_upper: 37 | sequence = sequence.upper() 38 | if convert_special_characters: 39 | gap = self.GAP_CHAR 40 | aa_map = self.AMINO_ACIDS_IDENTITY_MAP 41 | sequence = "".join([aa_map.get(aa, gap) for aa in sequence]) 42 | self.name: str = name 43 | self.sequence: str = sequence 44 | self.weight: float = weight 45 | 46 | # Base properties ---------------------------------------------------------- 47 | def __len__(self) -> int: 48 | return len(self.sequence) 49 | 50 | def __str__(self) -> str: 51 | MAX_PRINT_LEN = 15 52 | seq_str = self.sequence 53 | if len(seq_str) > MAX_PRINT_LEN: 54 | seq_str = f"{seq_str[0:MAX_PRINT_LEN]}..." 55 | name_str = self.name 56 | if len(name_str) > MAX_PRINT_LEN: 57 | name_str = f"{name_str[0:MAX_PRINT_LEN]}..." 58 | return f"Sequence('{name_str}', seq='{seq_str}', l={len(self)})" 59 | 60 | def __eq__(self, other: "Sequence") -> bool: 61 | return self.sequence == other.sequence 62 | 63 | def __neq__(self, other: "Sequence") -> bool: 64 | return self.sequence != other.sequence 65 | 66 | def __hash__(self) -> int: 67 | return hash(self.sequence) 68 | 69 | def __iter__(self): 70 | return iter(self.sequence) 71 | 72 | def __getitem__(self, id: int) -> str: 73 | return self.sequence[id] 74 | 75 | def __contains__(self, char: str) -> bool: 76 | return char in self.sequence 77 | 78 | # Base Methods ------------------------------------------------------------- 79 | def n_gaps(self) -> int: 80 | """Return number of gaps in sequence.""" 81 | return len([char for char in self.sequence if char == self.GAP_CHAR]) 82 | 83 | def n_non_gaps(self) -> int: 84 | """Return number of non-gaps in sequence.""" 85 | return len([char for char in self.sequence if char != self.GAP_CHAR]) 86 | 87 | def gap_ratio(self) -> float: 88 | """Return gap ratio.""" 89 | return self.n_gaps() / len(self) 90 | 91 | def contains_gaps(self) -> bool: 92 | """Return is sequence contains gaps.""" 93 | for char in self.sequence: 94 | if char == self.GAP_CHAR: 95 | return True 96 | return False 97 | 98 | def is_all_amino_acids(self) -> bool: 99 | """Returns is sequence is composed of only standard amino acids.""" 100 | for char in self.sequence: 101 | if not AminoAcid.one_exists(char): 102 | return False 103 | return True 104 | 105 | def to_fasta_string(self) -> str: 106 | """Return string of the sequence in FASTA format.""" 107 | return f"{self.HEADER_START_CHAR}{self.name}\n{self.sequence}\n" 108 | 109 | def mutation_is_compatible(self, mutation: Union[str, Mutation]) -> bool: 110 | """Return if mutation is compatible with the sequence.""" 111 | 112 | # Convert to Mutation type 113 | if isinstance(mutation, str): 114 | mutation = Mutation(mutation) 115 | 116 | # Verify if mutatoin position is in sequence 117 | if not (1 <= mutation.position <= len(self)): 118 | return False 119 | # Verify if wild-type amino acid corresponds to sequence 120 | if mutation.wt_aa.one != self.sequence[mutation.position-1]: 121 | return False 122 | return True 123 | 124 | # IO Methods --------------------------------------------------------------- 125 | def write(self, fasta_path: str) -> "Sequence": 126 | """Save sequence in a FASTA file.""" 127 | 128 | # Guardians 129 | fasta_path = os.path.abspath(fasta_path) 130 | assert os.path.isdir(os.path.dirname(fasta_path)), f"ERROR in Sequence('{self.name}').write(): directory of '{fasta_path}' does not exists." 131 | assert fasta_path.endswith(".fasta"), f"ERROR in Sequence('{self.name}').write(): fasta_path='{fasta_path}' should end with '.fasta'." 132 | 133 | # Save FASTA and return self 134 | with open(fasta_path, "w") as fs: 135 | fs.write(self.to_fasta_string()) 136 | return self 137 | 138 | # Mutate Methods ----------------------------------------------------------- 139 | def trim(self, keep_positions: List[bool]) -> "Sequence": 140 | """Trim sequence (filter on positions) according to keep_positions (array of bool indicating which position to keep).""" 141 | 142 | # Guardians 143 | assert len(keep_positions) == len(self), f"ERROR in {self}.trim(): length of keep_positions ({len(keep_positions)}) does not match length of sequence ({len(self)})." 144 | 145 | # Trim and return self 146 | self.sequence = "".join([char for char, keep in zip(self.sequence, keep_positions) if keep]) 147 | return self -------------------------------------------------------------------------------- /rsalor/structure/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.structure.residue import Residue 2 | from rsalor.structure.structure import Structure 3 | -------------------------------------------------------------------------------- /rsalor/structure/residue.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | from typing import Union 4 | from rsalor.sequence import AminoAcid 5 | 6 | # Main ------------------------------------------------------------------------- 7 | class Residue: 8 | """Container class for a PDB residue. 9 | 10 | usage: 11 | res = Residue('A', '113', AminoAcid('K')) 12 | """ 13 | 14 | # Constructor -------------------------------------------------------------- 15 | def __init__(self, chain: str, position: str, amino_acid: AminoAcid, rsa: Union[None, float]=None): 16 | 17 | # Guardians 18 | assert len(chain) == 1 and chain != " ", f"ERROR in Residue(): invalid chain='{chain}'." 19 | if rsa is not None: 20 | assert rsa >= 0.0, f"ERROR in Residue(): rsa='{rsa}' should be positive." 21 | 22 | # Set properties 23 | self.chain: str = chain 24 | self.position: str = position 25 | self.amino_acid: AminoAcid = amino_acid 26 | self.rsa: Union[None, float] = rsa 27 | 28 | # Properties --------------------------------------------------------------- 29 | @property 30 | def resid(self) -> str: 31 | return self.chain + self.position 32 | 33 | def __str__(self) -> str: 34 | return f"Residue('{self.resid}', '{self.amino_acid.three}', RSA={self.rsa})" -------------------------------------------------------------------------------- /rsalor/structure/structure.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import Union, List, Dict, Literal 5 | from rsalor.sequence import AminoAcid 6 | from rsalor.structure import Residue 7 | from rsalor.sequence import Sequence 8 | from rsalor.rsa import RSASolver, RSABiopython, RSADSSP, RSAMuSiC 9 | 10 | # Execution -------------------------------------------------------------------- 11 | class Structure: 12 | """Structure object for parsing all Residues from ATOM lines and assign RSA (with biopython (Shrake & Rupley), DSSP or MuSiC). 13 | 14 | usage: 15 | structure = Structure('./my_pdb.pdb', 'A') 16 | """ 17 | 18 | # Constants ---------------------------------------------------------------- 19 | RSA_SOLVERS: Dict[str, RSASolver] = { 20 | "biopython": RSABiopython, 21 | "DSSP": RSADSSP, 22 | "MuSiC": RSAMuSiC, 23 | } 24 | 25 | # Constructor -------------------------------------------------------------- 26 | def __init__( 27 | self, 28 | pdb_path: str, 29 | chain: str, 30 | rsa_solver: Literal["biopython", "DSSP", "MuSiC"]="biopython", 31 | rsa_solver_path: Union[None, str]=None, 32 | rsa_cache_path: Union[None, str]=None, 33 | verbose: bool=False, 34 | ): 35 | """Structure object for parsing all Residues from ATOM lines and assign RSA (with biopython, DSSP or MuSiC). 36 | 37 | arguments: 38 | pdb_path (str): path to PDB file 39 | chain (str): target chain in the PDB 40 | rsa_solver ('biopython'/'DSSP'/'MuSiC'): solver to use to compute RSA 41 | rsa_solver_path (Union[None, str]=None): path to solver executable 42 | rsa_cache_path (Union[None, str]=None): path to write/read to/from RSA values 43 | verbose (bool=False): set True for logs 44 | """ 45 | 46 | # Guardians 47 | assert os.path.isfile(pdb_path), f"ERROR in Structure(): pdb_path='{pdb_path}' file does not exist." 48 | assert pdb_path.endswith(".pdb"), f"ERROR in Structure(): pdb_path='{pdb_path}' should end with '.pdb'." 49 | assert len(chain) == 1 and chain != " ", f"ERROR in Structure(): chain='{chain}' should be a string of length 1 and not ' '." 50 | solver_list = list(self.RSA_SOLVERS.keys()) 51 | assert rsa_solver in solver_list, f"ERROR in Structure(): rsa_solver='{rsa_solver}' should be in {solver_list}." 52 | 53 | # Init base properties 54 | self.pdb_path = pdb_path 55 | self.pdb_name = os.path.basename(self.pdb_path).removesuffix(".pdb") 56 | self.chain = chain 57 | self.name = f"{self.pdb_name}_{self.chain}" 58 | self.rsa_solver = rsa_solver 59 | self.rsa_solver_path = rsa_solver_path 60 | self.verbose = verbose 61 | 62 | # Parse structure 63 | self.residues: List[Residue] = [] 64 | self.chain_residues: List[Residue] = [] 65 | self.residues_map: Dict[str, Residue] = {} 66 | self._parse_structure() 67 | 68 | # Set sequence 69 | self.sequence = Sequence(f"{self.name} (PDB, ATOM-lines)", "".join(res.amino_acid.one for res in self.chain_residues)) 70 | 71 | # Assign RSA 72 | solver: RSASolver = self.RSA_SOLVERS[rsa_solver] 73 | rsa_map = solver(self.rsa_solver_path, self.verbose).run(self.pdb_path, rsa_cache_path=rsa_cache_path) 74 | n_assigned_in_chain = 0 75 | for residue in self.residues: 76 | resid = residue.resid 77 | if resid in rsa_map: 78 | if residue.chain == self.chain: 79 | n_assigned_in_chain += 1 80 | residue.rsa = rsa_map[resid] 81 | 82 | # Log 83 | if self.verbose: 84 | print(f" * {n_assigned_in_chain} / {len(self.chain_residues)} assigned RSA values for chain '{self.chain}'") 85 | 86 | 87 | # Base properties ---------------------------------------------------------- 88 | def __str__(self) -> str: 89 | return f"Structure('{self.name}', l={len(self)})" 90 | 91 | def __len__(self) -> int: 92 | return len(self.residues) 93 | 94 | def __contains__(self, resid: str) -> bool: 95 | return resid in self.residues_map 96 | 97 | def __getitem__(self, id: int) -> dict: 98 | return self.residues[id] 99 | 100 | def __iter__(self): 101 | return iter(self.residues) 102 | 103 | # Deendencies -------------------------------------------------------------- 104 | def _parse_structure(self) -> None: 105 | """Parse residues data from PDB file.""" 106 | 107 | # Init 108 | model_counter = 0 109 | current_chain = None 110 | closed_chains = set() 111 | 112 | # Parse PDB residues 113 | with open(self.pdb_path, "r", encoding="ISO-8859-1") as fs: 114 | line = fs.readline() 115 | while line: 116 | prefix = line[0:6] 117 | 118 | # Atom line 119 | if prefix == "ATOM " or prefix == "HETATM": 120 | current_chain = line[21] 121 | if current_chain in closed_chains: # discard ATOM line if chain is closed 122 | line = fs.readline() 123 | continue 124 | position = line[22:27].replace(" ", "") 125 | aa_three = line[17:20] 126 | aa = AminoAcid.parse_three(aa_three) 127 | if aa.is_unknown(): # discard non amino acid ATOM lines 128 | line = fs.readline() 129 | continue 130 | resid = current_chain + position 131 | if resid not in self.residues_map: 132 | residue = Residue(current_chain, position, aa) 133 | self.residues.append(residue) 134 | self.residues_map[resid] = residue 135 | 136 | # Manage multiple models: consider only model 1 137 | elif prefix == "MODEL ": 138 | model_counter += 1 139 | if model_counter > 1: 140 | #print(f"WARNING in {self}: PDB contains multiple models, but only model 1 will be considered.") 141 | break 142 | 143 | # Manage closed chains: ATOMS that appears after the chain is closed are not part of the protein chain 144 | elif prefix == "TER " or prefix == "TER\n": 145 | if current_chain is not None: 146 | closed_chains.add(current_chain) 147 | 148 | # Take next line 149 | line = fs.readline() 150 | 151 | # Set residues list of target chain 152 | self.chain_residues = [res for res in self.residues if res.chain == self.chain] 153 | 154 | # No target chain error 155 | if len(self.chain_residues) == 0: 156 | error_log = f"ERROR in {self}._parse_structure(): target chain '{self.chain}' not found in PDB file." 157 | error_log += f"\n * pdb_path: '{self.pdb_path}'" 158 | error_log += f"\n * num total residues: {len(self.residues)}" 159 | error_log += f"\n * existing chains: {list(set([res.chain for res in self.residues]))}" 160 | raise ValueError(error_log) -------------------------------------------------------------------------------- /rsalor/utils/CSV.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | import csv 5 | from typing import Union, Tuple, List, Dict, Callable 6 | import numpy as np 7 | 8 | 9 | # Main ------------------------------------------------------------------------- 10 | class CSV: 11 | """ 12 | Class to read/write a CSV file and manage it as a dataframe. 13 | * It never assumes a column or a cell type except when it is specified (all cells as :str by default). 14 | * Manages safety: impossible to have redundent values in header. 15 | """ 16 | 17 | # Constants ---------------------------------------------------------------- 18 | ALLOWED_EXTENTIONS = ["csv", "tsv"] 19 | 20 | # Constructor -------------------------------------------------------------- 21 | def __init__( 22 | self, 23 | header: List[str]=[], 24 | sep: str=";", 25 | name: str="DataFrame", 26 | print_warnings: bool=True, 27 | ): 28 | 29 | # Base properties 30 | self.name = name 31 | self.print_warnings = print_warnings 32 | 33 | # Content 34 | self._header = Header(header, sep) 35 | self.entries = [] 36 | 37 | # Basic properties --------------------------------------------------------- 38 | @property 39 | def sep(self) -> str: 40 | return self._header.sep 41 | 42 | def __len__(self) -> int: 43 | return len(self.entries) 44 | 45 | def __contains__(self, property_name: str) -> bool: 46 | return property_name in self._header 47 | 48 | def __getitem__(self, id: int) -> dict: 49 | return self.entries[id] 50 | 51 | def __iter__(self): 52 | return iter(self.entries) 53 | 54 | @property 55 | def n_rows(self) -> int: 56 | return len(self) 57 | 58 | @property 59 | def n_cols(self) -> int: 60 | return len(self._header) 61 | 62 | @property 63 | def shape(self) -> Tuple[int, int]: 64 | return (self.n_rows, self.n_cols) 65 | 66 | @property 67 | def df_size(self) -> int: 68 | return self.n_rows * self.n_cols 69 | 70 | def header(self) -> List[str]: 71 | return [p for p in self._header.properties] 72 | 73 | def __str__(self) -> str: 74 | return f"CSV('{self.name}', r={self.n_rows}, c={self.n_cols})" 75 | 76 | def warning(self, warning_str: str="") -> None: 77 | """Log a CSV Warning.""" 78 | if self.print_warnings: 79 | print(f"WARNING in {self}{warning_str}") 80 | 81 | # Methods ------------------------------------------------------------------ 82 | def set_sep(self, sep: str, safety_check: bool=True): 83 | """Set separator for the CSV (for .read and .write)""" 84 | 85 | if safety_check: 86 | 87 | # Computational time warning 88 | if self.df_size > 1000: 89 | self.warning( 90 | f".set_sep('{sep}'): could be computationally expensive when CSV object already contains many entries. " + \ 91 | f"You can set 'safety_check' to False to skip coherence checks with separator." 92 | ) 93 | 94 | # Guardians 95 | for entry in self.entries: 96 | for key, value in entry.items(): 97 | assert sep not in str(value), f"ERROR in {self}.set_sep('{sep}'): sep contained in entry's value ('{key}': '{value}')." 98 | 99 | # Set 100 | self._header.set_sep(sep) 101 | return self 102 | 103 | def add_entry(self, entry: dict): 104 | entry = {prop: entry[prop] for prop in self._header} 105 | self.entries.append(entry) 106 | return self 107 | 108 | def add_entries(self, entries: List[dict]): 109 | for entry in entries: 110 | self.add_entry(entry) 111 | return self 112 | 113 | def add_col(self, property: str, values: list, allow_replacement=False): 114 | if property in self._header: 115 | assert allow_replacement, f"ERROR in {self}.add_col(): property='{property}' already exists and allow_replacement is set to False." 116 | else: 117 | self._header.add(property) 118 | assert len(values) == len(self), f"ERROR in {self}.add_col(): values length ({len(values)}) != CSV length ({len(self)})." 119 | for entry, value in zip(self.entries, values): 120 | entry[property] = value 121 | return self 122 | 123 | def add_empty_col(self, property: str, missing_value: str="XXX", allow_replacement=False): 124 | values = [missing_value for _ in self.entries] 125 | self.add_col(property, values, allow_replacement=allow_replacement) 126 | return self 127 | 128 | def add_csv(self, other_csv): 129 | """Merge other_csv entries with current CSV (keeps header of current CSV).""" 130 | for property_name in self.header(): 131 | assert property_name in other_csv._header, f"ERROR in {self}.add_csv(): property='{property_name}' does not exists in other_csv ({other_csv})." 132 | for entry in other_csv: 133 | self.add_entry(entry) 134 | return self 135 | 136 | def remove_col(self, property: str): 137 | self._header.remove(property) 138 | for entry in self.entries: 139 | del entry[property] 140 | return self 141 | 142 | def rename_col(self, property_old: str, property_new: str): 143 | self._header.rename(property_old, property_new) 144 | for entry in self.entries: 145 | entry[property_new] = entry[property_old] 146 | del entry[property_old] 147 | return self 148 | 149 | def order_header(self, header_order: List[str]): 150 | self._header.order(header_order) 151 | return self 152 | 153 | def filter(self, keep_entry_function: Callable, do_print: bool=False, filter_name: str=""): 154 | """Filter entries in the CSV with a filter_function.""" 155 | l1 = len(self) 156 | self.entries = [entry for entry in self.entries if keep_entry_function(entry)] 157 | l2 = len(self) 158 | if do_print: 159 | print(f"{self}: Filter('{filter_name}'): {l1} -> {l2}") 160 | return self 161 | 162 | def set_col_type(self, property_name: str, dt: type, default_value=None): 163 | assert property_name in self._header, f"ERROR in {self}.set_col_type(): property_name='{property_name}' does not exists." 164 | for entry in self.entries: 165 | entry[property_name] = to_type(entry[property_name], dt, default_value=default_value) 166 | 167 | # Get Methods -------------------------------------------------------------- 168 | def get_col(self, property: str, dt: Union[None, type]=None, default_value=None, as_numpy: bool=False): 169 | """Get Column of CSV as array.""" 170 | assert property in self, f"ERROR in {self}.get_array('{property}'): property does not exists." 171 | col_list = [entry[property] for entry in self.entries] 172 | if dt is not None: 173 | col_list = [to_type(el, dt, default_value=default_value) for el in col_list] 174 | if as_numpy: 175 | col_list = np.array(col_list) 176 | return col_list 177 | 178 | def get_row(self, id: int, dt: Union[None, type]=None, default_value=None, as_numpy: bool=False): 179 | """Get Raw of CSV as array""" 180 | entry = self[id] 181 | row_list = [entry[p] for p in self._header] 182 | if dt is not None: 183 | row_list = [to_type(el, dt, default_value=default_value) for el in row_list] 184 | if as_numpy: 185 | row_list = np.array(row_list) 186 | return row_list 187 | 188 | def get_X(self, features: List[str]) -> np.ndarray: 189 | """Get features matrix X (numpy) from the CSV.""" 190 | for feature in features: 191 | assert feature in self, f"ERROR in {self}.get_X(): feature='{feature}' does not exists." 192 | return np.array([ 193 | [float(entry[feature]) for feature in features] 194 | for entry in self.entries 195 | ]) 196 | 197 | def get_y(self, label: str) -> np.ndarray: 198 | """Get label array y (numpy) from the CSV.""" 199 | assert label in self, f"ERROR in {self}.get_y(): label='{label}' does not exists." 200 | return np.array([float(entry[label]) for entry in self.entries]) 201 | 202 | def get_Xy(self, features: List[str], label: str) -> Tuple[np.ndarray, np.ndarray]: 203 | """Get (features, label) tuple (X, y) (numpy) from the CSV.""" 204 | return self.get_X(features), self.get_y(label) 205 | 206 | @staticmethod 207 | def hash_entry(entry: dict, hash_properties: List[str], sep: str="_") -> str: 208 | """Hash an entry (to :str) by values of its hash_properties.""" 209 | return sep.join([entry[prop] for prop in hash_properties]) 210 | 211 | @staticmethod 212 | def get_hash_entry(hash_properties: List[str], sep: str="_") -> Callable: 213 | """Generate a hash_entry function.""" 214 | def hash_entry_function(entry: dict) -> str: 215 | return sep.join([entry[prop] for prop in hash_properties]) 216 | return hash_entry_function 217 | 218 | def get_map(self, hash_properties: List[str], sep: str="_", map_function: Union[None, Callable]=None) -> Dict[str, Dict]: 219 | """ 220 | Obtain a map {hash(entry) -> entry} from CSV (redundencies not allowed). 221 | * if map_function is set, values of the map are defined as map_function(entry) 222 | """ 223 | for property in hash_properties: 224 | assert property in self, f"ERROR in {self}.to_map(): property='{property}' not in header." 225 | entries_map = {} 226 | for entry in self.entries: 227 | h = self.hash_entry(entry, hash_properties, sep=sep) 228 | assert h not in entries_map, f"ERROR in {self}.to_map({hash_properties}) redundency found for '{h}'." 229 | entries_map[h] = entry 230 | if map_function is not None: 231 | for h, entry in entries_map.items(): 232 | entry[h] = map_function(entry) 233 | return entries_map 234 | 235 | def get_groups(self, hash_properties: List[str], sep: str="_", map_function: Union[None, Callable]=None) -> Dict[str, List[Dict]]: 236 | """ 237 | Obtain a map for groups {hash(entry) -> [entries_list]} from CSV. 238 | * if map_function is set, values of the map are defined as [map_function(entry), ...] 239 | """ 240 | for property in hash_properties: 241 | assert property in self, f"ERROR in {self}.to_map(): property='{property}' not in header." 242 | groups_map = {} 243 | for entry in self: 244 | h = self.hash_entry(entry, hash_properties, sep=sep) 245 | if h not in groups_map: 246 | groups_map[h] = [] 247 | groups_map[h].append(entry) 248 | if map_function is not None: 249 | for h, group in groups_map.items(): 250 | groups_map[h] = [map_function(e) for e in group] 251 | return groups_map 252 | 253 | def copy(self): 254 | """Copy CSV object.""" 255 | new_csv = CSV() 256 | new_csv.name = self.name 257 | new_csv.print_warnings = self.print_warnings 258 | new_csv._header = self._header.copy() 259 | new_csv.entries = [ 260 | {k: v for k, v in entry.items()} 261 | for entry in self.entries 262 | ] 263 | return new_csv 264 | 265 | def show(self, n_entries: int=5, min_colsize: int=3, max_colsize: int=20, max_linesize: int=200, round_digit: int=4, sep: str=" | ") -> None: 266 | """Show summary of CSV.""" 267 | lines = [self._header.properties] + [self.get_row(id) for id in range(min(n_entries, len(self)))] 268 | col_sizes = [ 269 | max([min_colsize, min([max([len(stringify_float(line[i], round_digit=round_digit)) for line in lines]), max_colsize])]) 270 | for i in range(len(self._header)) 271 | ] 272 | print(self) 273 | for line in lines: 274 | print_line(line, sizes=col_sizes, max_linesize=max_linesize, round_digit=round_digit, sep=sep) 275 | if len(self) > n_entries: 276 | print(" ...") 277 | 278 | # IO ----------------------------------------------------------------------- 279 | def write(self, output_path: str): 280 | """Save to file.""" 281 | 282 | # Guardians 283 | output_path = os.path.abspath(output_path) 284 | assert any([output_path.endswith(f".{extention}")] for extention in CSV.ALLOWED_EXTENTIONS), f"ERROR in {self}.write('{output_path}'): extention sould be among {CSV.ALLOWED_EXTENTIONS})." 285 | assert os.path.isdir(os.path.dirname(output_path)), f"ERROR in {self}.write('{output_path}'): destination folder does not exists." 286 | if output_path.endswith("tsv"): 287 | assert self.sep == "\t", f"ERROR in {self}.write('{output_path}'): if extention is '.tsv', separator should be '\\t' however sep='{self.sep}'." 288 | 289 | # Stringify 290 | str_header = self.sep.join(self._header.properties) 291 | str_entries_list = [ 292 | self.sep.join(str(entry[prop]) for prop in self._header.properties) 293 | for entry in self.entries 294 | ] 295 | str_lines = [str_header] + str_entries_list 296 | 297 | # Write 298 | with open(output_path, "w") as fs: 299 | fs.write("\n".join(str_lines)) 300 | return self 301 | 302 | def read(self, input_path: str, col_types: Dict[str, type]={}, col_default: dict={}): 303 | """Read from file.""" 304 | 305 | # Guardians 306 | assert any([input_path.endswith(f".{extention}")] for extention in CSV.ALLOWED_EXTENTIONS), f"ERROR in {self}.read('{input_path}'): extention sould be among {CSV.ALLOWED_EXTENTIONS})." 307 | assert os.path.isfile(input_path), f"ERROR in {self}.read('{input_path}'): input_path file does not exists." 308 | 309 | # Set name 310 | file_name = os.path.basename(input_path) 311 | name = ".".join(file_name.split(".")[:-1]) 312 | self.name = name 313 | 314 | # Parse csv from file 315 | with open(input_path, newline='') as csvfile: 316 | csv_lines = list(csv.reader(csvfile, delimiter=self.sep)) 317 | 318 | # Set CSV header 319 | header = csv_lines[0] 320 | if len(header) <= 1: 321 | self.warning(f".read('{input_path}'): header contains {len(header)} values. Maybe sep='{self.sep}' parameter in incorrect.") 322 | self._header = Header(header, self.sep) 323 | 324 | # Set CSV entries 325 | self.entries = [] 326 | for i, line in enumerate(csv_lines[1:]): 327 | assert len(line) == len(header), f"ERROR in {self}.read('{input_path}'): number of elements ({len(line)}) in entry ({i+1}/{len(csv_lines)-1}) does not match the header ({len(header)})." 328 | self.entries.append({prop: value for prop, value in zip(header, line)}) 329 | 330 | # Set column types if required 331 | for col_name, dt in col_types.items(): 332 | col_default_value = col_default.get(col_name, None) 333 | self.set_col_type(col_name, dt, default_value=col_default_value) 334 | 335 | return self 336 | 337 | # Dependencies ----------------------------------------------------------------- 338 | 339 | class Header: 340 | """ 341 | Container for the Header of a CSV object. 342 | -> ordered list with no repetitions allowed and a separator of length = 1. 343 | """ 344 | 345 | # Constructor -------------------------------------------------------------- 346 | def __init__(self, properties: List[str], sep: str): 347 | 348 | # Init 349 | self.sep = "" 350 | self.properties = [] 351 | self.properties_set = set() 352 | 353 | # Set header values 354 | self.set_sep(sep) 355 | for property in properties: 356 | self.add(property) 357 | 358 | # Basic properties --------------------------------------------------------- 359 | def __getitem__(self, id: int) -> str: 360 | return self.properties[id] 361 | 362 | def __iter__(self): 363 | return iter(self.properties) 364 | 365 | def __contains__(self, property_name: str) -> bool: 366 | return property_name in self.properties_set 367 | 368 | def __len__(self) -> int: 369 | return len(self.properties) 370 | 371 | def __str__(self) -> str: 372 | return f"CSV.Header(l={len(self)})" 373 | 374 | def show(self): 375 | MAX_CHAR = 80 376 | properties_str = f"'{self.properties[0]}'" 377 | for property in self.properties[1:]: 378 | if len(properties_str) + len(property) > MAX_CHAR: 379 | properties_str += ", ..." 380 | break 381 | properties_str += f", '{property}'" 382 | print(f"CSV.Header([{properties_str}], len={len(self)}, sep='{self.sep}')") 383 | return self 384 | 385 | def idof(self, property_name: str) -> int: 386 | assert property_name in self, f"ERROR in {self}.idof(): property_name='{property_name}' not in header." 387 | for i, current_property_name in enumerate(self): 388 | if property_name == current_property_name: 389 | return i 390 | 391 | # Methods ------------------------------------------------------------------ 392 | def set_sep(self, sep: str): 393 | assert len(sep) == 1, f"ERROR in {self}.set_sep(): sep='{sep}' should be of length 1." 394 | for property in self: 395 | assert sep not in property, f"ERROR in {self}.set_sep(): sep='{sep}' is contained in property '{property}'." 396 | self.sep = sep 397 | return self 398 | 399 | def add(self, property_name: str): 400 | assert self.sep not in property_name, f"ERROR in {self}.add('{property_name}'): property contains sep='{self.sep}'." 401 | assert property_name not in self, f"ERROR in {self}.add('{property_name}'): property already exists." 402 | self.properties.append(property_name) 403 | self.properties_set.add(property_name) 404 | return self 405 | 406 | def remove(self, property_name: str): 407 | assert property_name in self, f"ERROR in {self}.remove('{property_name}'): property does not exists." 408 | self.properties.remove(property_name) 409 | self.properties_set.remove(property_name) 410 | return self 411 | 412 | def rename(self, property_old: str, property_new: str): 413 | assert property_old != property_new, f"ERROR in {self}.rename(): old property and new property have the same value '{property_old}'." 414 | assert property_old in self, f"ERROR in {self}.rename(): old property '{property_old}' is not in header." 415 | assert property_new not in self, f"ERROR in {self}.rename(): new property '{property_new}' already in header." 416 | assert self.sep not in property_new, f"ERROR in {self}.rename(): new property '{property_new}' contains sep='{self.sep}'." 417 | id = self.idof(property_old) 418 | self.properties[id] = property_new 419 | self.properties_set.add(property_new) 420 | self.properties_set.remove(property_old) 421 | return self 422 | 423 | def order(self, header_order: List[str]): 424 | for property in header_order: 425 | assert property in self, f"ERROR in {self}.order(): property '{property}' not in header." 426 | ordered_properties_set = set(header_order) 427 | unordered_properties = [property for property in self if property not in ordered_properties_set] 428 | self.properties = header_order + unordered_properties 429 | return self 430 | 431 | def copy(self): 432 | return Header([p for p in self], self.sep) 433 | 434 | # Dependency: Utils Funcions --------------------------------------------------- 435 | def to_type(input, dt:type, default_value=None): 436 | """Convert input to type dt. If default_value is set, returns default_value when convertion fails.""" 437 | try: 438 | return dt(input) 439 | except: 440 | if default_value is None: 441 | raise ValueError(f"ERROR in CSV().to_type(): input='{input}' not convertable to {dt}. Please correct input or set a default_value.") 442 | else: 443 | return default_value 444 | 445 | def print_line( 446 | line_list, 447 | sep: str=" | ", dots_str: str="...", 448 | size: int=20, sizes: Union[None, List[int]]=None, max_linesize: int=200, 449 | round_digit: int=4, 450 | ) -> None: 451 | """Print a line from a table (dataframe) in a standardized way.""" 452 | if sizes is None: sizes = [size for _ in line_list] 453 | line_str = "" 454 | unprinted_cols = False 455 | for element, size in zip(line_list, sizes): 456 | line_new_col = sep + format_string(element, size, round_digit=round_digit) 457 | if len(line_str) + len(line_new_col) > max_linesize - (len(sep) + len(dots_str)): 458 | unprinted_cols = True 459 | break 460 | line_str += line_new_col 461 | if unprinted_cols: 462 | line_str += sep + "..." 463 | line_str += sep 464 | print(line_str[1:-1]) 465 | 466 | def format_string(input, size: int=20, filler: int=" ", dots_str: str="...", round_digit: int=4) -> str: 467 | """Format a string to standardized form (length, ...)""" 468 | input_str = stringify_float(input, round_digit=round_digit) 469 | if len(input_str) > size: 470 | return input_str[:size-len(dots_str)] + dots_str 471 | else: 472 | return input_str + filler*(size - len(input_str)) 473 | 474 | def stringify_float(input, round_digit: int=4) -> str: 475 | if isinstance(input, float): 476 | str_float = str(round(input, round_digit)) 477 | n_digits = len(str_float.split(".")[-1]) 478 | str_float = str_float + ("0"*(round_digit-n_digits)) 479 | if str_float[0] != "-": 480 | str_float = " " + str_float 481 | return str_float 482 | else: 483 | return str(input) 484 | -------------------------------------------------------------------------------- /rsalor/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.utils.utils import is_convertable_to, memory_str, time_str, find_file 2 | from rsalor.utils.CSV import CSV 3 | from rsalor.utils.logger import Logger 4 | from rsalor.utils.ali_to_fasta import ali_to_fasta -------------------------------------------------------------------------------- /rsalor/utils/ali_to_fasta.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from os import remove 5 | from typing import Union 6 | from Bio import SeqIO 7 | 8 | # MSA files processing functions ----------------------------------------------- 9 | def ali_to_fasta(input_path: str, output_path: str, delete_input: bool=False) -> Union[None, str]: 10 | """Convert '.ali' (Stickholm format) file to a '.fasta' file. 11 | * Then, deletes input '.ali' file if required. 12 | * Returns output_path or None if execution failed. 13 | Source: https://stackoverflow.com/questions/24156578/using-bio-seqio-to-write-single-line-fasta 14 | """ 15 | 16 | # Guardians 17 | error_log = f"ERROR in ali_to_fasta(): " 18 | error_log += f"\n * input_path : '{input_path}'" 19 | error_log += f"\n * output_path : '{output_path}'\n" 20 | if not os.path.isfile(input_path): 21 | print(f"{error_log} -> input file does not exists.") 22 | return None 23 | if not is_nonempty_file(input_path): 24 | print(f"{error_log} -> input file is empty.") 25 | return None 26 | if not input_path.endswith(".ali"): 27 | print(f"{error_log} -> input file should end with '.ali'.") 28 | return None 29 | if not output_path.endswith(".fasta"): 30 | print(f"{error_log} -> output file should end with '.fasta'.") 31 | return None 32 | 33 | # Run convertion 34 | try: 35 | records = SeqIO.parse(input_path, "stockholm") 36 | except Exception as error: 37 | print(f"{error_log} -> input file parsing failed.") 38 | print(error) 39 | return None 40 | try: 41 | SeqIO.FastaIO.FastaWriter(output_path, wrap=None).write_file(records) 42 | except Exception as error: 43 | print(f"{error_log} -> file convertion + writing failed.") 44 | print(error) 45 | return None 46 | 47 | # Detect errors 48 | if not is_nonempty_file(output_path): 49 | print(f"{error_log} -> converted output file is empty.") 50 | return None 51 | 52 | # Delete initial '.ali' file if required 53 | if delete_input: 54 | if os.path.isfile(input_path): 55 | remove(input_path) 56 | 57 | # Return 58 | return output_path 59 | 60 | # Dependency ------------------------------------------------------------------- 61 | def is_nonempty_file(input_path: str) -> bool: 62 | """Check if 'input_path' is an existing non-empty file.""" 63 | if not os.path.isfile(input_path): 64 | return False 65 | with open(input_path, "r") as fs: 66 | line = fs.readline() 67 | return len(line) > 0 -------------------------------------------------------------------------------- /rsalor/utils/logger.py: -------------------------------------------------------------------------------- 1 | 2 | # Logger ----------------------------------------------------------------------- 3 | class Logger: 4 | 5 | # Constants ---------------------------------------------------------------- 6 | HEADER = '\033[95m' 7 | OKBLUE = '\033[94m' 8 | OKCYAN = '\033[96m' 9 | OKGREEN = '\033[92m' 10 | WARNING = '\033[93m' 11 | FAIL = '\033[91m' 12 | ENDC = '\033[0m' 13 | BOLD = '\033[1m' 14 | UNDERLINE = '\033[4m' 15 | 16 | # Constructor -------------------------------------------------------------- 17 | def __init__( 18 | self, 19 | verbose: bool, 20 | disable_warnings: bool=True, 21 | step_prefix: str="STEP", 22 | warning_prefix: str="WARNING", 23 | error_prefix: str="ERROR", 24 | step_note: str="", 25 | warning_note: str="", 26 | error_note: str="", 27 | ): 28 | """Minimalistic logger: 29 | * manage verbose and disable_warnings 30 | * add colored prefixes to logs 31 | """ 32 | self.verbose = verbose 33 | self.disable_warnings = disable_warnings 34 | self._step_prefix = step_prefix 35 | self._warning_prefix = warning_prefix 36 | self._error_prefix = error_prefix 37 | self._step_note = step_note 38 | self._warning_note = warning_note 39 | self._error_note = error_note 40 | 41 | # Methods ------------------------------------------------------------------ 42 | @property 43 | def STEP_PREFIX(self) -> str: 44 | return f"{self.OKGREEN}{self._step_prefix}{self.ENDC}{self._step_note}" 45 | 46 | @property 47 | def WARNING_PREFIX(self) -> str: 48 | return f"{self.WARNING}{self._warning_prefix}{self.ENDC}{self._warning_note}" 49 | 50 | @property 51 | def CRITICAL_WARNING_PREFIX(self) -> str: 52 | return f"{self.FAIL}{self._warning_prefix}{self.ENDC}{self._warning_note}" 53 | 54 | @property 55 | def ERROR_PREFIX(self) -> str: 56 | return f"{self.FAIL}{self._error_prefix}{self.ENDC}{self._error_note}" 57 | 58 | def log(self, log_str: str) -> None: 59 | if self.verbose: 60 | print(log_str) 61 | 62 | def step(self, log_str: str) -> None: 63 | if self.verbose: 64 | print(f"{self.STEP_PREFIX}: {log_str}") 65 | 66 | def warning(self, log_str: str, critical: bool=False) -> None: 67 | prefix = self.WARNING_PREFIX 68 | if critical: 69 | prefix = self.CRITICAL_WARNING_PREFIX 70 | if not self.disable_warnings: 71 | print(f"{prefix}: {log_str}") 72 | 73 | def error(self, log_str: str) -> None: 74 | print(f"{self.ERROR_PREFIX}: {log_str}") 75 | 76 | def error_str(self, log_str: str) -> str: 77 | return f"{self.ERROR_PREFIX}: {log_str}" -------------------------------------------------------------------------------- /rsalor/utils/utils.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from shutil import which 5 | from typing import List, Union 6 | 7 | 8 | # Base functions --------------------------------------------------------------- 9 | def is_convertable_to(input_object, input_type) -> bool: 10 | """Return if input_object is convertable to input_type.""" 11 | try: 12 | _ = input_type(input_object) 13 | return True 14 | except: 15 | return False 16 | 17 | def memory_str(n_bytes: int) -> str: 18 | """Return a human readable string for a memory size measure (input in bytes).""" 19 | if n_bytes / 1000**3 > 1.0: 20 | return f"{n_bytes / 1000**3:.3f} GB" 21 | elif n_bytes / 1000**2 > 1.0: 22 | return f"{n_bytes / 1000**2:.3f} MB" 23 | elif n_bytes / 1000 > 1.0: 24 | return f"{n_bytes / 1000:.3f} kB" 25 | else: 26 | return f"{n_bytes} B" 27 | 28 | def time_str(n_sec: float) -> str: 29 | """Return a human readable string for a time measure (input in seconds).""" 30 | if n_sec / (60*60*24) > 1.0: 31 | return f"{n_sec / (60*60*24):.3f} d." 32 | elif n_sec / (60*60) > 1.0: 33 | return f"{n_sec / (60*60):.3f} h." 34 | elif n_sec / 60 > 1.0: 35 | return f"{n_sec / 60:.3f} min." 36 | else: 37 | return f"{n_sec:.3f} sec." 38 | 39 | def find_file(path_list: List[str], is_software: bool, name: str, description: Union[str, None]=None, verbose: bool=False,) -> str: 40 | """Find first existing file among path_list.""" 41 | 42 | # Find valid path among candidates 43 | output_path = None 44 | for candidate_path in path_list: 45 | 46 | # Find as a path to a file 47 | if os.path.isfile(candidate_path): 48 | output_path = candidate_path 49 | if verbose: 50 | print(f" * Set path for [{name}] (AS PATH TO A FILE): '{output_path}'") 51 | break 52 | 53 | # Find valid bash command executable in PATH 54 | if output_path is None and is_software: 55 | for candidate_path in path_list: 56 | basename = os.path.basename(candidate_path) 57 | which_candidate_path = which(basename) 58 | if which_candidate_path is not None: 59 | output_path = which_candidate_path 60 | if verbose: 61 | print(f" * set path for [{name}] (AS EXECUTABLE): '{output_path}'") 62 | break 63 | 64 | # Raise error if no valid path is found 65 | if output_path is None: 66 | 67 | # Init error message 68 | instance_name = "software" if is_software else "file" 69 | error_str = f"\nERROR in find_file(): no valid path found for {instance_name} '{name}':" 70 | error_str += "\nPath to file not found among: " 71 | 72 | # List failed candidates 73 | for candidate_path in path_list: 74 | error_str += f"\n - '{candidate_path}'" 75 | if is_software: 76 | error_str += "\nCommand not found in the system PATH among: " 77 | for candidate_path in path_list: 78 | error_str += f"\n - '{os.path.basename(candidate_path)}'" 79 | 80 | # Add recommendaiton 81 | if is_software: 82 | error_str += f"\n -> Please install software '{name}' and provide the path to its executable file or add it to system PATH." 83 | 84 | # Add description 85 | if description is not None: 86 | error_str += f"\nDescription: \n{description}" 87 | raise ValueError(error_str) 88 | 89 | # Return first found valid path 90 | return output_path 91 | 92 | -------------------------------------------------------------------------------- /rsalor/weights/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | # Specify the minimum CMake version required 2 | cmake_minimum_required(VERSION 3.10) 3 | project(computeWeightsBackend) 4 | 5 | # Specify C++ standard 6 | set(CMAKE_CXX_STANDARD 11) 7 | set(CMAKE_CXX_STANDARD_REQUIRED True) 8 | 9 | # Include directories 10 | include_directories(include) 11 | 12 | # Source files 13 | set(SRC_FILES 14 | computeWeightsBackend.cpp 15 | msa.cpp 16 | ) 17 | 18 | # Shared library output 19 | add_library(computeWeightsBackend SHARED ${SRC_FILES}) 20 | 21 | # Change output name to _computeWeightsBackend.so 22 | set_target_properties(computeWeightsBackend PROPERTIES 23 | OUTPUT_NAME "_computeWeightsBackend" 24 | SUFFIX ".so" 25 | ) 26 | 27 | # Compiler options (optional: add any optimization/debug flags here) 28 | target_compile_options(computeWeightsBackend PRIVATE -fPIC -Ofast) 29 | 30 | # Link libraries (add more if needed, e.g., -lpthread) 31 | target_link_libraries(computeWeightsBackend PRIVATE) -------------------------------------------------------------------------------- /rsalor/weights/__init__.py: -------------------------------------------------------------------------------- 1 | from rsalor.weights.compute_weights import compute_weights, read_weights, write_weights -------------------------------------------------------------------------------- /rsalor/weights/computeWeightsBackend.cpp: -------------------------------------------------------------------------------- 1 | #include "include/msa.h" 2 | 3 | extern "C" float* computeWeightsBackend( 4 | const char* msa_path, 5 | unsigned int const msa_len, 6 | unsigned int const msa_depth, 7 | float seqid, 8 | bool count_target_sequence, 9 | unsigned int num_threads, 10 | bool verbose 11 | ) 12 | { 13 | 14 | // Init MSA 15 | MSA msa( 16 | msa_path, 17 | msa_len, 18 | msa_depth, 19 | seqid, 20 | count_target_sequence, 21 | num_threads, 22 | verbose 23 | ); 24 | 25 | // Check depth consistency 26 | unsigned int observed_msa_depth = msa.getDepth(); 27 | if(observed_msa_depth != msa_depth) { 28 | std::cerr << "ERROR in computeWeights() (C++ backend): input msa_depth do not match to computed msa depth." << std::endl; 29 | std::cerr << " * msa_path: " << msa_path << std::endl; 30 | std::cerr << " * input msa_depth: " << msa_depth << std::endl; 31 | std::cerr << " * observed msa_depth: " << observed_msa_depth << std::endl; 32 | throw std::runtime_error("Invalid msa_depth argument"); 33 | } 34 | 35 | // Allocate memory to the weights pointer because it will be passed to python 36 | float* weight_ptr = (float*)malloc(msa_depth*sizeof(float)); 37 | auto weights_ptr_local = msa.getWeightsPointer(); 38 | for(int i = 0; i < msa_depth; i++) { // Copy content from local 39 | weight_ptr[i]= weights_ptr_local[i]; 40 | } 41 | return weight_ptr; 42 | 43 | } 44 | 45 | extern "C" void freeWeights(void* weights_ptr) { 46 | float* weights_ptr_casted = static_cast(weights_ptr); 47 | if(weights_ptr_casted !=nullptr){ 48 | delete [] weights_ptr_casted; 49 | weights_ptr_casted = nullptr; 50 | } 51 | } -------------------------------------------------------------------------------- /rsalor/weights/compute_weights.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | import os.path 4 | from typing import List 5 | import glob 6 | import numpy as np 7 | import ctypes 8 | 9 | 10 | # Main ------------------------------------------------------------------------- 11 | def compute_weights( 12 | msa_path: str, 13 | msa_len: int, 14 | msa_depth: int, 15 | seqid: float=0.80, 16 | count_target_sequence: bool=True, 17 | num_threads: int=1, 18 | verboses: bool=False, 19 | ) -> List[float]: 20 | """Compute weights for all sequences of an MSA. 21 | Use C++ backend for time-performance. Implementation inspired from python package 'pycofitness'. 22 | 23 | Arguments: 24 | msa_path (str): path to msa '.fasta' file 25 | msa_len (int): length of the MSA (length of target sequence) 26 | msa_depth (int): depth of the MSA (number of sequences in the MSA) 27 | seqid (float): sequence identity threshold to consider two sequences as similar (default=0.80) 28 | count_target_sequence (bool): count target sequence in weights computations 29 | num_threads (int): number of threads (CPUs) used by C++ backend (default=1) 30 | verboses (bool): set True to log steps of execution (default=False) 31 | 32 | Return: 33 | weights (List[float]) 34 | """ 35 | 36 | # Guardians 37 | assert msa_path.endswith(".fasta"), f"ERROR in compute_weights(): msa_path='{msa_path}' should end with '.fasta'." 38 | assert os.path.exists(msa_path), f"ERROR in compute_weights('{msa_path}'): msa_path='{msa_path}' files does not exist." 39 | assert 0.0 < seqid < 1.0, f"ERROR in compute_weights('{msa_path}'): seqid={seqid} (for clustering to compute weights) should be in [0, 1] excluded." 40 | assert num_threads > 0, f"ERROR in compute_weights('{msa_path}'): num_threads={num_threads} should be stricktly positive." 41 | 42 | # Find C++ computeWeightsBackend compiled executable file 43 | path_prefix = os.path.join(os.path.dirname(__file__), "lib_computeWeightsBackend*") 44 | backend_so_paths = glob.glob(path_prefix) 45 | try: 46 | BACKEND_SO_PATH = backend_so_paths[0] 47 | except IndexError: 48 | error_log = "ERROR in compute_weights(): C++ computeWeightsBackend '.so' library path not found.\n" 49 | error_log += f" * Unable to find C++ computeWeightsBackend '.so' library path in '{path_prefix}'\n" 50 | error_log += " * Please install the pip package or compile the C++ code." 51 | raise ValueError(error_log) 52 | 53 | # Init C++ bridge 54 | computeWeightsBackend = ctypes.CDLL(BACKEND_SO_PATH) 55 | computeWeightsFunction = computeWeightsBackend.computeWeightsBackend 56 | computeWeightsFunction.argtypes = ( 57 | ctypes.c_char_p, # msa_path 58 | ctypes.c_uint, # msa_len 59 | ctypes.c_uint, # msa_depth 60 | ctypes.c_float, # seqid 61 | ctypes.c_bool, # count_target_sequence 62 | ctypes.c_uint, # num_threads 63 | ctypes.c_bool # verboses 64 | ) 65 | computeWeightsFunction.restype = ctypes.POINTER(ctypes.c_float * msa_depth) 66 | freeWeights = computeWeightsBackend.freeWeights 67 | #freeWeights.argtypes # not need to define argtypes ??? 68 | freeWeights.restype = None 69 | 70 | # Run backend 71 | weights_ptr = computeWeightsFunction( 72 | msa_path.encode('utf-8'), 73 | msa_len, 74 | msa_depth, 75 | seqid, 76 | count_target_sequence, 77 | num_threads, 78 | verboses, 79 | ) 80 | 81 | # Convert to list 82 | weights = np.zeros((msa_depth), dtype=np.float32) 83 | for i, x in enumerate(weights_ptr.contents): 84 | weights[i]= x 85 | 86 | # Free memory 87 | weights_ptr_casted = ctypes.cast(weights_ptr, ctypes.POINTER(ctypes.c_void_p)) 88 | freeWeights(weights_ptr_casted) 89 | 90 | # Return 91 | return weights 92 | 93 | 94 | def write_weights(weights: List[float], weights_path: str) -> None: 95 | """Read weights list from a file.""" 96 | 97 | # Guardians 98 | assert os.path.isdir(os.path.dirname(weights_path)), f"ERROR in write_weights(): directory of weights_path='{weights_path}' does not exist." 99 | assert len(weights) > 0, f"ERROR in write_weights(): weigths list can not be of length zero." 100 | 101 | # Write 102 | weights_str = "\n".join([str(w) for w in weights]) 103 | with open(weights_path, "w") as fs: 104 | fs.write(weights_str) 105 | 106 | 107 | def read_weights(weights_path: str) -> List[float]: 108 | """Write weights list to a file.""" 109 | 110 | # Guardians (for input) 111 | assert os.path.isfile(weights_path), f"ERROR in read_weights(): weights_path='{weights_path}' file does not exist." 112 | 113 | # Read from file 114 | with open(weights_path, "r") as fs: 115 | lines = fs.readlines() 116 | 117 | # Parse 118 | weights: List[float] = [] 119 | for i, line in enumerate(lines): 120 | if len(line) > 1 and line[0] != "#": 121 | try: 122 | weights.append(float(line)) 123 | except: 124 | line = line.replace('\n', '') 125 | error_log = f"ERROR in read_weights(): failed to parse line {i+1} / {len(lines)} as a float." 126 | error_log += f" * weights_path='{weights_path}'" 127 | error_log += f" * line='{line}'" 128 | raise ValueError(error_log) 129 | if len(weights) == 0: 130 | raise ValueError(f"ERROR in read_weights(): no parsable weights line found in weights_path='{weights_path}'.") 131 | return weights -------------------------------------------------------------------------------- /rsalor/weights/include/msa.h: -------------------------------------------------------------------------------- 1 | #ifndef MSA_H 2 | #define MSA_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include // For uint8_t type 12 | #include // Standard C++ multi-threading 13 | //#include // To time code execution 14 | 15 | class MSA { 16 | protected: 17 | const char* msa_path; 18 | unsigned int msa_len; 19 | unsigned int msa_depth; 20 | float seqid; 21 | bool count_target_sequence; 22 | unsigned int num_threads; 23 | bool verbose; 24 | std::vector> seqs_int_form; 25 | std::vector weights; 26 | 27 | public: 28 | 29 | // Constructor 30 | MSA( 31 | const char* msa_path, 32 | unsigned int msa_len, 33 | unsigned int msa_depth, 34 | float seqid, 35 | bool count_target_sequence, 36 | unsigned int num_threads, 37 | bool verbose 38 | ); 39 | 40 | // Methods 41 | std::vector> readSequences(); 42 | std::vector computeWeights(); 43 | void countClustersInRange( 44 | const std::vector& range_indices, 45 | std::vector& thread_counts, 46 | const unsigned int start_loop 47 | ); 48 | 49 | // Getters 50 | float* getWeightsPointer(); 51 | unsigned int getDepth(); 52 | unsigned int getLength(); 53 | float getNeff(); 54 | 55 | }; 56 | #endif -------------------------------------------------------------------------------- /rsalor/weights/msa.cpp: -------------------------------------------------------------------------------- 1 | 2 | // Header ---------------------------------------------------------------------- 3 | #include "include/msa.h" 4 | 5 | // MSA: Constructor ------------------------------------------------------------ 6 | MSA::MSA( 7 | const char* m_msa_path, 8 | unsigned int const m_msa_len, 9 | unsigned int const m_msa_depth, 10 | float m_seqid, 11 | bool m_count_target_sequence, 12 | unsigned int m_num_threads, 13 | bool m_verbose 14 | ): 15 | msa_path(m_msa_path), 16 | msa_len(m_msa_len), 17 | msa_depth(m_msa_depth), 18 | seqid(m_seqid), 19 | count_target_sequence(m_count_target_sequence), 20 | num_threads(m_num_threads), 21 | verbose(m_verbose) 22 | { 23 | // Read MSA 24 | if(this->verbose) { 25 | std::cout << " - RSALOR (C++ backend): read sequences from file." << std::endl; 26 | } 27 | this->seqs_int_form = readSequences(); 28 | 29 | // Compute weights 30 | if(this->verbose) { 31 | std::cout << " - RSALOR (C++ backend): compute sequences weights." << std::endl; 32 | } 33 | this->weights = this->computeWeights(); 34 | } 35 | 36 | // Parse MSA sequences from file ---------------------------------------------- 37 | std::vector> MSA::readSequences() 38 | { 39 | 40 | // Init residues mapping to int 41 | std::unordered_map res_mapping; 42 | res_mapping['A'] = 0; res_mapping['C'] = 1; res_mapping['D'] = 2; 43 | res_mapping['E'] = 3; res_mapping['F'] = 4; res_mapping['G'] = 5; 44 | res_mapping['H'] = 6; res_mapping['I'] = 7; res_mapping['K'] = 8; 45 | res_mapping['L'] = 9; res_mapping['M'] = 10; res_mapping['N'] = 11; 46 | res_mapping['P'] = 12; res_mapping['Q'] = 13; res_mapping['R'] = 14; 47 | res_mapping['S'] = 15; res_mapping['T'] = 16; res_mapping['V'] = 17; 48 | res_mapping['W'] = 18; res_mapping['Y'] = 19; res_mapping['-'] = 20; 49 | res_mapping['.'] = 20; res_mapping['~'] = 20; res_mapping['B'] = 20; 50 | res_mapping['J'] = 20; res_mapping['O'] = 20; res_mapping['U'] = 20; 51 | res_mapping['X'] = 20; res_mapping['Z'] = 20; 52 | 53 | // Init 54 | std::vector> seqs_int_form; 55 | std::ifstream msa_file_stream(this->msa_path); 56 | std::string current_line; 57 | 58 | // Check file streaming 59 | if(msa_file_stream.fail()){ 60 | std::cerr << "ERROR in MSA (C++ backend): Unable to open file." << this->msa_path << std::endl; 61 | throw std::runtime_error("Unable to open file containing the MSA data\n"); 62 | } 63 | 64 | // Loop on lines of the file 65 | while(std::getline(msa_file_stream, current_line)){ 66 | if(!current_line.empty() && current_line[0] != '>') { // Skip header and empty lines 67 | std::vector current_seq_int; 68 | current_seq_int.reserve(this->msa_len); // optimize by putting the vector in the correct size which is known 69 | for (char c : current_line) { 70 | current_seq_int.push_back(res_mapping.at(toupper(c))); 71 | } 72 | seqs_int_form.push_back(current_seq_int); 73 | } 74 | } 75 | 76 | // Return 77 | return seqs_int_form; 78 | } 79 | 80 | // Assign weights for all sequences based on clusters -------------------------- 81 | 82 | // Compute sequences weight 83 | std::vector MSA::computeWeights(){ 84 | 85 | // Init counts (all threads) 86 | std::vector counts(this->msa_depth, 1); 87 | 88 | // Count or ignore first sequence for weights computations by starting loop at 0 or 1 89 | unsigned int start_loop = this->count_target_sequence ? 0 : 1; 90 | 91 | // Initialize the per-thread counts vectors 92 | std::vector> thread_counts( 93 | num_threads, std::vector(this->msa_depth, 0) 94 | ); 95 | 96 | // Separate indices in chunks for each thread 97 | // * Trick: Since we only loop on half (i, j)-matrix (j < i), first i iterations will stop much earlier than last, 98 | // so we distribute i indices evenly across threads, so they all terminate approximatively at the same time 99 | std::vector> threads_indices(num_threads); 100 | for (unsigned int i = start_loop; i < this->msa_depth; ++i) { 101 | unsigned int thread_id = i % num_threads; 102 | threads_indices[thread_id].push_back(i); 103 | } 104 | 105 | // Manage multi-threading 106 | std::vector threads; 107 | for (unsigned int t = 0; t < num_threads; ++t) { 108 | threads.emplace_back( // ok here some magic 109 | [this, &threads_indices, &thread_counts, t, start_loop]() { 110 | countClustersInRange(threads_indices[t], thread_counts[t], start_loop); // compute cluster by chunks 111 | }); 112 | } 113 | for (auto& thread : threads) { 114 | thread.join(); 115 | } 116 | 117 | // Merge thread counts into global counts 118 | for (const auto& thread_count : thread_counts) { 119 | for (unsigned int i = 0; i < this->msa_depth; ++i) { 120 | counts[i] += thread_count[i]; 121 | } 122 | } 123 | 124 | // Convert counts to weights 125 | std::vector weights(this->msa_depth); 126 | for(unsigned int i = 0; i < this->msa_depth; ++i){ 127 | weights[i] = 1.f/ static_cast(counts[i]); 128 | } 129 | 130 | // Remove first sequences weight (that was initally assigned to 1.0) 131 | if(!this->count_target_sequence) { 132 | weights[0] = 0.f; 133 | } 134 | 135 | // Return 136 | return weights; 137 | } 138 | 139 | void MSA::countClustersInRange( 140 | const std::vector& range_indices, 141 | std::vector& range_counts, 142 | const unsigned int start_loop 143 | ) 144 | { 145 | // Init 146 | unsigned int num_identical_residues; 147 | unsigned int identical_residues_thr = static_cast(this->seqid * this->msa_len); 148 | 149 | // Loop on range 150 | for (auto i : range_indices) { 151 | const auto& seq_i = this->seqs_int_form[i]; 152 | // Loop on other sequences j < i (half matrix because (i, i)=(j, i)) 153 | for (unsigned int j = start_loop; j < i; ++j) { 154 | const auto& seq_j = this->seqs_int_form[j]; 155 | 156 | // Compute seqid(i, j) 157 | num_identical_residues = 0; 158 | for (unsigned int site = 0; site < this->msa_len; ++site) { 159 | num_identical_residues += seq_i[site] == seq_j[site]; 160 | } 161 | 162 | // Update if (i, j) in same cluster 163 | if (num_identical_residues > identical_residues_thr) { 164 | ++range_counts[i]; 165 | ++range_counts[j]; 166 | } 167 | } 168 | } 169 | } 170 | 171 | // Getter ---------------------------------------------------------------------- 172 | float* MSA::getWeightsPointer() { 173 | return weights.data(); 174 | } 175 | 176 | // Getters 177 | unsigned int MSA::getDepth() { 178 | return this->msa_depth; 179 | } 180 | 181 | unsigned int MSA::getLength() { 182 | return this->msa_len; 183 | } 184 | 185 | float MSA::getNeff() { 186 | return std::accumulate(this->weights.begin(), this->weights.end(), 0.f); 187 | } -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | 2 | # Imports ---------------------------------------------------------------------- 3 | from setuptools import setup, find_packages, Extension 4 | #from setuptools.command.build_ext import build_ext 5 | 6 | 7 | # Extensions ------------------------------------------------------------------- 8 | # Define extension (C++ code that need to be compiled) 9 | compute_weights_ext = Extension( 10 | 'rsalor.weights.lib_computeWeightsBackend', # name 11 | sources=[ # .cpp files 12 | 'rsalor/weights/computeWeightsBackend.cpp', 13 | 'rsalor/weights/msa.cpp', 14 | ], 15 | include_dirs=[ # .h directories 16 | 'rsalor/weights/include', 17 | ], 18 | extra_compile_args=['-std=c++11', '-O3'], # optimization and other flags 19 | extra_link_args=['-O3'], 20 | language='c++', 21 | ) 22 | 23 | 24 | # Setup ------------------------------------------------------------------------ 25 | setup( 26 | name="rsalor", 27 | version="1.1.1", 28 | author="Matsvei Tsishyn", 29 | author_email="matsvei.tsishyn@protonmail.com", 30 | description="Combines structural data (Relative Solvent Accessibility, RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins.", 31 | long_description=open("README.md").read(), 32 | long_description_content_type="text/markdown", 33 | url="https://github.com/3BioCompBio/RSALOR", 34 | python_requires=">=3.9", 35 | packages=find_packages(), 36 | install_requires=[ 37 | #'llvmlite>0.30.0', 38 | 'numpy', 39 | 'biopython>=1.75', 40 | ], 41 | ext_modules = [compute_weights_ext], 42 | classifiers=[ 43 | "Programming Language :: Python :: 3", 44 | "Programming Language :: C++", 45 | "Programming Language :: C", 46 | "License :: OSI Approved :: MIT License", 47 | "Operating System :: OS Independent", 48 | "Topic :: Scientific/Engineering :: Bio-Informatics", 49 | ], 50 | #entry_points={ 51 | # "console_scripts":[ 52 | # "rsalor=rsalor.main:run_mutation", 53 | # ], 54 | #}, 55 | ) --------------------------------------------------------------------------------