├── .gitignore
├── LICENCE
├── Logo.png
├── MANIFEST.in
├── README.md
├── rsalor
├── __init__.py
├── msa.py
├── rsa
│ ├── __init__.py
│ ├── rsa_biopython.py
│ ├── rsa_dssp.py
│ ├── rsa_music.py
│ └── rsa_solver.py
├── sequence
│ ├── __init__.py
│ ├── amino_acid.py
│ ├── fasta_reader.py
│ ├── mutation.py
│ ├── pairwise_alignment.py
│ └── sequence.py
├── structure
│ ├── __init__.py
│ ├── residue.py
│ └── structure.py
├── utils
│ ├── CSV.py
│ ├── __init__.py
│ ├── ali_to_fasta.py
│ ├── logger.py
│ └── utils.py
└── weights
│ ├── CMakeLists.txt
│ ├── __init__.py
│ ├── computeWeightsBackend.cpp
│ ├── compute_weights.py
│ ├── include
│ └── msa.h
│ └── msa.cpp
├── setup.py
└── test_data
├── 6acv_A_29-94.fasta
└── 6acv_A_29-94.pdb
/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore cache build and config files
2 | __pycache__/
3 | build/
4 | rsalor.egg-info/
5 | dist/
6 |
7 | # Ignore experiments
8 | src/
9 | tmp/
10 | fig/
11 | 0_*
12 |
13 | # Ignore compiled files
14 | *.so
15 | *.a
16 | *.o
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c), 2025, Matsvei Tsishyn
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/3BioCompBio/RSALOR/0fa6cdb14eab2b6c6c99bcb170d82b246d6231b0/Logo.png
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | # Include files required for building C++ extension
2 | include rsalor/weights/include/*
3 | recursive-include rsalor/weights *.cpp
4 |
5 | # Exclude files and directories that should not be in the package
6 | exclude Logo.png
7 | exclude fig/*
8 | exclude src/*
9 | exclude tmp/*
10 | exclude 0_*
11 | exclude conda-env.yml
12 | exclude test_data/*
13 | global-exclude *.py[cod]
14 | global-exclude __pycache__/*
15 | global-exclude *.so
16 | global-exclude *.a
17 | global-exclude *.o
18 |
19 | # Exclude build artifacts
20 | global-exclude rsalor/weights/build/*
21 | global-exclude *.egg-info/*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 | # RSALOR
3 |
4 | [PyPI package](https://pypi.org/project/rsalor/) [License: MIT](https://opensource.org/licenses/MIT)
5 |
6 |
![[RSALOR Logo]](Logo.png)
7 |
8 |
9 | `rsalor` is a Python package that computes the `RSA*LOR` score for each missense mutation in a protein. It combines multiple computational steps into a fast and user-friendly tool.
10 |
11 | **Please cite**:
12 | - [Matsvei Tsishyn, Pauline Hermans, Fabrizio Pucci, Marianne Rooman (2025). Residue conservation and solvent accessibility are (almost) all you need for predicting mutational effects in proteins. Bioinformatics, btaf322](https://doi.org/10.1093/bioinformatics/btaf322).
13 |
14 | - [Pauline Hermans, Matsvei Tsishyn, Martin Schwersensky, Marianne Rooman, Fabrizio Pucci (2024). Exploring evolution to uncover insights into protein mutational stability. Molecular Biology and Evolution, 42(1), msae267](https://doi.org/10.1093/molbev/msae267).
15 |
16 |
17 | ## Installation and Usage
18 |
19 | Installation with `pip`:
20 | ```bash
21 | pip install rsalor
22 | ```
23 |
24 | Make sure the first sequence in your MSA file is the target sequence to mutate.
25 | From directory `./test_data/` execute the following Python code:
26 | ```python
27 | # Import
28 | from rsalor import MSA
29 |
30 | # Log basic usage instructions and arguments of the package
31 | MSA.help()
32 |
33 | # Initialize MSA
34 | msa_path = "./6acv_A_29-94.fasta"
35 | pdb_path = "./6acv_A_29-94.pdb"
36 | chain = "A"
37 | msa = MSA(msa_path, pdb_path, chain, num_threads=8, verbose=True)
38 |
39 | # You can ignore structure and RSA by omitting the pdb_path argument
40 | #msa = MSA(msa_path, num_threads=8, verbose=True)
41 |
42 | # Get LOR and other scores for all mutations
43 | scores = msa.get_scores() # [{'mutation_fasta': 'S1A', 'mutation_pdb': 'SA1A', 'RSA': 61.54, 'LOR': 5.05, ...}, ...]
44 |
45 | # Or directly save scores to a CSV file
46 | msa.save_scores("./6acv_A_29-94_scores.csv", sep=";")
47 | ```
48 |
49 | ## Requirements
50 |
51 | - Python 3.9 or later
52 | - Python packages `numpy` and `biopython` (version 1.75 or later)
53 | - A C++ compiler that supports C++11 (such as GCC)
54 |
55 | ## Short description
56 |
57 | The `rsalor` package combines structural data (Relative Solvent Accessibility, RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins.
58 |
59 | It parses a Multiple Sequence Alignment (MSA), removes redundant sequences, and assigns a weight to each sequence based on sequence identity clustering. The package then computes the weighted Log Odd Ratio (LOR) and Log Ratio (LR) for each single missense mutation. Additionally, it calculates the Relative Solvent Accessibility (RSA) for each residue and combines the LOR/LR and RSA scores, as described in the reference paper. The package resolves discrepancies between the MSA's target sequence and the protein structure (e.g., missing residues in structure) by aligning the PDB structure with the MSA target sequence.
60 |
61 | The sign of RSALOR / LOR is defined such that the result of mutations from a highly represented amino acid to a less represented amino acid is positive, which generally corresponds to a decrease in protein stability or fitness. In other words, large positive values predict highly destabilizing / disruptive mutations, while values close to zero or negative predict positive or neutral mutations.
62 |
63 | ## Compile from source
64 |
65 | For performance reasons, `rsalor` uses a C++ backend to weight sequences in the MSA. The C++ code needs to be compiled to use it directly from source. To compile the code, follow these steps:
66 | ```bash
67 | git clone https://github.com/3BioCompBio/RSALOR # Clone the repository
68 | cd RSALOR/rsalor/weights/ # Navigate to the C++ code directory
69 | mkdir build # Create a build directory
70 | cd build # Enter the build directory
71 | cmake .. # Generate make files
72 | make # Compile the C++ code
73 | mv ./lib_computeWeightsBackend* ../ # Move the compiled file to the correct directory
74 | ```
75 |
--------------------------------------------------------------------------------
/rsalor/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.msa import MSA
2 |
--------------------------------------------------------------------------------
/rsalor/msa.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from os import cpu_count
5 | from typing import Union, List, Dict, Literal, Callable
6 | import tempfile
7 | import numpy as np
8 | from rsalor.utils import time_str
9 | from rsalor.sequence import AminoAcid
10 | from rsalor.sequence import Mutation
11 | from rsalor.sequence import Sequence
12 | from rsalor.sequence import FastaReader, FastaStream
13 | from rsalor.sequence import PairwiseAlignment
14 | from rsalor.structure import Structure
15 | from rsalor.weights import compute_weights, read_weights, write_weights
16 | from rsalor.utils import CSV
17 | from rsalor.utils import Logger
18 |
19 |
20 | # Main -------------------------------------------------------------------------
21 | class MSA:
22 | """Class MSA: Combines structural data (RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins.
23 | Main class of the RSALOR package.
24 | """
25 |
26 |
27 | # Constants ----------------------------------------------------------------
# File extensions recognised when deriving the default MSA name from the file name
ACCEPTED_EXTENTIONS = ["fasta", "a2m"]
# One state per amino acid of AminoAcid.ONE_2_ID, plus one extra state for the gap (stored last)
N_STATES = len(AminoAcid.ONE_2_ID) + 1
GAP_ID = N_STATES - 1
GAP_CHAR = AminoAcid.GAP_ONE
# Private copies of the one-letter-code -> id mapping, without and with the gap character
ONE_2_ID = dict(AminoAcid.ONE_2_ID)
ONE_2_ID_GAP = {**AminoAcid.ONE_2_ID, GAP_CHAR: GAP_ID}
35 |
36 |
37 | # Constructor --------------------------------------------------------------
def __init__(
        self,
        msa_path: str,
        pdb_path: Union[None, str]=None,
        chain: Union[None, str]=None,
        theta_regularization: float=0.01,
        n_regularization: float=0.0,
        count_target_sequence: bool=True,
        remove_redundant_sequences: bool=True,
        seqid_weights: Union[None, float]=0.80,
        min_seqid: Union[None, float]=0.35,
        num_threads: int=1,
        rsa_solver: Literal["biopython", "DSSP", "MuSiC"]="biopython",
        rsa_solver_path: Union[None, str]=None,
        trimmed_msa_path: Union[None, str]=None,
        allow_msa_overwrite: bool=False,
        weights_cache_path: Union[None, str]=None,
        rsa_cache_path: Union[None, str]=None,
        verbose: bool=False,
        disable_warnings: bool=False,
        name: Union[None, str]=None,
    ):
    # NOTE: this docstring doubles as the user-facing help text: MSA.help() prints it verbatim,
    # so any edit to it changes what users see.
    """\nRSA*LOR: Combines structural data (RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins.

    ----------------------------------------------------------------------------
    usage (Python):
    from rsalor import MSA # Import pip package
    msa = MSA('./msa1.fasta', './pdb1.pdb', 'A') # Initialize MSA object with an MSA file, a PDB file and corresponding chain in the PDB
    scores = msa.get_scores() # Compute RSA*LOR scores of all single-site missense mutations
    msa.save_scores("./msa1_scores.csv") # Save scores to a '.csv' file

    ----------------------------------------------------------------------------
    Main arguments:
    msa_path (str) path to MSA '.fasta' or '.a2m' file

    Structure arguments:
    pdb_path (None | str, None) path to PDB '.pdb' file (leave empty to ignore structure)
    chain (None | str, None) chain in the PDB to consider

    LOR/LR arguments:
    theta_regularization (float, 0.01) regularization term for LOR/LR at amino acid frequencies level
    n_regularization (float, 0.0) regularization term for LOR/LR at amino acid counts level
    count_target_sequence (bool, True) count target (first) sequence of the MSA in frequencies
    remove_redundant_sequences (bool, True) pre-process MSA to remove redundent sequences
    seqid_weights (None | float, 0.80) seqid threshold to consider two sequences in the same cluster for weighting (set None to ignore)
    min_seqid (None | float, 0.35) sequences which seqid with target sequence is below will be discarded (set None to ignore)
    num_threads (int, 1) number of threads (CPUs) for weights evaluation (in the C++ backend)

    RSA arguments:
    rsa_solver ('biopython'/'DSSP'/'MuSiC') used solver to compute RSA (DSSP and MuSiC require the software to be installed)
    rsa_solver_path (None | str, None) path to DSSP/MuSiC executable to compute RSA (leave empty if software is in system PATH)

    Files management arguments:
    trimmed_msa_path (None | str, None) set to save the trimmed + non-redundent MSA file (leave empty to ignore)
    allow_msa_overwrite (bool, False) allow to overwrite initial MSA file with the trimmed + non-redundent MSA file

    Cache arguments:
    weights_cache_path (None | str, None) set to read (is file exists) or write (is files does not exists) weights (leave empty to ignore)
    rsa_cache_path (None | str, None) set to read (is file exists) or write (is files does not exists) rsa values (leave empty to ignore)

    Logging arguments:
    verbose (bool, False) log execution steps
    disable_warnings (bool, False) disable logging of Warnings
    name (None | str, None) name of the MSA object (for logging)
    """

    # MSA path Guardians
    self.name = "" # Required for logs, so we set directly.
    self._verify_input_msa_path(msa_path)

    # Fill basic properties
    self.msa_path: str = msa_path
    self.msa_filename: str = os.path.basename(self.msa_path)
    self.name: str = name
    # Default name: the MSA file name stripped of the first matching accepted extension.
    # NOTE(review): if no accepted extension matches, name stays None here; presumably
    # _verify_input_msa_path() rules that out - confirm.
    if self.name is None:
        for extention in self.ACCEPTED_EXTENTIONS:
            if self.msa_filename.endswith(f".{extention}"):
                self.name = self.msa_filename.removesuffix(f".{extention}")
                break
    self.pdb_path: str = pdb_path
    self.chain: str = chain
    self.rsa_solver: str = rsa_solver
    self.rsa_solver_path: str = rsa_solver_path
    self.rsa_cache_path: str = rsa_cache_path
    self.theta_regularization: float = theta_regularization
    self.n_regularization: float = n_regularization
    self.remove_redundant_sequences: bool = remove_redundant_sequences
    self.count_target_sequence: bool = count_target_sequence
    self.seqid_weights: Union[None, float] = seqid_weights
    self.min_seqid: Union[None, float] = min_seqid
    self.num_threads: int = num_threads
    self.weights_cache_path: str = weights_cache_path
    self.trimmed_msa_path: Union[None, str] = trimmed_msa_path
    self.allow_msa_overwrite: bool = allow_msa_overwrite
    self.verbose: bool = verbose
    self.disable_warnings: bool = disable_warnings
    # The logger is created after self.name is final because its notes embed str(self).
    self.logger = Logger(verbose, disable_warnings, step_prefix="RSALOR", warning_note=f" in {self}", error_note=f" in {self}")

    # Too much CPU warning
    # cpu_count() may return None; that case triggers the same warning.
    num_cpu_total = cpu_count()
    if num_cpu_total is None or num_cpu_total < self.num_threads:
        self.logger.warning(f"num_threads={num_threads} exeeds total number of CPUs detected on current machine (num_cpu_total={num_cpu_total}).")

    # Init structure (if pdb_path is specified)
    self._init_structure()

    # Read sequences
    self._read_sequences()

    # Filter sequences that are too far from target sequence
    if min_seqid is not None:
        self._remove_far_seqid_sequences()

    # Align Structure and Sequence (if pdb_path is specified)
    self._align_structure_to_sequence()

    # Save trimmed MSA (if trimmed_msa_path is specified)
    # This must happen before _init_weights(), which passes this file to compute_weights().
    if self.trimmed_msa_path is not None:
        self.logger.step("save trimmed MSA (without target sequence gaps and non-std AAs, without redundent sequences) to a file.")
        self.logger.log(f" * trimmed_msa_path: '{trimmed_msa_path}'")
        self._verify_trimmed_seq_path()
        self.write(trimmed_msa_path)

    # Assign weights
    self._init_weights()

    # Counts and Frequencies
    self._init_counts()
166 |
167 |
168 | # Constructor dependencies -------------------------------------------------
169 | def _init_structure(self) -> None:
170 | """Parse PDB file and compute RSA (Relative Solvent Accessibility)."""
171 |
172 | # Case: pdb_path is None -> just log some warnings and continue
173 | if self.pdb_path is None:
174 | if self.chain is not None:
175 | warning_log = "pdb_path is not set, so structure and RSA are ignored."
176 | warning_log += f" However chain is set to '{self.chain}'."
177 | warning_log += f" Please specify pdb_path to consider structure and RSA."
178 | self.logger.warning(warning_log)
179 | if self.rsa_solver_path is not None:
180 | warning_log = "pdb_path is not set, so structure and RSA are ignored."
181 | warning_log += f" However rsa_solver_path is set to '{self.rsa_solver_path}'."
182 | warning_log += f" Please specify pdb_path to consider structure and RSA."
183 | self.logger.warning(warning_log)
184 | self.structure = None
185 | return None
186 |
187 | # Set Structure
188 | self.logger.step(f"parse PDB structure '{os.path.basename(self.pdb_path)}' (chain '{self.chain}') and compute RSA.")
189 | assert self.chain is not None, f"{self.error_prefix}: pdb_path='{self.pdb_path}' is set, so please set also the PDB chain to consider."
190 | self.structure = Structure(
191 | self.pdb_path,
192 | self.chain,
193 | rsa_solver=self.rsa_solver,
194 | rsa_solver_path=self.rsa_solver_path,
195 | rsa_cache_path=self.rsa_cache_path,
196 | verbose=self.verbose,
197 | )
198 |
199 | # Non assigned RSA warnings
200 | self._verify_rsa_values()
201 |
def _read_sequences(self) -> None:
    """Read all sequences of the MSA file, trimmed to the positions kept in the target sequence.

    Positions where the target (first) sequence holds a gap or a character that
    is not a key of `ONE_2_ID` are removed from every sequence. The mappings
    between original and trimmed positions (1-based, stored as strings) are
    kept in `fasta_to_fasta_trimmed` and `fasta_trimmed_to_fasta`. When
    `remove_redundant_sequences` is set, only the first occurrence of each
    distinct (trimmed) sequence is kept.

    Raises:
        ValueError: the target sequence holds no standard amino acid.
        AssertionError: fewer than 2 sequences were kept, or the target has length 0.
    """

    # Read MSA
    self.logger.step(f"read sequences from MSA file '{self.msa_filename}'.")

    # Inspect target sequence for gaps and non-standard AAs
    # Also set up alignment between MSA and trimmed MSA positions
    target_sequence = FastaReader.read_first_sequence(self.msa_path)
    self.fasta_to_fasta_trimmed: Dict[str, str] = {}
    self.fasta_trimmed_to_fasta: Dict[str, str] = {}
    tgt_seq_len = len(target_sequence)
    n_gaps = 0
    non_standard = []
    keep_position: List[bool] = []
    i_res_trimmed = 0
    for i_res, res in enumerate(target_sequence):
        if res in self.ONE_2_ID: # Standard AA -> keep
            fasta_res = str(i_res+1)
            fasta_trimmed_res = str(i_res_trimmed+1)
            self.fasta_to_fasta_trimmed[fasta_res] = fasta_trimmed_res
            self.fasta_trimmed_to_fasta[fasta_trimmed_res] = fasta_res
            i_res_trimmed += 1
            keep_position.append(True)
        elif res == self.GAP_CHAR: # Gap -> remove
            n_gaps += 1
            keep_position.append(False)
        else: # Other -> remove
            non_standard.append(res)
            keep_position.append(False)
    n_remove = n_gaps + len(non_standard)
    do_trimming = n_remove > 0
    n_keep = tgt_seq_len - n_remove
    if n_keep < 1:
        raise ValueError(f"{self.error_prefix}: target sequence does not contain any standard amino acid residues.")
    if do_trimming:
        self.logger.warning(f"target sequence contains some gaps or non-standard amino acids: MSA will be trimmed: {tgt_seq_len} -> {n_keep} (num trimmed positions: {n_remove}).")
    if n_gaps > 0:
        self.logger.warning(f"target sequence contains {n_gaps} gaps -> those positions will be trimmed.")
    if non_standard:
        non_std_str = "".join(non_standard)
        if len(non_std_str) > 10: # Abbreviate long lists in the warning
            non_std_str = non_std_str[0:7] + "..."
        self.logger.warning(f"target sequence contains {len(non_standard)} non-standard amino acids ('{non_std_str}') -> those positions will be trimmed.")

    # Read sequences from file
    # Redundancy is filtered while streaming to limit time and RAM on huge MSAs
    self.sequences: List[Sequence] = []
    deduplicate = self.remove_redundant_sequences
    seen_sequences = set()
    n_tot_sequences = 0
    fasta_stream = FastaStream(self.msa_path) # Caution with this one
    try:
        sequence = fasta_stream.get_next()
        while sequence is not None:
            self._verify_sequence_length(sequence, tgt_seq_len, n_tot_sequences)
            n_tot_sequences += 1
            if do_trimming:
                sequence.trim(keep_position)
            # A sequence that is empty after trimming is skipped. The stream is always
            # advanced below, so skipping can no longer loop forever on the same record.
            if not (do_trimming and len(sequence) == 0):
                if not deduplicate:
                    self.sequences.append(sequence)
                else:
                    sequence_str = sequence.sequence
                    if sequence_str not in seen_sequences:
                        seen_sequences.add(sequence_str)
                        self.sequences.append(sequence)
            sequence = fasta_stream.get_next()
    finally:
        # Release the stream even when a sequence fails verification
        fasta_stream.close()
    if deduplicate:
        self.logger.log(f" * remove redundant sequences : {n_tot_sequences} -> {len(self.sequences)}")

    # Verify MSA consisency
    assert self.depth > 1, f"{self.error_prefix}: MSA contains no or only 1 sequence."
    assert self.length > 0, f"{self.error_prefix}: MSA target (first) sequence is of length 0."

    # Log
    self.logger.log(f" * MSA length (tgt seq length) : {len(self.target_sequence)}")
    self.logger.log(f" * MSA depth (num sequences) : {len(self.sequences)}")

    # Set target sequence name
    self.target_sequence.name += " (trimmed MSA)"
293 |
294 | def _remove_far_seqid_sequences(self) -> None:
295 | """Filter sequences that are too far from target sequence by sequence identity."""
296 |
297 | # Guardian
298 | assert 0.0 <= self.min_seqid < 1.0, f"{self.error_prefix}: min_seqid={self.min_seqid} should be stricktly between 0 and 1."
299 |
300 | # Log
301 | self.logger.step(f"filter sequences that are too far from target sequence.")
302 |
303 | # Compute sequences to keep
304 | keep_sequences: List[Sequence] = []
305 | target_sequence_str = self.sequences[0].sequence
306 | for current_sequence in self.sequences:
307 | current_sequence_str = current_sequence.sequence
308 |
309 | # Compute seqid with target sequence
310 | current_seqid = self._seqid_to_target(target_sequence_str, current_sequence_str)
311 |
312 | if current_seqid > self.min_seqid:
313 | keep_sequences.append(current_sequence)
314 |
315 | # Update MSA sequences
316 | l1, l2 = len(self.sequences), len(keep_sequences)
317 | self.sequences = keep_sequences
318 |
319 | # Log results
320 | self.logger.log(f" * filter: {l1} -> {l2} (min_seqid={self.min_seqid:.2f})")
321 |
322 | # Guardians
323 | if l2 == 0:
324 | error_log = f"{self.error_prefix}: remove_far_seqid_sequences(): no sequence left."
325 | error_log += f"\n - No sequences left in the MSA after removing sequences that are too far from target sequence (by sequence indentity)"
326 | error_log += f"\n - min_seqid={self.min_seqid}: please increase value or set to None."
327 | raise ValueError(error_log)
328 |
329 | def _seqid_to_target(self, seq1: str, seq2: str) -> float:
330 | """Computes sequence identity between two sequences in the MSA."""
331 | gap = self.GAP_CHAR
332 | num_identical_residues = sum([int(aa1 == aa2) for aa1, aa2 in zip(seq1, seq2)])
333 | num_aligned_residues = sum([int(aa != gap) for aa in seq2])
334 | return num_identical_residues / num_aligned_residues
335 |
336 | def _align_structure_to_sequence(self) -> None:
337 | """Align residues position between PDB sequence and target sequence of the MSA."""
338 |
339 | # Init
340 | self.str_seq_align: PairwiseAlignment
341 | self.pdb_to_fasta_trimmed: Dict[str, str] = {}
342 | self.fasta_trimmed_to_pdb: Dict[str, str] = {}
343 | self.rsa_array: List[Union[None, float]] = [None for _ in range(self.length)]
344 | self.rsa_factor_array: List[Union[None, float]] = [None for _ in range(self.length)]
345 | if self.structure is None:
346 | return None
347 |
348 | # Log
349 | self.logger.step("align Structure (from PDB) and Sequence (from MSA).")
350 |
351 | # Init alignment
352 | self.str_seq_align = PairwiseAlignment(self.structure.sequence, self.target_sequence)
353 |
354 | # Map positions
355 | i_pdb, i_fasta_trimmed = 0, 0
356 | n_no_rsa, n_no_residue = 0, 0
357 | for aa_pdb, aa_fasta_trimmed in zip(self.str_seq_align.align1, self.str_seq_align.align2):
358 | if aa_pdb != self.GAP_CHAR and aa_fasta_trimmed != self.GAP_CHAR:
359 | residue = self.structure.chain_residues[i_pdb]
360 | fasta_trimmed_id = str(i_fasta_trimmed+1)
361 | self.pdb_to_fasta_trimmed[residue.resid] = fasta_trimmed_id
362 | self.fasta_trimmed_to_pdb[fasta_trimmed_id] = residue.resid
363 | self.rsa_array[i_fasta_trimmed] = residue.rsa
364 | if residue.rsa is None:
365 | n_no_rsa += 1
366 | if aa_pdb != self.GAP_CHAR:
367 | i_pdb += 1
368 | if aa_fasta_trimmed != self.GAP_CHAR:
369 | if aa_pdb == self.GAP_CHAR: # Position in MSA but not is PDB
370 | n_no_residue += 1
371 | i_fasta_trimmed += 1
372 |
373 | # Log
374 | n_assigned = len([rsa for rsa in self.rsa_array if rsa is not None])
375 | self.logger.log(f" * {n_assigned} / {len(self.rsa_array)} assigned RSA values for positions in trimmed MSA")
376 |
377 | # Set RSA factor
378 | self.set_rsa_factor()
379 |
380 | # Alignment Warnings
381 | if n_no_residue:
382 | self.logger.warning(f"{n_no_residue} / {len(self.rsa_array)} positions in trimmed MSA with no corresponding residues in PDB structure.")
383 | if n_no_rsa:
384 | self.logger.warning(f"{n_no_rsa} / {len(self.rsa_array)} positions in trimmed MSA corresponding to PDB residues without assigned RSA.")
385 | critical_alignment_warning = False
386 | if self.str_seq_align.mismatch > 0:
387 | critical_alignment_warning = True
388 | self.logger.warning(f"{self.str_seq_align.mismatch} / {len(self.rsa_array)} mismatch between trimmed MSA and PDB.", critical=True)
389 | if self.str_seq_align.internal_gap2 > 0:
390 | critical_alignment_warning = True
391 | self.logger.warning(f"{self.str_seq_align.internal_gap2} internal residues in the PDB do not correspond to a position in trimmed MSA.", critical=True)
392 | if critical_alignment_warning and not self.disable_warnings:
393 | self.str_seq_align.show(n_lines=80, only_critical_chunks=True)
394 | self.logger.warning("Please, make sure the first sequence in your MSA file is the target sequence to mutate.", critical=True)
395 |
def set_rsa_factor(self, rsa_factor_function: Union[Callable[[float], float], None]=None) -> None:
    """Recompute `rsa_factor_array` by applying a weighting function to each assigned RSA value.

    Arguments:
        rsa_factor_function: maps an RSA value to its factor w(RSA);
            `self.inverse_rsa` is used when None.

    Positions whose RSA is None are left untouched.
    """

    # Set default function
    if rsa_factor_function is None:
        rsa_factor_function = self.inverse_rsa

    # Log
    self.logger.step(f"set RSA factor (RSA -> w(RSA) with w='{rsa_factor_function.__name__}').")

    # Set RSA factor
    # Apply the selected function: the argument used to be logged but ignored in
    # favour of a hard-coded formula.
    # NOTE(review): assumes inverse_rsa implements that former formula,
    # 1 - min(rsa, 100) / 100, so the default result is unchanged - confirm.
    for i, rsa in enumerate(self.rsa_array):
        if rsa is not None:
            self.rsa_factor_array[i] = rsa_factor_function(rsa)
409 |
410 | def _init_weights(self) -> None:
411 | """Initialize weights for all sequences of the MSA (using C++ backend or from a cache file)."""
412 |
413 | # Case: keep all weights to 1
414 | if self.seqid_weights is None:
415 | # Put weight of first sequence to 0.0 manually to ignore it if required
416 | if not self.count_target_sequence:
417 | self.sequences[0].weight = 0.0
418 | return None
419 |
420 | # Read from cached file case
421 | if self.weights_cache_path is not None and os.path.isfile(self.weights_cache_path):
422 | self.logger.step("read weights from cached file.")
423 | self.logger.log(f" * weights_cache_path: '{self.weights_cache_path}'")
424 | weights = read_weights(self.weights_cache_path)
425 | if len(weights) != len(self.sequences):
426 | error_log = f"{self.error_prefix}: read_weights(weights_cache_path='{self.weights_cache_path}'): "
427 | error_log += f"\nnumber of parsed weights ({len(weights)}) does not match number of sequences ({len(self.sequences)}) in MSA."
428 | error_log += f"\n * Please remove current weights_cache file and re-run weights or set weights_cache_path to None."
429 | raise ValueError(error_log)
430 |
431 | # Re-compute case weights case
432 | else:
433 | self.logger.step("compute weights using C++ backend.")
434 | dt = (0.00000000015 * self.length * self.depth**2) / self.num_threads
435 | dt_str = time_str(dt)
436 | self.logger.log(f" * seqid (to compute clusters) : {self.seqid_weights}")
437 | self.logger.log(f" * expected computation-time : {dt_str} (with {self.num_threads} CPUs)")
438 |
439 | # Case when processed+trimmed MSA in saved
440 | if self.trimmed_msa_path is not None:
441 | weights = compute_weights(
442 | self.trimmed_msa_path,
443 | self.length,
444 | self.depth,
445 | self.seqid_weights,
446 | self.count_target_sequence,
447 | self.num_threads,
448 | self.verbose
449 | )
450 | # Case when processed+trimmed MSA is not saved
451 | else:
452 | with tempfile.TemporaryDirectory() as tmp_dir:
453 | tmp_msa_path = os.path.join(tmp_dir, f"{self.name}.fasta")
454 | self.write(tmp_msa_path)
455 | weights = compute_weights(
456 | tmp_msa_path,
457 | self.length,
458 | self.depth,
459 | self.seqid_weights,
460 | self.count_target_sequence,
461 | self.num_threads,
462 | self.verbose
463 | )
464 |
465 | # Verify coherence of computed weights
466 | if len(weights) != len(self.sequences):
467 | error_log = f"{self.error_prefix}: compute_weights(): "
468 | error_log += f"number of computed weights ({len(weights)}) does not match number of sequences ({len(self.sequences)}) in MSA."
469 | raise ValueError(error_log)
470 |
471 | # Assign weights
472 | for i, wi in enumerate(weights):
473 | self.sequences[i].weight = wi
474 |
475 | # Save weights in cache file if required
476 | if self.weights_cache_path is not None and not os.path.isfile(self.weights_cache_path):
477 | self.logger.step(f"save computed weights to file '{self.weights_cache_path}'.")
478 | self.logger.log(f" * weights_cache_path: '{self.weights_cache_path}'")
479 | write_weights(weights, self.weights_cache_path)
480 |
481 | def _init_counts(self) -> None:
482 | """Initialize residues counts and frequences from the MSA."""
483 |
484 | # Log
485 | self.logger.step("initialize residues counts and frequencies.")
486 |
487 | # Set Neff
488 | self.Neff: float = sum([sequence.weight for sequence in self.sequences])
489 | self.logger.log(f" * Neff (sum of weights): {self.Neff:.2f}")
490 |
491 | # Counts
492 | self.counts = np.zeros((self.length, self.N_STATES), float)
493 | for sequence in self.sequences:
494 | for l, aa in enumerate(sequence):
495 | aa_id = self.ONE_2_ID.get(aa, self.GAP_ID)
496 | self.counts[l, aa_id] += sequence.weight
497 | self.gap_counts = self.counts[:, self.GAP_ID]
498 | self.nongap_counts = self.Neff - self.gap_counts
499 |
500 | # Frequencies
501 | self.frequencies = self.counts / self.Neff
502 | self.gap_frequencies = self.frequencies[:, self.GAP_ID]
503 | self.nongap_frequencies = 1.0 - self.gap_frequencies
504 |
505 | # CI (Conservation Index)
506 | self.global_aa_frequencies = np.sum(self.frequencies, axis=0) / self.length
507 | self.CI = np.sqrt(0.5 * np.sum(((self.frequencies - self.global_aa_frequencies)[:, 0:20])**2, axis=1))
508 |
509 | # Manage regularization and LOR/LR scores
510 | self.update_regularization(self.theta_regularization, self.n_regularization)
511 |
def update_regularization(self, theta_regularization: float, n_regularization: float) -> "MSA":
    """Store new regularization parameters and recompute everything derived from them.

    Recomputed attributes: `frequencies_reg`, `gap_frequencies_reg`,
    `nongap_frequencies_reg`, `LR` and `LOR`.

    Arguments:
        theta_regularization (float): Regularization at the level of frequencies (add theta to all positional frequencies and normalize)
        n_regularization (float): Regularization at the level of counts (add n to all positional counts and normalize)

    Returns:
        The MSA object itself.

    Raises:
        AssertionError: a parameter is negative, or both are zero.
    """

    # Log
    self.logger.step("compute regularized frequencies.")
    self.logger.log(f" * theta_regularization : {theta_regularization}")
    self.logger.log(f" * n_regularization : {n_regularization}")

    # Regularization Guardians
    assert theta_regularization >= 0.0, f"{self.error_prefix}: theta_regularization={theta_regularization} should be positive."
    assert n_regularization >= 0.0, f"{self.error_prefix}: n_regularization={n_regularization} sould be positive."
    assert theta_regularization > 0.0 or n_regularization > 0.0, f"{self.error_prefix}: both theta_regularization and n_regularization can not be zero to avoid divering values."

    # Set regularization properties
    self.theta_regularization = theta_regularization
    self.n_regularization = n_regularization

    # Count-level regularization: n pseudo-counts are added to every state
    n_states = float(self.N_STATES)
    count_regularized = (self.counts + n_regularization) / (self.Neff + (n_states * n_regularization))

    # Frequency-level regularization: mix with the uniform distribution over states
    uniform_share: float = theta_regularization / n_states
    data_share: float = 1.0 - theta_regularization
    self.frequencies_reg = data_share * count_regularized + uniform_share

    # Compute dependent values
    self.gap_frequencies_reg = self.frequencies_reg[:, self.GAP_ID]
    self.nongap_frequencies_reg = 1.0 - self.gap_frequencies_reg

    # Set LOR and LR
    odds = self.frequencies_reg / (1.0 - self.frequencies_reg)
    self.LR = np.log(self.frequencies_reg)
    self.LOR = np.log(odds)

    return self
551 |
552 |
553 | # Base Properties ----------------------------------------------------------
@classmethod
def help(cls) -> None:
    """Print the docstring of the class constructor, which serves as usage text."""
    usage = cls.__init__.__doc__
    print(usage)
558 |
def __str__(self) -> str:
    """Return a short label of the form MSA('<name>')."""
    return "MSA('{}')".format(self.name)
561 |
def __iter__(self):
    """Return an iterator over self.sequences, in stored order."""
    sequences = self.sequences
    return iter(sequences)
564 |
def __getitem__(self, id: int) -> str:
    """Return the entry of self.sequences found at index 'id'.

    NOTE(review): the return annotation says 'str', but the element stored in
    self.sequences is returned unchanged, whatever its type.
    """
    sequences = self.sequences
    return sequences[id]
567 |
@property
def target_sequence(self) -> Sequence:
    """First entry of self.sequences (index 0), used as the target sequence."""
    sequences = self.sequences
    return sequences[0]
571 |
@property
def length(self) -> int:
    """Number of positions of the MSA, measured as the length of the target sequence."""
    target = self.target_sequence
    return len(target)
576 |
@property
def depth(self) -> int:
    """Number of entries in self.sequences (sequences of the MSA)."""
    sequences = self.sequences
    return len(sequences)
581 |
@property
def error_prefix(self) -> str:
    """Prefix for error messages: 'ERROR' wrapped in ANSI red escape codes, followed by str(self)."""
    return "\033[91mERROR\033[0m in {}".format(self)
586 |
@staticmethod
def inverse_rsa(rsa_value: float) -> float:
    """Return 1 - RSA/100, with the RSA value capped at 100 before the conversion.

    Values below 0 are not capped, so they give a result above 1.
    """
    capped = 100.0 if 100.0 < rsa_value else rsa_value
    return 1.0 - capped / 100.0
590 |
591 | # Scores (such as LOR) Properties ------------------------------------------
def get_frequency(self, residue_id: int, amino_acid_one_char: str, regularized: bool=True):
    """Look up the frequency of one amino acid (or gap) at one position.

    NOTE: residue_id follows the FASTA convention (first position is 1) on the trimmed MSA;
    it is converted to a 0-based row index by subtracting 1.

    Arguments:
        residue_id (int): position index in fasta convention (first residue is 1)
        amino_acid_one_char (str): amino acid one-letter-code or gap code '-' (a key of ONE_2_ID_GAP)
        regularized (bool): True to read the regularized frequencies, False for the raw ones

    Raises:
        KeyError: if amino_acid_one_char is not a key of ONE_2_ID_GAP
    """
    matrix = self.frequencies_reg if regularized else self.frequencies
    row = residue_id - 1
    column = self.ONE_2_ID_GAP[amino_acid_one_char]
    return matrix[row, column]
606 |
def eval_mutations(
        self,
        mutations_list: List[str],
        mutations_reference: Literal["fasta_trimmed", "fasta", "pdb"]="fasta_trimmed",
        metric: Literal["LOR", "LR"]="LOR",
        use_rsa_factor: bool=False,
        disable_wt_warning: bool=False,
    ) -> List[float]:
    """Score each mutation of mutations_list with the LOR or LR matrix.

    The score of a mutation is the matrix value of its wild-type amino acid
    minus the matrix value of its mutant amino acid, both read at the
    mutated position (wt - mt).

    NOTE: mutations can be given in 3 different references:
        - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference
        - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs)
        - 'pdb': residues are numbered as in the PDB file
    Mutations given as 'fasta' or 'pdb' are first translated to 'fasta_trimmed'.

    Arguments:
        mutations_list (List[str]): mutations as strings (first char: wt amino acid, last char: mutant amino acid, residue id in between)
        mutations_reference (str): "fasta_trimmed", "fasta" or "pdb"
        metric (str): "LOR" or "LR"
        use_rsa_factor (bool): True to multiply each score by the RSA factor of its position
        disable_wt_warning (bool): True to skip the warning logged when the wt amino acid differs from the target sequence

    Returns:
        One score per mutation, in input order. With use_rsa_factor, an entry
        is None when the RSA factor of its position is None.

    Raises:
        AssertionError: on unknown metric / reference or position out of range
        ValueError: when a residue id can not be translated to 'fasta_trimmed'
    """

    # Pick the score matrix matching the requested metric
    ALLOWED_METRICS = ["LOR", "LR"]
    assert metric in ALLOWED_METRICS, f"{self.error_prefix}.eval_mutations(): metric='{metric}' should be in {ALLOWED_METRICS}."
    score_matrix = self.LOR if metric == "LOR" else self.LR

    # Translate every mutation to the 'fasta_trimmed' reference
    ALLOWED_MUTATIONS_TYPES = ["fasta_trimmed", "fasta", "pdb"]
    assert mutations_reference in ALLOWED_MUTATIONS_TYPES, f"{self.error_prefix}: mutations_reference='{mutations_reference}' sould be in {ALLOWED_MUTATIONS_TYPES}."
    trimmed_names = mutations_list
    if mutations_reference in ("fasta", "pdb"):
        from_pdb = mutations_reference == "pdb"
        residues_map = self.pdb_to_fasta_trimmed if from_pdb else self.fasta_to_fasta_trimmed
        trimmed_names = []
        for mutation in mutations_list:
            wt, resid, mt = mutation[0], mutation[1:-1], mutation[-1]
            if resid in residues_map:
                trimmed_names.append(wt + residues_map[resid] + mt)
                continue
            # Unknown residue id: explain the likely causes and stop
            error_parts = [
                f"{self.error_prefix}.eval_mutations():",
                f"\nMutation '{mutation}' can not be converted from '{mutations_reference}' reference to 'fasta_trimmed' reference.",
                f"\n - residue '{resid}' may be outside of the range of the MSA",
            ]
            if from_pdb:
                error_parts.append(f"\n - residue '{resid}' may be missing in the PDB structure")
            else:
                error_parts.append(f"\n - residue '{resid}' may be a gap or a non-standard amino acid in the target sequence of initial MSA")
            raise ValueError("".join(error_parts))
    parsed_mutations = [Mutation(name) for name in trimmed_names]

    # Score every mutation: matrix[wt] - matrix[mt] at its position
    dE_arr = []
    for index, mutation in enumerate(parsed_mutations):
        assert 1 <= mutation.position <= self.length, f"{self.error_prefix}.eval_mutations(): position of mutation='{mutation}' is out of range of target sequence of the MSA."
        row = mutation.position - 1
        if not disable_wt_warning:
            aa_target = self.target_sequence[row]
            aa_mutation = mutation.wt_aa.one
            # Warn when the declared wt amino acid is not the one of the target sequence
            if aa_mutation != aa_target:
                mutation_description = f"'{mutation}'"
                if mutations_reference != "fasta_trimmed":
                    mutation_description = f"{mutation_description} ('{mutations_list[index]}' in '{mutations_reference}' reference)"
                self.logger.warning(f"eval_mutations(): mutation {mutation_description}: wt-aa does not match target sequence aa '{aa_target}'.")
        dE_arr.append(score_matrix[row, mutation.wt_aa.id] - score_matrix[row, mutation.mt_aa.id])

    # Optionally weight each score by the RSA factor of its position
    if use_rsa_factor:
        for index, mutation in enumerate(parsed_mutations):
            rsa_factor = self.rsa_factor_array[mutation.position - 1]
            dE_arr[index] = None if rsa_factor is None else rsa_factor * dE_arr[index]

    return dE_arr
689 |
def get_scores(self, round_digit: Union[None, int]=None, log_results: bool=False,) -> List[dict]:
    """Build one score entry per (position of the target sequence, amino acid from AminoAcid.get_all()).

    NOTE: each mutation is named in 3 different references:
        - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference
        - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs)
        - 'pdb': residues are numbered as in the PDB file (None when the position has no PDB residue id)

    Arguments:
        round_digit (None | int): when set, float-like values are rounded to this number of digits
        log_results (bool): True to display the first 40 entries as a table

    output: List of dictionaries with the keys
        mutation_fasta, mutation_fasta_trimmed, mutation_pdb, gap_freq, wt_freq,
        mt_freq, CI, RSA, LOR, LR, RSA*LOR, RSA*LR
        such as [{mutation_fasta: 'A13G', LOR: 0.4578, ...}, ...]
        'RSA*LOR' and 'RSA*LR' are None when the RSA factor of the position is None.
    """

    # Log
    self.logger.step("compute scores for all single-site mutations.")

    # One entry per position and per candidate mutant amino acid
    all_aas = AminoAcid.get_all()
    scores = []
    for i, wt in enumerate(self.target_sequence.sequence):

        # Values shared by all mutations at this position
        wt_i = AminoAcid.ONE_2_ID[wt]
        resid_fasta_trimmed = str(i + 1)
        resid_fasta = self.fasta_trimmed_to_fasta[resid_fasta_trimmed]
        resid_pdb = self.fasta_trimmed_to_pdb.get(resid_fasta_trimmed, None)
        rsa = self.rsa_array[i]
        rsa_factor = self.rsa_factor_array[i]
        conservation = self.CI[i]
        gap_freq = self.gap_frequencies[i]
        wt_freq = self.frequencies[i, wt_i]
        has_rsa_factor = rsa_factor is not None

        for mt_aa in all_aas:
            mt, mt_i = mt_aa.one, mt_aa.id
            mt_freq = self.frequencies[i, mt_i]
            lor = self.LOR[i, wt_i] - self.LOR[i, mt_i]
            lr = self.LR[i, wt_i] - self.LR[i, mt_i]
            scores.append({
                "mutation_fasta": wt + resid_fasta + mt,
                "mutation_fasta_trimmed": wt + resid_fasta_trimmed + mt,
                "mutation_pdb": None if resid_pdb is None else wt + resid_pdb + mt,
                "gap_freq": gap_freq,
                "wt_freq": wt_freq,
                "mt_freq": mt_freq,
                "CI": conservation,
                "RSA": rsa,
                "LOR": lor,
                "LR": lr,
                "RSA*LOR": rsa_factor * lor if has_rsa_factor else None,
                "RSA*LR": rsa_factor * lr if has_rsa_factor else None,
            })

    # Round numeric values if required (None values are left untouched)
    if round_digit is not None:
        rounded_props = ("gap_freq", "wt_freq", "mt_freq", "CI", "RSA", "LOR", "LR", "RSA*LOR", "RSA*LR")
        for score in scores:
            for prop in rounded_props:
                if score[prop] is None:
                    continue
                score[prop] = round(score[prop], round_digit)

    # Show the beginning of the table
    if log_results:
        scores_csv = CSV(list(scores[0].keys()), name=self.name)
        scores_csv.add_entries(scores[0:40])
        scores_csv.show(n_entries=40, max_colsize=23)

    return scores
764 |
def save_scores(
        self,
        scores_path: str,
        round_digit: Union[None, int]=None,
        sep: str=";",
        missing_value: Union[None, str]="XXX",
        log_results: bool=False
    ) -> List[dict]:
    """Compute the scores of all single-site mutations with get_scores() and write them as a '.csv' file.

    NOTE: each mutation is named in 3 different references:
        - 'fasta': residues are numbered using the FASTA convention (first residue is 1) using the input MSA target sequence as reference
        - 'fasta_trimmed': residues are numbered using the FASTA convention from the trimmed MSA (without target sequence gaps and non-std AAs)
        - 'pdb': residues are numbered as in the PDB file

    Arguments:
        scores_path (str): output path; nothing is written when it is None
        round_digit (None | int): forwarded to get_scores()
        sep (str): separator set on the CSV object
        missing_value (None | str): replacement for None values in the CSV entries (None to keep them)
        log_results (bool): True to display the first 40 entries as a table

    output: the list of score dictionaries returned by get_scores():
        [{mutation_fasta: 'A13G', LOR: 0.4578, ...}, ...]
    """

    # Compute scores
    scores = self.get_scores(round_digit)

    # Log
    logger = self.logger
    logger.step("save scores to a file.")
    logger.log(f" * scores_path: '{scores_path}'")

    # Put the entries in a CSV object (columns follow the keys of the first entry)
    columns = list(scores[0].keys())
    scores_csv = CSV(columns, name=self.name)
    scores_csv.set_sep(sep)
    scores_csv.add_entries(scores)

    # Replace None by missing_value
    if missing_value is not None:
        for entry in scores_csv:
            for column in columns:
                if entry[column] is not None:
                    continue
                entry[column] = missing_value

    # Log
    if log_results:
        scores_csv.show(n_entries=40, max_colsize=23)

    # Save and return
    if scores_path is not None:
        scores_csv.write(scores_path)
    return scores
812 |
813 |
814 | # IO Methods ---------------------------------------------------------------
def write(self, msa_path: str) -> "MSA":
    """Write all sequences to msa_path as one FASTA file and return self.

    The file content is the concatenation of to_fasta_string() of every
    sequence, in stored order.

    Raises:
        AssertionError: if the parent directory does not exist or msa_path does not end with '.fasta'
    """

    # Guardians
    msa_path = os.path.abspath(msa_path)
    parent_dir = os.path.dirname(msa_path)
    assert os.path.isdir(parent_dir), f"{self.error_prefix}.write(): directory of msa_path='{msa_path}' does not exists."
    assert msa_path.endswith(".fasta"), f"{self.error_prefix}.write(): msa_path='{msa_path}' should end with '.fasta'."

    # Write
    with open(msa_path, "w") as fs:
        fasta_records = (seq.to_fasta_string() for seq in self.sequences)
        fs.write("".join(fasta_records))
    return self
827 |
828 |
829 | # Guardians Dependencies ---------------------------------------------------
830 | # Helpers to verify coherence of inputs and current state
831 |
def _verify_input_msa_path(self, msa_path: str) -> None:
    """Check that the input msa_path exists and has an accepted FASTA extension.

    Raises:
        AssertionError: if msa_path does not exist
        ValueError: if msa_path ends with '.ali' (with a conversion hint) or if the text
            after its last '.' is not in ACCEPTED_EXTENTIONS
    """

    # Existence
    assert os.path.exists(msa_path), f"{self.error_prefix}: msa_path='{msa_path}' files does not exist."

    # '.ali' files get a dedicated message explaining how to convert them
    if msa_path.endswith(".ali"):
        raise ValueError("".join([
            f"{self.error_prefix}: msa_path='{msa_path}' should be in FASTA format.",
            f"\n * msa_path: '{msa_path}'",
            "\n * input msa_path is expected to be a MSA file in FASTA ('.fasta') format.",
            "\n * Please convert the MSA to '.fasta' with python script: ",
            "\nfrom rsalor.utils import ali_to_fasta",
            "\nali_to_fasta('./my_msa.ali', './my_msa.fasta')\n",
        ]))

    # Any other extension has to be one of the accepted ones
    extension = msa_path.rpartition(".")[2]
    if extension in self.ACCEPTED_EXTENTIONS:
        return
    raise ValueError(f"{self.error_prefix}: msa_path='{msa_path}' should be in FASTA format (with file extention in {self.ACCEPTED_EXTENTIONS}).")
852 |
def _verify_sequence_length(self, sequence: Sequence, target_length: int, i: int) -> None:
    """Check that a sequence has the same length as the target sequence.

    Arguments:
        sequence (Sequence): sequence to check
        target_length (int): expected length
        i (int): 0-based index of the sequence (reported as i+1 in the error message)

    Raises:
        ValueError: if the lengths differ; the sequence is shortened to 37 chars + '...' in
            the message when it is longer than 40 chars
    """
    observed_length = len(sequence)
    if observed_length == target_length:
        return

    # Keep the error message readable for long sequences
    seq_str = sequence.sequence
    if len(seq_str) > 40:
        seq_str = seq_str[0:37] + "..."
    raise ValueError(
        f"{self.error_prefix}._read_sequences(): msa_path='{self.msa_path}':"
        f"\n -> length of sequence [{i+1}] l={observed_length} ('{seq_str}') does not match length of target sequence l={target_length}."
    )
862 |
def _verify_trimmed_seq_path(self) -> None:
    """Check trimmed_msa_path and protect the input MSA file from being overwritten.

    Raises:
        AssertionError: if the directory of trimmed_msa_path does not exist or the path does
            not end with '.fasta'
        ValueError: if trimmed_msa_path designates the same file as msa_path while
            allow_msa_overwrite is not set
    """
    trimmed_msa_path = str(os.path.abspath(self.trimmed_msa_path))
    assert os.path.isdir(os.path.dirname(trimmed_msa_path)), f"{self.error_prefix}: directory of trimmed_msa_path='{trimmed_msa_path}' does not exists."
    assert trimmed_msa_path.endswith(".fasta"), f"{self.error_prefix}: trimmed_msa_path='{trimmed_msa_path}' should end with '.fasta'."

    # Compare absolute paths on both sides: trimmed_msa_path is absolute, so a
    # relative msa_path would otherwise never match and the guard would be skipped
    same_file = os.path.abspath(self.msa_path) == trimmed_msa_path
    if same_file and not self.allow_msa_overwrite:
        error_log = f"{self.error_prefix}: trimmed_msa_path='{trimmed_msa_path}' is same as input MSA path."
        error_log += "\nIf trimmed_msa_path is set, the trimmed MSA (without target sequence gaps and non-std AAs) will be saved to this path."
        error_log += "\nWARNING: This operation will overwrite initial input MSA file."
        error_log += "\nTo continue, set argument 'allow_msa_overwrite' to True."
        raise ValueError(error_log)
874 |
def _verify_rsa_values(self) -> None:
    """Log one warning when residues of the target chain have no RSA value (rsa is None).

    The warning reports how many of those residues are standard and how many
    are non-standard amino acids. Nothing is logged when every residue has a value.
    """
    residues = self.structure.chain_residues
    without_rsa = [residue for residue in residues if residue.rsa is None]
    norsa_std = sum(1 for residue in without_rsa if residue.amino_acid.is_standard())
    norsa_non_std = len(without_rsa) - norsa_std
    norsa = norsa_std + norsa_non_std
    if norsa <= 0:
        return

    warning_lines = [
        f"{norsa} / {len(self.structure.chain_residues)} residues with no assigned RSA values ({norsa_std} std and {norsa_non_std} non-std) in PDB target chain '{self.chain}'.",
        "\n -> This can be caused by non-standard AAs or missing atoms.",
        "\n -> For optimal RSA estimations, we highly recommend to 'repair' the PDB and standardize AAs.",
    ]
    self.logger.warning("".join(warning_lines))
890 |
--------------------------------------------------------------------------------
/rsalor/rsa/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.rsa.rsa_solver import RSASolver
2 | from rsalor.rsa.rsa_biopython import RSABiopython
3 | from rsalor.rsa.rsa_dssp import RSADSSP
4 | from rsalor.rsa.rsa_music import RSAMuSiC
5 |
--------------------------------------------------------------------------------
/rsalor/rsa/rsa_biopython.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import Dict
5 | from Bio.PDB import PDBParser
6 | from Bio.PDB.SASA import ShrakeRupley
7 | from rsalor.sequence import AminoAcid
8 | from rsalor.rsa.rsa_solver import RSASolver
9 |
10 | # RSABiopython -----------------------------------------------------------------
class RSABiopython(RSASolver):
    """
    RSABiopython(): RSA (Relative Solvent Accessibility) solver for '.pdb' files based on the python package biopython.
    Accessible surfaces are computed with Bio.PDB.SASA.ShrakeRupley (Shrake & Rupley "rolling ball" algorithm).
    doc: https://biopython.org/docs/dev/api/Bio.PDB.SASA.html

    usage:
    rsa_map = RSABiopython().run('./my_pdb.pdb')
    """

    # Constants ----------------------------------------------------------------
    # Maximal surface of each amino acid (3-letter code), used as denominator of the RSA.
    # Taken from https://pmc.ncbi.nlm.nih.gov/articles/PMC3836772/#pone-0080635-t001
    # NOTE(review): values look like areas expressed in units of 100 A^2, which
    # would make 'ASA / value' a percentage -- confirm against the reference table.
    MAX_SURFACE_MAP = {
        "ALA": 1.29,
        "ARG": 2.74,
        "ASN": 1.95,
        "ASP": 1.93,
        "CYS": 1.67,
        "GLN": 2.23,
        "GLU": 2.25,
        "GLY": 1.04,
        "HIS": 2.24,
        "ILE": 1.97,
        "LEU": 2.01,
        "LYS": 2.36,
        "MET": 2.24,
        "PHE": 2.40,
        "PRO": 1.59,
        "SER": 1.55,
        "THR": 1.55,
        "TRP": 2.85,
        "TYR": 2.63,
        "VAL": 1.74,
    }
    MAX_SURFACE_DEFAULT = 2.01 # mean value

    # Methods ------------------------------------------------------------------
    def __str__(self) -> str:
        return "RSASolver['biopython' (Shrake & Rupley algorithm)]"

    def execute_solver(self, pdb_path: str) -> Dict[str, float]:
        """Compute RSA with the biopython python package: Bio.PDB.SASA: ShrakeRupley
        doc: https://biopython.org/docs/dev/api/Bio.PDB.SASA.html

        Only the first model of the structure is read. A residue is skipped
        when its computed surface is not a float.

        args:
        pdb_path (str): path to PDB file

        output:
        {resid: str => RSA: float} (such as {'A13': 48.57, ...})
        """

        # Parse PDB file
        structure_name = os.path.basename(pdb_path).removesuffix(".pdb")
        structure = PDBParser(QUIET=True).get_structure(structure_name, pdb_path)

        # Compute ASA at the residue level, with the default ShrakeRupley settings
        # (optional arguments left untouched: probe_radius, n_points, radii_dict)
        ShrakeRupley().compute(structure, level="R")

        # Convert to RSA and format
        first_model = structure[0]
        rsa_map: Dict[str, float] = {}
        for chain_obj in first_model:
            chain = chain_obj.id
            for residue in first_model[chain]:

                # Keep only residues with a numeric surface
                asa = residue.sasa
                if not isinstance(asa, float):
                    continue

                # 'resid' = {chain}{hetero flag}{sequence number} without spaces.
                # Biopython residue ids are (hetero flag, sequence number, insertion code);
                # the insertion code is not part of 'resid'.
                hetero_flag, sequence_number, _insertion_code = residue.id
                resid = f"{chain}{hetero_flag}{sequence_number}".replace(" ", "")

                # AA 3-letter code, mapped to its standard equivalent when one is known
                aa_three = residue.resname
                aa_three = AminoAcid._NON_STANDARD_AAS.get(aa_three, aa_three)

                rsa_map[resid] = asa / self.get_max_surf(aa_three)
        return rsa_map

    @classmethod
    def get_max_surf(cls, aa_three: str) -> float:
        """Return the maximal surface of an amino acid, or MAX_SURFACE_DEFAULT for unknown codes."""
        try:
            return cls.MAX_SURFACE_MAP[aa_three]
        except KeyError:
            return cls.MAX_SURFACE_DEFAULT
99 |
--------------------------------------------------------------------------------
/rsalor/rsa/rsa_dssp.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | import sys
5 | from typing import Dict, Union
6 | import tempfile
7 | from contextlib import contextmanager
8 | from Bio.PDB import PDBParser
9 | from Bio.PDB.DSSP import DSSP
10 | from rsalor.utils import find_file
11 | from rsalor.rsa.rsa_solver import RSASolver
12 |
13 | # RSADSSP ----------------------------------------------------------------------
class RSADSSP(RSASolver):
    """
    RSADSSP(): Solver for RSA (Relative Solvent Accessibility) from a '.pdb' file using DSSP software.

    usage:
    rsa_map = RSADSSP('./soft/DSSP/dssp').run('./my_pdb.pdb')
    """

    # Constants ----------------------------------------------------------------
    # Executable names tried after the user-provided executable_path
    CANDIDATES_PATHS = ["mkdssp", "dssp"]
    HELPER_LOG = """-------------------------------------------------------
RSA Solver: DSSP issue:
In order to solve Relative Solvent Accessiblity (RSA), RSALOR package uses:
Python package biopython -> interface with the DSSP algorithms (https://biopython.org/docs/1.75/api/Bio.PDB.DSSP.html).
The DSSP software (free for academic use) has to be installed on your computer.
Please install DSSP (https://swift.cmbi.umcn.nl/gv/dssp/) and specify the path to its executable or add it to system PATH.
DSSP source code can be found here: https://github.com/cmbi/hssp

NOTE: you can still use the RSALOR package without DSSP if you only want LOR values of the MSA without using RSA (just set pdb_path=None).
-------------------------------------------------------"""

    # Placeholder CRYST1 record (unit cell 1 x 1 x 1, angles 90, space group 'P 1', Z = 1).
    # It is built from the fixed-column layout of the PDB format and padded to
    # 80 columns, so the fields land in the columns parsers expect.
    DEFAULT_CRYST1_LINE = "CRYST1{:9.3f}{:9.3f}{:9.3f}{:7.2f}{:7.2f}{:7.2f} {:<11}{:>4}".format(
        1.0, 1.0, 1.0, 90.0, 90.0, 90.0, "P 1", "1"
    ).ljust(80) + "\n"

    # Methods ------------------------------------------------------------------
    def __str__(self) -> str:
        return "RSASolver['DSSP']"

    def execute_solver(self, pdb_path: str) -> Dict[str, float]:
        """Compute RSA by running DSSP.

        args:
        pdb_path (str): path to PDB file

        output:
        {resid: str => RSA: float} (such as {'A13': 48.57, ...})
        """

        # Init DSSP path (check DSSP executable existance only if software is executed)
        self._init_dssp_path()

        # Case: CRYST1 line is already present or this version of DSSP does not requires it
        pdb_with_cryst1_line = self._inject_cryst1_line(pdb_path)
        if pdb_with_cryst1_line is None:
            return self._run_dssp_backend(pdb_path)

        # Case: run DSSP on a modified copy of the PDB.
        # The copy lives in a temporary directory and keeps a '.pdb' extension:
        # the file is reopened by name (by biopython and by DSSP), which is not
        # portable for a still-open NamedTemporaryFile, and the extension is
        # what lets the file type be recognized.
        pdb_name = os.path.basename(pdb_path).removesuffix(".pdb")
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_pdb_path = os.path.join(tmp_dir, f"{pdb_name}.pdb")
            with open(tmp_pdb_path, "w") as fs:
                fs.write(pdb_with_cryst1_line)
            return self._run_dssp_backend(tmp_pdb_path)

    # Dependencies -------------------------------------------------------------
    def _init_dssp_path(self) -> str:
        """Find an existing executable file for DSSP, store it in self.dssp_path and return it."""
        if self.executable_path is not None:
            dssp_path_list = [self.executable_path] + self.CANDIDATES_PATHS
        else:
            dssp_path_list = self.CANDIDATES_PATHS
        dssp_path = find_file(dssp_path_list, is_software=True, name="DSSP", description=self.HELPER_LOG, verbose=self.verbose)
        self.dssp_path = dssp_path
        return dssp_path

    def _inject_cryst1_line(self, pdb_path: str) -> Union[None, str]:
        """Inject CRYST1 line in a PDB file if there is not one.
        -> If the executable path ends with 'mkdssp', return None (no injection needed)
        -> If a CRYST1 line is found before the first ATOM line, return None
        -> Else return the PDB content as a string, with a default CRYST1 line
           inserted right before the first ATOM line (content is returned
           unchanged when the file has no ATOM line)
        """

        # No need to inject CRYST1 line with mkdssp
        if self.dssp_path.endswith("mkdssp"):
            return None

        # Constants
        CRYST1_HEADER = "CRYST1"
        ATOM_HEADER = "ATOM"

        # Read lines: look for CRYST1 until the first ATOM line, then copy the rest as is
        new_lines = []
        injected = False
        with open(pdb_path, "r") as fs:
            for line in fs:
                if not injected:
                    if line.startswith(CRYST1_HEADER): # CRYST1 line is already here
                        return None
                    if line.startswith(ATOM_HEADER):
                        new_lines.append(self.DEFAULT_CRYST1_LINE)
                        injected = True
                new_lines.append(line)

        # Return pdb string with injected CRYST1 line
        return "".join(new_lines)

    def _run_dssp_backend(self, pdb_path: str) -> Dict[str, float]:
        """Run DSSP software with the BioPython interface.

        Only the first model of the structure is used. The relative
        accessibility reported by DSSP is multiplied by 100 and rounded to 4
        digits. Only numeric values are stored; for a repeated resid, the
        first occurrence decides.
        """

        # Parse PDB with BioPython
        pdb_name = os.path.basename(pdb_path).removesuffix(".pdb")
        structure = PDBParser(QUIET=True).get_structure(pdb_name, pdb_path)
        model = structure[0]

        # Run DSSP
        if not self.verbose: # Run DSSP with WARNINGS disabled
            with suppress_stderr():
                dssp = DSSP(model, pdb_path, dssp=self.dssp_path)
        else: # Run DSSP normally
            dssp = DSSP(model, pdb_path, dssp=self.dssp_path)

        # Parse Residues
        seen_resids = set()
        rsa_map: Dict[str, float] = {}
        for res_key in list(dssp.keys()):

            # Keys are (chain, residue id); Biopython residue ids are
            # (hetero flag, sequence number, insertion code)
            chain, (hetero_flag, sequence_number, _insertion_code) = res_key
            resid = f"{chain}{hetero_flag}{sequence_number}".replace(" ", "")
            if resid in seen_resids:
                continue
            seen_resids.add(resid)

            # Index 3 of a DSSP entry is the relative accessibility
            rsa = dssp[res_key][3]
            if isinstance(rsa, float):
                rsa_map[resid] = round(rsa * 100.0, 4)

        # Return
        return rsa_map
149 |
150 | # Just to delete WARNINGS from DSSP and BioPython ------------------------------
151 | # Because BioPython and DSSP do not provide an option to disable WARNINGS ...
@contextmanager
def suppress_stderr():
    """Redirect sys.stderr to the null device for the duration of the 'with' block.

    On exit, sys.stderr is set back to the object it referenced on entry and
    the null-device handle opened here is closed. Only that handle is closed,
    even if the body replaced sys.stderr with something else.
    """
    original_stderr = sys.stderr
    with open(os.devnull, "w") as devnull:
        sys.stderr = devnull
        try:
            yield
        finally:
            sys.stderr = original_stderr
--------------------------------------------------------------------------------
/rsalor/rsa/rsa_music.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import Dict
5 | import tempfile
6 | import subprocess
7 | from rsalor.utils import find_file
8 | from rsalor.rsa.rsa_solver import RSASolver
9 |
10 | # RSAMuSiC ---------------------------------------------------------------------
11 | class RSAMuSiC(RSASolver):
12 | """
13 | RSAMuSiC(): Solver for RSA (Relative Solvent Accessibility) from a '.pdb' file using MuSiC software.
14 |
15 | usage:
16 | rsa_map = RSAMuSiC('./soft/MuSiC/music').run('./my_pdb.pdb')
17 | """
18 |
19 | # Constants ----------------------------------------------------------------
20 |
21 | # Saved candidate paths to simplify execution on different machines
22 | CANDIDATES_PATHS = [
23 | "music", "music_retro", # MuSiC executables
24 | "/home/Softs/MuSiC-4.1/music", # Nautilus and Santorin
25 | ]
26 |
27 | # Helper: hint shown to the user to solve the problem: please install the software
28 | HELPER_LOG = """-------------------------------------------------------
29 | RSA Solver: MuSiC issue:
30 | In order to solve Relative Solvent Accessiblity (RSA), RSALOR package uses MuSiC software.
31 | MuSiC is our in house protein structure software (https://soft.dezyme.com/).
32 | Please install MuSiC and specify the path to its executable or add it to system PATH.
33 |
34 | Alternatively, if you do not have access to MuSiC, set rsa_solver='DSSP' and install DSSP (free for academic uses)
35 | DSSP: https://swift.cmbi.umcn.nl/gv/dssp/
36 |
37 | NOTE: you can still use the RSALOR package without MuSiC if you only want LOR values of the MSA without using RSA (just set pdb_path=None).
38 | -------------------------------------------------------"""
39 |
40 | # Methods ------------------------------------------------------------------
def __str__(self) -> str:
    """Return the display label of this solver."""
    label = "RSASolver['MuSiC']"
    return label
43 |
def execute_solver(self, pdb_path: str) -> Dict[str, float]:
    """Compute RSA by running MuSiC: 'music -cat'

    MuSiC is run in a temporary directory holding its 'path.in' configuration
    and its outputs; the generated '.cat' file is then parsed.

    args:
    pdb_path (str): path to PDB file

    output:
    {resid: str => RSA: float} (such as {'A13': 48.57, ...})

    raises:
    ValueError: if MuSiC exits with a non-zero code, if no '.cat' file is generated or if it can not be parsed
    AssertionError: if the parsed '.cat' file contains no RSA value
    """

    # Init MuSiC path (check MuSiC executable existance only if software is executed)
    self._init_music_path()

    # Using temporary directory
    with tempfile.TemporaryDirectory() as tmp_dir:

        # Init
        name = os.path.basename(pdb_path).removesuffix(".pdb")
        pdb_dir = os.path.abspath(os.path.dirname(pdb_path))
        path_in_path = os.path.join(tmp_dir, "path.in")
        cat_path = os.path.join(tmp_dir, f"{name}.cat")
        log_path = os.path.join(tmp_dir, f"log_{name}.txt")

        # Generate path.in file
        path_in = "\n".join([
            f"DATA {os.path.join(os.path.dirname(self.music_path), 'MuSiC/Data/')}",
            f"PDB {pdb_dir}/",
            f"OUTPUT {tmp_dir}/",
            f"CAT {tmp_dir}/\n"
        ])
        with open(path_in_path, "w") as fs:
            fs.write(path_in)

        # Adapt run to MuSiC 4.0 or 4.1 assuming version is specified in MuSiC folder name
        music_last_folder_name = os.path.basename(os.path.dirname(self.music_path))
        sidechain_parameter = "FULLATOM" if "4.1" in music_last_folder_name else ""

        # Build the argument list directly (instead of splitting a command
        # string) so that paths containing whitespace stay single arguments
        music_args = [self.music_path, "-cat", name]
        if sidechain_parameter:
            music_args.append(sidechain_parameter)
        music_args += [name, "-init", path_in_path, "-log", name]

        # Human-readable version of the command, only used in logs
        music_cmd = f"{self.music_path} -cat {name} {sidechain_parameter} {name} -init {path_in_path} -log {name}"
        if self.verbose:
            print(f" * run MuSiC command: '{music_cmd}'")

        # Run MuSiC cat
        process = subprocess.run(music_args, shell=False, capture_output=True, text=True)

        # Check execution errors
        if process.returncode != 0:
            self._log_music_execution_error(process, log_path, music_cmd)
            raise ValueError(f"ERROR in {self}.execute_solver(): execution of MuSiC failed.")
        if not os.path.isfile(cat_path):
            self._log_music_execution_error(process, log_path, music_cmd)
            raise ValueError(f"ERROR in {self}.execute_solver(): execution of MuSiC succeeded but no output '.cat' file is generated at '{cat_path}'.")

        # Parse MuSiC -cat output (keep the original error as the cause)
        try:
            rsa_map = self._parse_cat(cat_path)
        except Exception as error:
            raise ValueError(f"ERROR in {self}.execute_solver(): failed to parse RSA from generated file '{cat_path}'.") from error
        assert len(rsa_map) > 0, f"ERROR in {self}.execute_solver(): no valid RSA data found in file '{cat_path}'."

    # Return
    return rsa_map
111 |
112 | # Dependencies -------------------------------------------------------------
113 | def _init_music_path(self) -> str:
114 | """Find an existing executable file for MuSiC on the computer."""
115 | if self.executable_path is not None:
116 | music_path_list = [self.executable_path] + self.CANDIDATES_PATHS
117 | else:
118 | music_path_list = self.CANDIDATES_PATHS
119 | music_path = find_file(music_path_list, is_software=True, name="MuSiC", description=self.HELPER_LOG, verbose=self.verbose)
120 | self.music_path = music_path
121 |
122 | def _parse_cat(self, file_path: str) -> Dict[str, float]:
123 | """Parse music '.cat' file and return RSA mapping: {resid: str => RSA: float}."""
124 |
125 | # Guardians
126 | assert os.path.isfile(file_path), f"ERROR in {self}._parse_cat(): file_path='{file_path}' does not exists."
127 | assert file_path.endswith(".cat"), f"ERROR in {self}._parse_cat(): file_path='{file_path}' should end with '.cat'."
128 |
129 | # Read cat file
130 | with open(file_path, "r") as fs:
131 |
132 | # Skip lines before #RESIDUES section
133 | line = fs.readline()
134 | while line and not line.startswith("#RESIDUES"):
135 | if line.startswith("#RESIDUES"): break
136 | line = fs.readline()
137 | line = fs.readline()
138 |
139 | # Read #RESIDUES section
140 | rsa_map: Dict[str, float] = {}
141 | while line:
142 |
143 | # Break after #RESIDUES section ends
144 | if line.startswith("#"): break
145 |
146 | # Parse values from line
147 | resid = line[0:6].replace(" ", "")
148 | rsa = float(line[30:40])
149 |
150 | # Save values
151 | rsa_map[resid] = rsa
152 | line = fs.readline()
153 |
154 | # Sanity chech and return
155 | return rsa_map
156 |
157 | def _log_music_execution_error(self, music_run_process, log_path: str, music_cmd: str) -> None:
158 | print("\nERROR in MuSiC execution.")
159 | print(" * MuSiC command: ")
160 | print(f" $ {music_cmd}")
161 | print(" * Standard Output (stdout): ")
162 | print(music_run_process.stdout)
163 | print(" * Error Output (stderr): ")
164 | print(music_run_process.stderr)
165 | print(" * Log file content: ")
166 | if os.path.isfile(log_path):
167 | with open(log_path, "r") as fs:
168 | log_lines = "\n".join(list(fs.readlines()))
169 | print(log_lines)
--------------------------------------------------------------------------------
/rsalor/rsa/rsa_solver.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from abc import ABC, abstractmethod
5 | from typing import Dict, Union
6 |
7 |
8 | # Abstract RSASolver class -----------------------------------------------------
class RSASolver(ABC):
    """
    Abstract container class for RSASolver: to compute RSA (Relative Solvent Accessibility) from a PDB file.

    Concrete solvers implement execute_solver(); run() wraps it with an optional on-disk cache
    handled by read() and write().

    usage:
        rsa_map = RSASolver('./soft/software_executable').run('./my_pdb.pdb')
    """

    # Constants ----------------------------------------------------------------
    COMMENT_CHAR = "#"

    # Constructor --------------------------------------------------------------
    def __init__(
            self,
            executable_path: Union[None, str]=None,
            verbose: bool=False,
        ):
        """Store the solver configuration.

        args:
            executable_path (None | str): path to the solver executable (None if not provided)
            verbose (bool): set True to print progress messages
        """
        self.executable_path = executable_path
        self.verbose = verbose

    # Methods ------------------------------------------------------------------
    def __str__(self) -> str:
        return "RSASolver['AbstractSolver']"

    def run(
            self,
            pdb_path: str,
            rsa_cache_path: Union[None, str]=None,
        ) -> Dict[str, float]:
        """Compute RSA by running the solver or using the cached file.

        args:
            pdb_path (str): path to PDB file
            rsa_cache_path (str): path to/from where save/read RSA (if file exists, solver execution will be skipped and output directly read from file)

        output:
            {resid: str => RSA: float} (such as {'A13': 48.57, ...})
        """

        # Parse RSA if cache file exists
        if rsa_cache_path is not None and os.path.isfile(rsa_cache_path):
            if self.verbose:
                print(f" * read RSA values from rsa_cache_path '{rsa_cache_path}'")
            rsa_map = self.read(rsa_cache_path)
            return rsa_map

        # PDB file Guardians
        assert os.path.isfile(pdb_path), f"ERROR in {self}.execute_solver(): pdb_path='{pdb_path}' file does not exists."
        assert pdb_path.endswith(".pdb"), f"ERROR in {self}.execute_solver(): pdb_path='{pdb_path}' should be a '.pdb' file."

        # Run solver
        if self.verbose:
            print(f" * execute RSA solver: {self}")
        rsa_map = self.execute_solver(pdb_path)

        # Write RSA map
        if rsa_cache_path is not None and not os.path.isfile(rsa_cache_path):
            if self.verbose:
                print(f" * save RSA values in rsa_cache_path '{rsa_cache_path}'")
            self.write(rsa_cache_path, rsa_map)

        # Return
        return rsa_map

    @abstractmethod
    def execute_solver(self, pdb_path: str) -> Dict[str, float]:
        """Compute RSA by running the solver.

        args:
            pdb_path (str): path to PDB file

        output:
            {resid: str => RSA: float} (such as {'A13': 48.57, ...})
        """
        pass

    def read(self, file_path: str) -> Dict[str, float]:
        """Read rsa_map file and return RSA mapping: {resid: str => RSA: float}.

        Each data line holds whitespace-separated fields: the residue id first, the RSA value second
        (any further fields are ignored). Blank lines and lines starting with COMMENT_CHAR are skipped.

        raises:
            ValueError: if a data line has fewer than two fields or its RSA value is not a number
        """

        # Guardians
        assert os.path.isfile(file_path), f"ERROR in {self}.read(): file file_path='{file_path}' does not exist."

        # Parse
        rsa_map: Dict[str, float] = {}
        with open(file_path, "r") as fs:
            for raw_line in fs:

                # Strip first so that whitespace-only lines are treated as blank instead of crashing on indexing
                line = raw_line.strip()
                if not line or line.startswith(self.COMMENT_CHAR):
                    continue

                fields = line.split()
                if len(fields) < 2:
                    raise ValueError(f"ERROR in {self}.read(): invalid line '{line}' in file_path='{file_path}' (expected '<resid> <rsa>').")
                rsa_map[fields[0]] = float(fields[1])

        # Guardian and return
        assert len(rsa_map) > 0, f"ERROR in {self}.read(): No RSA data found in file_path='{file_path}'."
        return rsa_map

    def write(self, file_path: str, rsa_map: Dict[str, float]) -> "RSASolver":
        """Write rsa_map to a file (one '<resid> <rsa>' pair per line) and return self."""

        # Guardians
        file_path = os.path.abspath(file_path)
        assert os.path.isdir(os.path.dirname(file_path)), f"ERROR in {self}.write(): directory of file_path='{file_path}' does not exists."

        # Stringify
        rsa_map_str = "\n".join(f"{resid} {rsa}" for resid, rsa in rsa_map.items())

        # Write
        with open(file_path, "w") as fs:
            fs.write(rsa_map_str)

        return self
--------------------------------------------------------------------------------
/rsalor/sequence/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.sequence.amino_acid import AminoAcid
2 | from rsalor.sequence.mutation import Mutation
3 | from rsalor.sequence.sequence import Sequence
4 | from rsalor.sequence.fasta_reader import FastaReader, FastaStream
5 | from rsalor.sequence.pairwise_alignment import PairwiseAlignment
6 |
--------------------------------------------------------------------------------
/rsalor/sequence/amino_acid.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | from typing import List, Dict
4 |
5 | # AminoAcid --------------------------------------------------------------------
6 | class AminoAcid:
7 | """Container class for the 20 standard proteogenic amino acids.
8 | * Manages mapping between amino acids: id, one-letter-code and three-letter-code
9 | * Manages non-standard amino acids three-letters-codes and their corresponding standard AA
10 |
11 | usage:
12 |
13 | alanine_id: int = AminoAcid.ONE_2_ID['A'] \n
14 | one_letter_code_3: str = AminoAcid.ID_2_ONE[3] \n
15 | ala: AminoAcid= AminoAcid('A') \n
16 | non_standard_metionine: AminoAcid = AminoAcid.parse_three('MSE')
17 | """
18 |
19 | # Static Properties --------------------------------------------------------
20 |
21 | # Standard Amino Acids metadata
22 | _AA_LIST = [
23 | ( 0, "A", "ALA", "alanine"),
24 | ( 1, "C", "CYS", "cysteine"),
25 | ( 2, "D", "ASP", "aspartate"),
26 | ( 3, "E", "GLU", "glutamate"),
27 | ( 4, "F", "PHE", "phenylalanine"),
28 | ( 5, "G", "GLY", "glycine"),
29 | ( 6, "H", "HIS", "histidine"),
30 | ( 7, "I", "ILE", "isoleucine"),
31 | ( 8, "K", "LYS", "lysine"),
32 | ( 9, "L", "LEU", "leucine"),
33 | (10, "M", "MET", "methionine"),
34 | (11, "N", "ASN", "asparagine"),
35 | (12, "P", "PRO", "proline"),
36 | (13, "Q", "GLN", "glutamine"),
37 | (14, "R", "ARG", "arginine"),
38 | (15, "S", "SER", "serine"),
39 | (16, "T", "THR", "thréonine"),
40 | (17, "V", "VAL", "valine"),
41 | (18, "W", "TRP", "tryptophane"),
42 | (19, "Y", "TYR", "tyrosine"),
43 | ]
44 |
45 | # Gap and Unknown Amino Acid properties
46 | GAP_ID = 20
47 | GAP_ONE = "-"
48 | GAP_THREE = "GAP"
49 | GAP_NAME = "gap"
50 | UNK_ID = -1
51 | UNK_ONE = "X"
52 | UNK_THREE = "XXX"
53 | UNK_NAME = "unknown"
54 |
55 | # Non-Standard Amino Acids three-letter-codes mapped to closest Standard Amino Acids
56 | _NON_STANDARD_AAS = {
57 | # WARNING: We do not represent here ambiguous mappings like: "GLX" => "GLN" or "GLU"
58 | "4HT": "TRP", "CLG": "LYS", "HSE": "SER", "BIF": "PHE", "B3D": "ASP", "BB8": "PHE", "3MY": "TYR", "SNK": "HIS",
59 | "3CF": "PHE", "A5N": "ASN", "LED": "LEU", "TOX": "TRP", "CR5": "GLY", "ILM": "ILE", "0A9": "PHE", "DAS": "ASP",
60 | "NYS": "CYS", "73P": "LYS", "MSO": "MET", "IYR": "TYR", "PR9": "PRO", "R4K": "TRP", "L5P": "LYS", "31Q": "CYS",
61 | "OCY": "CYS", "BH2": "ASP", "XSN": "ASN", "SXE": "SER", "GMA": "GLU", "SEP": "SER", "CYD": "CYS", "YPZ": "TYR",
62 | "GPL": "LYS", "RVX": "SER", "YCM": "CYS", "SEL": "SER", "DNE": "LEU", "LEN": "LEU", "4FB": "PRO", "4OU": "PHE",
63 | "LGY": "LYS", "TTQ": "TRP", "DBB": "THR", "LBZ": "LYS", "QX7": "ALA", "H14": "PHE", "CIR": "ARG", "73O": "TYR",
64 | "EI4": "ARG", "LVN": "VAL", "SRZ": "SER", "55I": "PHE", "UF0": "SER", "YHA": "LYS", "QM8": "ALA", "TQQ": "TRP",
65 | "QIL": "ILE", "Q75": "MET", "11Q": "PRO", "A8E": "VAL", "DHV": "VAL", "3BY": "PRO", "2ZC": "SER", "T9E": "THR",
66 | "CSZ": "CYS", "5CS": "CYS", "KPI": "LYS", "0AH": "SER", "HSK": "HIS", "TH6": "THR", "ARO": "ARG", "E9V": "HIS",
67 | "UXQ": "PHE", "MHL": "LEU", "CAS": "CYS", "8RE": "LYS", "LLP": "LYS", "PTH": "TYR", "ORQ": "ARG", "73N": "ARG",
68 | "BTK": "LYS", "HVA": "VAL", "LMQ": "GLN", "FME": "MET", "XX1": "LYS", "I7F": "SER", "4N9": "PRO", "TYJ": "TYR",
69 | "BOR": "ARG", "HL2": "LEU", "73C": "SER", "0CS": "ALA", "AGM": "ARG", "CYW": "CYS", "ASL": "ASP", "I3D": "TRP",
70 | "NPH": "CYS", "JKH": "PRO", "QMB": "ALA", "XCN": "CYS", "PHI": "PHE", "NAL": "ALA", "LYZ": "LYS", "6M6": "CYS",
71 | "VAD": "VAL", "EXL": "TRP", "WFP": "PHE", "823": "ASN", "CLH": "LYS", "C6C": "CYS", "DCY": "CYS", "DPP": "ALA",
72 | "KHB": "LYS", "DNW": "ALA", "BUC": "CYS", "CSU": "CYS", "H5M": "PRO", "RXL": "VAL", "FOE": "CYS", "GHP": "GLY",
73 | "2KP": "LYS", "OMX": "TYR", "ZCL": "PHE", "MGG": "ARG", "DLS": "LYS", "30V": "CYS", "02K": "ALA", "DA2": "ARG",
74 | "TYY": "TYR", "HRG": "ARG", "PHL": "PHE", "PRJ": "PRO", "M2L": "LYS", "SUN": "SER", "TSY": "CYS", "PF5": "PHE",
75 | "4CF": "PHE", "1OP": "TYR", "CSB": "CYS", "POM": "PRO", "ELY": "LYS", "TRQ": "TRP", "BP5": "ALA", "5VV": "ASN",
76 | "6DN": "LYS", "MIS": "SER", "MLZ": "LYS", "EME": "GLU", "4J5": "ARG", "MPQ": "GLY", "LLO": "LYS", "FQA": "LYS",
77 | "PR7": "PRO", "NLW": "LEU", "OMY": "TYR", "5CT": "LYS", "PRK": "LYS", "DPQ": "TYR", "N0A": "PHE", "3QN": "LYS",
78 | "K5H": "CYS", "HNC": "CYS", "TYO": "TYR", "Q3P": "LYS", "BWV": "ARG", "4L0": "PRO", "ZAL": "ALA", "IAM": "ALA",
79 | "AGQ": "TYR", "07O": "CYS", "PCA": "GLN", "2MR": "ARG", "TRN": "TRP", "4AR": "ARG", "HLY": "LYS", "DHI": "HIS",
80 | "J2F": "TYR", "C3Y": "CYS", "GL3": "GLY", "BTR": "TRP", "OYL": "HIS", "IGL": "GLY", "2GX": "PHE", "8LJ": "PRO",
81 | "AYA": "ALA", "XYC": "ALA", "CY1": "CYS", "CGU": "GLU", "PM3": "PHE", "03Y": "CYS", "CE7": "ASN", "HSL": "SER",
82 | "BXT": "SER", "MHU": "PHE", "HOX": "PHE", "5GM": "ILE", "DVA": "VAL", "CYR": "CYS", "YOF": "TYR", "DDZ": "ALA",
83 | "4PQ": "TRP", "ECC": "GLN", "GHG": "GLN", "IPG": "GLY", "PPN": "PHE", "L3O": "LEU", "AEA": "CYS", "7N8": "PHE",
84 | "AHO": "ALA", "TBG": "VAL", "BFD": "ASP", "HPE": "PHE", "5MW": "LYS", "U2X": "TYR", "N10": "SER", "TGH": "TRP",
85 | "51T": "TYR", "DDE": "HIS", "DBZ": "ALA", "FF9": "LYS", "HTN": "ASN", "NVA": "VAL", "HS9": "HIS", "ACB": "ASP",
86 | "9KP": "LYS", "FTR": "TRP", "ALS": "ALA", "DYJ": "PRO", "RPI": "ARG", "FTY": "TYR", "TQZ": "CYS", "FVA": "VAL",
87 | "CS4": "CYS", "QVA": "CYS", "XPR": "PRO", "0QL": "CYS", "TCQ": "TYR", "OXX": "ASP", "ZZJ": "ALA", "LDH": "LYS",
88 | "3CT": "TYR", "H7V": "ALA", "4N7": "PRO", "PYA": "ALA", "WVL": "VAL", "DMK": "ASP", "EFC": "CYS", "0BN": "PHE",
89 | "MHO": "MET", "ECX": "CYS", "ESB": "TYR", "KGC": "LYS", "3WX": "PRO", "MBQ": "TYR", "ILX": "ILE", "DSG": "ASN",
90 | "P2Q": "TYR", "LSO": "LYS", "6CW": "TRP", "SDP": "SER", "MP8": "PRO", "HTR": "TRP", "B3S": "SER", "TYB": "TYR",
91 | "PAQ": "TYR", "HS8": "HIS", "RX9": "ILE", "DHA": "SER", "CHP": "GLY", "MMO": "ARG", "FCL": "PHE", "05O": "TYR",
92 | "ICY": "CYS", "DIV": "VAL", "N65": "LYS", "Q78": "PHE", "KCR": "LYS", "TY8": "TYR", "GVL": "SER", "MLL": "LEU",
93 | "DNP": "ALA", "5XU": "ALA", "O7D": "TRP", "NFA": "PHE", "DBY": "TYR", "QCS": "CYS", "ZYK": "PRO", "IIL": "ILE",
94 | "ABA": "ALA", "4AW": "TRP", "BSE": "SER", "LLY": "LYS", "4D4": "ARG", "MNL": "LEU", "FGL": "GLY", "SET": "SER",
95 | "MYN": "ARG", "C4R": "CYS", "CZZ": "CYS", "CZS": "ALA", "Y1V": "LEU", "CWR": "SER", "NBQ": "TYR", "KYQ": "LYS",
96 | "2TY": "TYR", "1PA": "PHE", "6V1": "CYS", "FGP": "SER", "BB9": "CYS", "AGT": "CYS", "CYG": "CYS", "VI3": "CYS",
97 | "PH6": "PRO", "NZH": "HIS", "DAB": "ALA", "B2A": "ALA", "6WK": "CYS", "PR4": "PRO", "7O5": "ALA", "OHS": "ASP",
98 | "3YM": "TYR", "Z3E": "THR", "NC1": "SER", "CAF": "CYS", "BPE": "CYS", "BB7": "CYS", "RE0": "TRP", "TSQ": "PHE",
99 | "4CY": "MET", "G5G": "LEU", "TDD": "LEU", "KCX": "LYS", "0AR": "ARG", "HSV": "HIS", "2ML": "LEU", "4PH": "PHE",
100 | "V44": "CYS", "IAS": "ASP", "FH7": "LYS", "PTM": "TYR", "SAR": "GLY", "SVX": "SER", "MEN": "ASN", "CS1": "CYS",
101 | "HOO": "HIS", "NYB": "CYS", "HMR": "ARG", "05N": "PRO", "V61": "PHE", "41H": "PHE", "BMT": "THR", "4HL": "TYR",
102 | "I2M": "ILE", "4N8": "PRO", "2RX": "SER", "CS3": "CYS", "MEA": "PHE", "B2F": "PHE", "CYF": "CYS", "GNC": "GLN",
103 | "4HJ": "SER", "CSJ": "CYS", "2SO": "HIS", "Q2E": "TRP", "CXM": "MET", "4WQ": "ALA", "5OW": "LYS", "TRX": "TRP",
104 | "B3Y": "TYR", "DAH": "PHE", "5PG": "GLY", "ESC": "MET", "DTY": "TYR", "CGA": "GLU", "TFW": "TRP", "SMF": "PHE",
105 | "S1H": "SER", "SAC": "SER", "QCI": "GLN", "CMT": "CYS", "TY2": "TYR", "0A8": "CYS", "OMH": "SER", "QPA": "CYS",
106 | "MK8": "LEU", "DLE": "LEU", "T0I": "TYR", "ALT": "ALA", "3X9": "CYS", "5CW": "TRP", "9E7": "LYS", "MGN": "GLN",
107 | "PBF": "PHE", "AEI": "THR", "TYI": "TYR", "SNN": "ASN", "74P": "LYS", "OHI": "HIS", "KST": "LYS", "SBL": "SER",
108 | "JJJ": "CYS", "JJL": "CYS", "2RA": "ALA", "DIL": "ILE", "02Y": "ALA", "CYJ": "LYS", "2HF": "HIS", "FC0": "PHE",
109 | "NLN": "LEU", "XW1": "ALA", "QMM": "GLN", "TOQ": "TRP", "WPA": "PHE", "TIH": "ALA", "NLB": "LEU", "BG1": "SER",
110 | "PTR": "TYR", "0WZ": "TYR", "ZYJ": "PRO", "SNC": "CYS", "BBC": "CYS", "B3E": "GLU", "4GJ": "CYS", "MSA": "GLY",
111 | "TPO": "THR", "HIQ": "HIS", "PHA": "PHE", "THC": "THR", "JJK": "CYS", "API": "LYS", "TY5": "TYR", "LPD": "PRO",
112 | "MND": "ASN", "PRV": "GLY", "M3L": "LYS", "HR7": "ARG", "86N": "GLU", "DSN": "SER", "5R5": "SER", "IC0": "GLY",
113 | "ARM": "ARG", "4AK": "LYS", "HT7": "TRP", "E9M": "TRP", "4DP": "TRP", "IML": "ILE", "BCS": "CYS", "7OZ": "ALA",
114 | "2MT": "PRO", "GLZ": "GLY", "0E5": "THR", "U3X": "PHE", "HYP": "PRO", "M0H": "CYS", "7XC": "PHE", "AZK": "LYS",
115 | "AHB": "ASN", "NCB": "ALA", "ASA": "ASP", "TPL": "TRP", "0TD": "ASP", "HTI": "CYS", "LRK": "LYS", "ME0": "MET",
116 | "143": "CYS", "FY2": "TYR", "1TY": "TYR", "QPH": "PHE", "F2F": "PHE", "3PX": "PRO", "PLJ": "PRO", "N9P": "ALA",
117 | "3ZH": "HIS", "C5C": "CYS", "PFF": "PHE", "NEP": "HIS", "CSA": "CYS", "4J4": "CYS", "O7G": "VAL", "TTS": "TYR",
118 | "KFP": "LYS", "FZN": "LYS", "TYN": "TYR", "AA4": "ALA", "LYX": "LYS", "HP9": "PHE", "TH5": "THR", "D2T": "ASP",
119 | "MED": "MET", "TRW": "TRP", "HLU": "LEU", "CSO": "CYS", "23F": "PHE", "PG9": "GLY", "EJA": "CYS", "RE3": "TRP",
120 | "66D": "ILE", "4OG": "TRP", "MSE": "MET", "MDF": "TYR", "DBU": "THR", "SEN": "SER", "Y57": "LYS", "XA6": "PHE",
121 | "M2S": "MET", "FLT": "TYR", "GME": "GLU", "LE1": "VAL", "FY3": "TYR", "OZW": "PHE", "FP9": "PRO", "FHL": "LYS",
122 | "MLE": "LEU", "DAR": "ARG", "BHD": "ASP", "LA2": "LYS", "SLZ": "LYS", "CSX": "CYS", "OCS": "CYS", "DMH": "ASN",
123 | "2CO": "CYS", "NLE": "LEU", "LME": "GLU", "HIC": "HIS", "ZBZ": "CYS", "MYK": "LYS", "2JG": "SER", "ORN": "ALA",
124 | "YTF": "GLN", "1AC": "ALA", "OLD": "HIS", "B2I": "ILE", "HZP": "PRO", "4AF": "PHE", "OMT": "MET", "CSP": "CYS",
125 | "APK": "LYS", "DPR": "PRO", "CY0": "CYS", "5T3": "LYS", "CY3": "CYS", "3GL": "GLU", "4II": "PHE", "0AK": "ASP",
126 | "ALC": "ALA", "LP6": "LYS", "HIP": "HIS", "60F": "CYS", "CML": "CYS", "CYQ": "CYS", "NA8": "ALA", "MH6": "SER",
127 | "GFT": "SER", "WLU": "LEU", "AZH": "ALA", "KBE": "LYS", "LCK": "LYS", "LAY": "LEU", "0LF": "PRO", "KKD": "ASP",
128 | "K7K": "SER", "CSR": "CYS", "B3K": "LYS", "OSE": "SER", "F2Y": "TYR", "NMM": "ARG", "P1L": "CYS", "PRS": "PRO",
129 | "OBS": "LYS", "ZDJ": "TYR", "BYR": "TYR", "HY3": "PRO", "ASB": "ASP", "NLY": "GLY", "0A1": "TYR", "DPL": "PRO",
130 | "SCS": "CYS", "I4G": "GLY", "6CV": "ALA", "HIA": "HIS", "LYN": "LYS", "54C": "TRP", "FGA": "GLU", "B27": "THR",
131 | "TYE": "TYR", "DTH": "THR", "PSH": "HIS", "EXA": "LYS", "BLE": "LEU", "P9S": "CYS", "23P": "ALA", "1TQ": "TRP",
132 | "RVJ": "ALA", "ALO": "THR", "FL6": "ASP", "4LZ": "TYR", "TMD": "THR", "FHO": "LYS", "0FL": "ALA", "AN6": "LEU",
133 | "4OV": "SER", "432": "SER", "SCH": "CYS", "DGL": "GLU", "2TL": "THR", "TPQ": "TYR", "3AH": "HIS", "CSD": "CYS",
134 | "PR3": "CYS", "IZO": "MET", "DV9": "GLU", "41Q": "ASN", "DI7": "TYR", "34E": "VAL", "MHS": "HIS", "GGL": "GLU",
135 | "ALY": "LYS", "O6H": "TRP", "8JB": "CYS", "SVV": "SER", "KOR": "MET", "PYX": "CYS", "6CL": "LYS", "WRP": "TRP",
136 | "SCY": "CYS", "G1X": "TYR", "2KK": "LYS", "TYQ": "TYR", "MIR": "SER", "ALN": "ALA", "CMH": "CYS", "KPY": "LYS",
137 | "SVZ": "SER", "NMC": "GLY", "RGL": "ARG", "SME": "MET", "DAL": "ALA", "DTR": "TRP", "PEC": "CYS", "SGB": "SER",
138 | "NLO": "LEU", "AHP": "ALA", "SLL": "LYS", "TRF": "TRP", "CME": "CYS", "SEE": "SER", "MME": "MET", "DYA": "ASP",
139 | "33X": "ALA", "LYF": "LYS", "CZ2": "CYS", "TRO": "TRP", "DPN": "PHE", "IB9": "TYR", "POK": "ARG", "LET": "LYS",
140 | "CCS": "CYS", "DGN": "GLN", "NIY": "TYR", "E9C": "TYR", "SEB": "SER", "AIB": "ALA", "OAS": "SER", "V7T": "LYS",
141 | "K5L": "SER", "TYS": "TYR", "FIO": "ARG", "B2V": "VAL", "GLJ": "GLU", "JLP": "LYS", "MVA": "VAL", "0Y8": "PRO",
142 | "OTH": "THR", "00C": "CYS", "0EA": "TYR", "F7W": "TRP", "LEI": "VAL", "UMA": "ALA", "OLT": "THR", "4KY": "PRO",
143 | "MCS": "CYS", "TNQ": "TRP", "HIX": "ALA", "C1X": "LYS", "PAT": "TRP", "T8L": "THR", "DM0": "LYS", "CG6": "CYS",
144 | "KPF": "LYS", "DYS": "CYS", "BB6": "CYS", "LAL": "ALA", "DLY": "LYS", "DJD": "PHE", "LTU": "TRP", "TYT": "TYR",
145 | "VPV": "LYS", "D11": "THR", "LEF": "LEU", "1X6": "SER", "ML3": "LYS", "MAA": "ALA", "7ID": "ASP", "AAR": "ARG",
146 | "NZC": "THR", "R1A": "CYS", "CGV": "CYS", "D3P": "GLY", "TIS": "SER", "LYR": "LYS", "4IN": "TRP", "CY4": "CYS",
147 | "0AF": "TRP", "TLY": "LYS", "SVA": "SER", "4HH": "SER", "HQA": "ALA", "PHD": "ASP", "KYN": "TRP", "4FW": "TRP",
148 | "VHF": "GLU", "CTH": "THR", "B3X": "ASN", "MTY": "TYR", "MLY": "LYS", "SMC": "CYS", "TS9": "ILE", "PXU": "PRO",
149 | "DSE": "SER", "P3Q": "TYR", "BCX": "CYS", "FAK": "LYS", "SVY": "SER", "CSS": "CYS", "FDL": "LYS", "2LT": "TYR",
150 | "N80": "PRO", "B3A": "ALA", "LYO": "LYS", "VR0": "ARG", "YTH": "THR",
151 | }
152 |
153 | # Mappings to aa metadata
154 | _ID_MAP: Dict[int, tuple] = {aa[0]: aa for aa in _AA_LIST}
155 | _ONE_MAP: Dict[str, tuple] = {aa[1]: aa for aa in _AA_LIST}
156 | _THREE_MAP: Dict[str, tuple] = {aa[2]: aa for aa in _AA_LIST}
157 | #_AA_MAP = _ID_MAP | _ONE_MAP | _THREE_MAP
158 |
159 | # Translation tables
160 | THREE_2_ONE: Dict[str, str] = {aa[2]: aa[1] for aa in _AA_LIST}
161 | ONE_2_THREE: Dict[str, str] = {aa[1]: aa[2] for aa in _AA_LIST}
162 | ONE_2_ID: Dict[str, int] = {aa[1]: aa[0] for aa in _AA_LIST}
163 | ID_2_ONE: Dict[int, str] = {aa[0]: aa[1] for aa in _AA_LIST}
164 |
165 | # Construcors --------------------------------------------------------------
166 | def __init__(self, aa_one: str) -> "AminoAcid":
167 | """Only accepts standard Amino Acid one-letter-codes."""
168 | assert aa_one in self._ONE_MAP, f"ERROR in AminoAcid('{aa_one}'): invalid amino acid one-letter-code."
169 | aa_metadata = self._ONE_MAP[aa_one]
170 | self.id: int = aa_metadata[0]
171 | self.one: str = aa_metadata[1]
172 | self.three: str = aa_metadata[2]
173 | self.three_standard: str = aa_metadata[2]
174 | self.name: str = aa_metadata[3]
175 |
176 | @classmethod
177 | def parse_three(cls, aa_three: int) -> "AminoAcid":
178 | """Parse an AminoAcid from its three-letter-code (can handle non-standard AAs and mapping to corresponding standard AA)."""
179 |
180 | # Standard case
181 | aa_one = cls.THREE_2_ONE.get(aa_three, None)
182 | if aa_one is not None:
183 | return AminoAcid(aa_one)
184 |
185 | # Non-standard case
186 | aa_three_standard = cls._NON_STANDARD_AAS.get(aa_three, None)
187 | if aa_three_standard is not None:
188 | aa_one = cls.THREE_2_ONE[aa_three_standard]
189 | aa = AminoAcid(aa_one)
190 | aa.three = aa_three
191 | return aa
192 |
193 | # Unknown case
194 | return cls.get_unknown()
195 |
196 | @classmethod
197 | def get_all(cls) -> List["AminoAcid"]:
198 | """Get the list of all 20 standard AminoAcids."""
199 | return [AminoAcid(aa_metadata[1]) for aa_metadata in cls._AA_LIST]
200 |
201 | @classmethod
202 | def get_unknown(cls) -> "AminoAcid":
203 | """Return an unknown AminoAcid."""
204 | aa = AminoAcid("A")
205 | aa.id = AminoAcid.UNK_ID
206 | aa.one = AminoAcid.UNK_ONE
207 | aa.three = AminoAcid.UNK_THREE
208 | aa.three_standard = None
209 | aa.name = AminoAcid.UNK_NAME
210 | return aa
211 |
212 | @classmethod
213 | def get_gap(cls) -> "AminoAcid":
214 | """Return a gap 'AminoAcid'."""
215 | aa = AminoAcid("A")
216 | aa.id = AminoAcid.GAP_ID
217 | aa.one = AminoAcid.GAP_ONE
218 | aa.three = AminoAcid.GAP_THREE
219 | aa.three_standard = None
220 | aa.name = AminoAcid.GAP_NAME
221 | return aa
222 |
223 | # Base properties ----------------------------------------------------------
224 | def __str__(self) -> str:
225 | if self.is_standard:
226 | return f"AminoAcid('{self.one}', '{self.three}', id={self.id})"
227 | else:
228 | return f"AminoAcid('{self.one}', '{self.three}' (std='{self.three_standard}'), id={self.id})"
229 |
230 | def is_gap(self) -> bool:
231 | self.id == AminoAcid.GAP_ID
232 |
233 | def is_unknown(self) -> bool:
234 | return self.id == AminoAcid.UNK_ID
235 |
236 | def is_aminoacid(self) -> bool:
237 | return not self.is_gap()
238 |
239 | def is_standard(self) -> bool:
240 | return self.three == self.three_standard
241 |
242 | # Class methods ------------------------------------------------------------
243 | @classmethod
244 | def id_exists(cls, id: int) -> bool:
245 | """Return if 'id' corresponds to the id of a standard Amino Acid."""
246 | return id in cls._ID_MAP
247 |
248 | @classmethod
249 | def one_exists(cls, aa_one: str) -> bool:
250 | """Return if 'aa_one' corresponds to the one-letter-code of a standard Amino Acid."""
251 | return aa_one in cls._ONE_MAP
252 |
253 | @classmethod
254 | def three_exists(cls, aa_three: str) -> bool:
255 | """Return if 'aa_three' corresponds to the three-letter-code of a standard Amino Acid."""
256 | return aa_three in cls._THREE_MAP
257 |
--------------------------------------------------------------------------------
/rsalor/sequence/fasta_reader.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import List, Union
5 | from rsalor.sequence import Sequence
6 |
7 |
8 | # FastaReader ------------------------------------------------------------------
class FastaReader:
    """High level FASTA file reader."""

    @classmethod
    def read_first_sequence(cls, fasta_path: str) -> Sequence:
        """Read first sequence from a FASTA file."""
        fasta_stream = FastaStream(fasta_path)
        try:
            return fasta_stream.get_next()
        finally:
            # Release the file handle even if reading the sequence fails
            fasta_stream.close()

    @classmethod
    def read_sequences(cls, fasta_path: str) -> List[Sequence]:
        """Read all sequences from a FASTA file."""
        fasta_stream = FastaStream(fasta_path)
        try:
            return fasta_stream.get_all()
        finally:
            # Release the file handle even if reading a sequence fails
            fasta_stream.close()

    @classmethod
    def count_sequences(cls, fasta_path: str) -> int:
        """Count the number of sequences in a FASTA file (just count the lines starting with the header character)."""

        # Guardians
        assert os.path.isfile(fasta_path), f"ERROR in FastaReader.count_sequences(): fasta_path='{fasta_path}' does not exists."
        assert fasta_path.split(".")[-1] in FastaStream.ACCEPTED_EXTENTIONS, f"ERROR in FastaReader.count_sequences(): fasta_path='{fasta_path}' should end with {FastaStream.ACCEPTED_EXTENTIONS}."

        # Count (the file is streamed line by line, never loaded as a whole)
        header_start_char = Sequence.HEADER_START_CHAR
        with open(fasta_path) as fs:
            return sum(1 for line in fs if line.startswith(header_start_char))
46 |
47 |
48 | # FastaStream ------------------------------------------------------------------
class FastaStream:
    """Low level class to stream sequences from a FASTA file (one by one to avoid loading the whole file in RAM).

    WARNING: Please use with caution and do not forget to '.close()' (or use it in a 'with' statement).

    usage:

    fasta_stream = FastaStream('./fasta/msa1.fasta') \n
    sequence1 = fasta_stream.get_next() \n
    sequence2 = fasta_stream.get_next() \n
    fasta_stream.close()

    or equivalently:

    with FastaStream('./fasta/msa1.fasta') as fasta_stream: \n
        sequence1 = fasta_stream.get_next()
    """

    # Constants ----------------------------------------------------------------
    ACCEPTED_EXTENTIONS = ["fasta", "a2m"]
    HEADER_START_CHAR = Sequence.HEADER_START_CHAR

    # Constructor --------------------------------------------------------------
    def __init__(self, fasta_path: str):
        """Open 'fasta_path' and position the stream on the header of its first sequence.

        Fails with an AssertionError if the file does not exist, has an unexpected extension,
        is empty or does not start with a FASTA header line.
        """

        # Guardians
        assert os.path.isfile(fasta_path), f"ERROR in FastaStream(): fasta_path='{fasta_path}' does not exists."
        assert fasta_path.split(".")[-1] in self.ACCEPTED_EXTENTIONS, f"ERROR in FastaStream(): fasta_path='{fasta_path}' should end with {self.ACCEPTED_EXTENTIONS}."

        # Init
        self.fasta_path = fasta_path
        self.file = open(fasta_path, "r")
        self.current_id = -1  # index of the last sequence returned by get_next() (-1: none yet)
        self.current_line = self._next_line()

        # First sequence sanity check (release the file handle before propagating a failure)
        try:
            assert self.current_line is not None, f"ERROR in FastaStream(): no sequences found in file '{fasta_path}'."
            assert self._is_current_line_header(), f"ERROR in FastaStream(): first line of file '{fasta_path}' sould be a fasta header (thus start with '{self.HEADER_START_CHAR}').\nline='{self.current_line}'"
        except AssertionError:
            self.file.close()
            raise

    @property
    def is_open(self) -> bool:
        """Return if current file/stream is still open"""
        return self.current_line is not None

    # Methods ------------------------------------------------------------------
    def __enter__(self) -> "FastaStream":
        """Allow usage in a 'with' statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the file when leaving the 'with' block (exceptions are not suppressed)."""
        self.close()

    def close(self) -> None:
        """Close file."""
        self.file.close()
        self.current_line = None

    def get_next(self) -> Union[None, Sequence]:
        """Get next Fasta sequence (None when the stream is exhausted or closed)."""
        if self.current_line is None:
            return None
        self.current_id += 1

        # The current line is the header of the sequence to return
        header = self.current_line.removesuffix("\n")

        # Gather sequence lines until the next header or the end of the file
        seq_arr = []
        self.current_line = self._next_line()
        while self.current_line:
            if self._is_current_line_header():
                break
            seq_arr.append(self.current_line.removesuffix("\n"))
            self.current_line = self._next_line()

        seq = "".join(seq_arr)
        return Sequence(header, seq)

    def get_all(self) -> List[Sequence]:
        """Get all remaining Fasta sequences."""
        fasta_sequence_list = []
        fasta_sequence = self.get_next()
        while fasta_sequence is not None:
            fasta_sequence_list.append(fasta_sequence)
            fasta_sequence = self.get_next()
        return fasta_sequence_list

    # Dependencies -------------------------------------------------------------
    def _next_line(self) -> Union[None, str]:
        """Read the next line; at the end of the file, close the stream and return None."""
        line = self.file.readline()
        if line == "":
            self.close()
            return None
        return line

    def _is_current_line_header(self) -> bool:
        """Return if the current line starts with the FASTA header character."""
        return self.current_line.startswith(Sequence.HEADER_START_CHAR)
130 |
--------------------------------------------------------------------------------
/rsalor/sequence/mutation.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | from rsalor.utils import is_convertable_to
4 | from rsalor.sequence import AminoAcid
5 |
6 | # Mutation ---------------------------------------------------------------------
class Mutation:
    """Container class for a single missense/synonymous mutation on a protein (FASTA) sequence.

    NOTE: Use FASTA residue position convention: so residue position is an integer and starts at 1.
    NOTE: Trivial mutations are accepted (like 'A14A').

    usage:
        mutation: Mutation = Mutation('A14G')
    """

    # Constructor --------------------------------------------------------------
    def __init__(self, mutation_str: str):
        """Parse 'mutation_str' formatted as '<wild-type aa><position><mutant aa>' (such as 'A14G').

        Fails with an AssertionError if the string is too short, if one of the amino acids is not
        a standard one-letter-code or if the position is not a strictly positive integer.
        """

        # Unpack and guardians
        assert len(mutation_str) >= 3, f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': should be of length 3 or more."
        wt_aa, position, mt_aa = mutation_str[0], mutation_str[1:-1], mutation_str[-1]
        assert AminoAcid.one_exists(wt_aa), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': wild-type amino acid is incorrect."
        assert AminoAcid.one_exists(mt_aa), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': mutant amino acid is incorrect."
        assert is_convertable_to(position, int), f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': position must be a stricktly positive integer."
        position = int(position)
        assert position > 0, f"ERROR in Mutation(): invalid mutation_str='{mutation_str}': position must be a stricktly positive integer."

        # Set
        self.wt_aa: AminoAcid = AminoAcid(wt_aa)
        self.position: int = position
        self.mt_aa: AminoAcid = AminoAcid(mt_aa)

    # Methods ------------------------------------------------------------------
    def __str__(self) -> str:
        return f"{self.wt_aa.one}{self.position}{self.mt_aa.one}"

    def __int__(self) -> int:
        """Return unique integer code for each mutation."""
        return self.position*10000 + self.wt_aa.id*100 + self.mt_aa.id

    def is_trivial(self) -> bool:
        """Return if mutation is trivial (like 'A14A')."""
        # Compare ids: wt_aa and mt_aa are two distinct objects, so comparing the objects
        # themselves would fall back to identity and never report a trivial mutation.
        return self.wt_aa.id == self.mt_aa.id
45 |
--------------------------------------------------------------------------------
/rsalor/sequence/pairwise_alignment.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import List, Tuple, Dict, Union
5 | from Bio.Align import PairwiseAligner
6 | from rsalor.sequence import Sequence
7 |
8 | # PairwiseAlignment ------------------------------------------------------------
class PairwiseAlignment:
    """Class to perform pairwise alignments based on sequence identity.

    The aim is to reconcile slightly different inputs of the same protein sequence
    that may contain small incoherences such as missing residues, for instance:
    * the SEQRES and ATOM lines of a PDB,
    * a sequence extracted from a PDB and a sequence from an MSA
      (the MSA or the PDB could cover a different range of the sequence).

    NOTE: Put the sequence which is expected to contain the least gaps first.

    usage:

    seq1 = Sequence("msa_seq", "HHALYDYEARTK") \n
    seq2 = Sequence("pdb_seq", "ALYDYEART") \n
    align = PairwiseAlignment(seq1, seq2) \n
    align.show() \n
    seq1_to_seq2_id_mapping = align.get_mapping()
    """

    # Constants ----------------------------------------------------------------
    GAP_CHAR = "-"
    MATCH_CHAR = "|"
    MISMATCH_CHAR = "x"

    # Constructor --------------------------------------------------------------
    def __init__(
            self,
            sequence1: Sequence,
            sequence2: Sequence,
            match_score: float=1.0,
            mismatch_score: float=-3.0,
            open_gap_score: float=-2.5,
            extend_gap_score: float=-2.0,
            tail_gap_score: float=-2.0,
            query_insertion_multiplier: float=3.0,
        ):
        """Align sequence1 (target) with sequence2 (query) and compute alignment statistics.

        arguments:
        sequence1 (Sequence): first sequence, passed as target to the aligner
        sequence2 (Sequence): second sequence, passed as query to the aligner
        match_score (float): score of a matching position
        mismatch_score (float): score of a mismatching position
        open_gap_score (float): score for opening an internal gap
        extend_gap_score (float): score for extending an internal gap
        tail_gap_score (float): score for opening/extending a gap at either end
        query_insertion_multiplier (float): factor applied to the internal gap scores on the query side

        raises:
        ValueError: if one of the sequences is empty or if the alignment has no matching position
        """

        # Length Guardians
        if len(sequence1) == 0 or len(sequence2) == 0:
            print(f" * sequence1: {sequence1}")
            print(f" * sequence2: {sequence2}")
            raise ValueError("ERROR in PairwiseAlignment(): input target or query sequence can not be of length 0.")

        # Init base properties
        self.sequence1 = sequence1
        self.sequence2 = sequence2
        self.len1 = len(sequence1)
        self.len2 = len(sequence2)
        self.len_min = min(self.len1, self.len2)
        self.len_max = max(self.len1, self.len2)
        self.len_ratio = self.len_min / self.len_max

        # Init aligner
        self.aligner = PairwiseAligner()
        self.aligner.mode = 'global'
        self.aligner.match_score = match_score
        self.aligner.mismatch_score = mismatch_score
        self.aligner.target_internal_open_gap_score = open_gap_score
        self.aligner.target_internal_extend_gap_score = extend_gap_score
        self.aligner.target_right_open_gap_score = tail_gap_score
        self.aligner.target_right_extend_gap_score = tail_gap_score
        self.aligner.target_left_open_gap_score = tail_gap_score
        self.aligner.target_left_extend_gap_score = tail_gap_score
        self.aligner.query_internal_open_gap_score = open_gap_score * query_insertion_multiplier
        self.aligner.query_internal_extend_gap_score = extend_gap_score * query_insertion_multiplier
        self.aligner.query_left_open_gap_score = tail_gap_score
        self.aligner.query_left_extend_gap_score = tail_gap_score
        self.aligner.query_right_open_gap_score = tail_gap_score
        self.aligner.query_right_extend_gap_score = tail_gap_score

        # Align
        alignments = self.aligner.align(self.sequence1.sequence, self.sequence2.sequence)
        try: # For Biopython versions 1.80 and later
            best_alignment = alignments[0]
            self.align1: str = best_alignment[0]
            self.align2: str = best_alignment[1]
        except Exception: # For legacy Biopython versions (not a bare except: Ctrl-C must still interrupt)
            alignment_str_list = str(alignments[0]).split()
            self.align1: str = alignment_str_list[0]
            self.align2: str = alignment_str_list[2]
        self.score: float = alignments.score

        # Alignment properties: classify each aligned column as gap, match or mismatch
        self.match: int = 0
        self.gap: int = 0
        self.mismatch: int = 0
        comparator_list = []
        for aa1, aa2 in zip(self.align1, self.align2):
            if aa1 == self.GAP_CHAR or aa2 == self.GAP_CHAR:
                self.gap += 1
                comparator_list.append(self.GAP_CHAR)
            elif aa1 == aa2:
                self.match += 1
                comparator_list.append(self.MATCH_CHAR)
            else:
                self.mismatch += 1
                comparator_list.append(self.MISMATCH_CHAR)
        self.comparator = "".join(comparator_list)
        self.match_ratio: float = self.match / len(self)
        self.gap_ratio: float = self.gap / len(self)
        self.mismatch_ratio: float = self.mismatch / len(self)

        # Failed alignment error
        if self.match == 0:
            print("PairwiseAlignment(): failed to align sequences: zero matching positions is the alignemnt.")
            print(f" * sequence1: {sequence1}")
            print(f" * sequence2: {sequence2}")
            raise ValueError("ERROR in PairwiseAlignment(): alignment failed.")

        # Count gap types
        self.left_gap, self.right_gap = _count_tail_characters(self.comparator, self.GAP_CHAR)
        self.tail_gap: int = self.left_gap + self.right_gap
        self.internal_gap: int = self.gap - self.tail_gap

        # Count gap types by sequence
        self.gap1 = len(self) - self.len1
        self.gap2 = len(self) - self.len2
        self.left_gap1, self.right_gap1 = _count_tail_characters(self.align1, self.GAP_CHAR)
        self.tail_gap1: int = self.left_gap1 + self.right_gap1
        self.internal_gap1: int = self.gap1 - self.tail_gap1
        self.left_gap2, self.right_gap2 = _count_tail_characters(self.align2, self.GAP_CHAR)
        self.tail_gap2: int = self.left_gap2 + self.right_gap2
        self.internal_gap2: int = self.gap2 - self.tail_gap2

        # Some final measures (self.match > 0 is guaranteed here, so no division by zero)
        self.sequence_identity: float = self.match / (self.match + self.mismatch) # excluding gapped positions
        self.coverage1: float = (self.match + self.mismatch) / self.len1
        self.coverage2: float = (self.match + self.mismatch) / self.len2
        self.coverage: float = (self.match + self.mismatch) / len(self)

    # Base Properties ----------------------------------------------------------
    def __len__(self) -> int:
        """Return the length of the alignment (number of aligned columns)."""
        return len(self.align1)

    def __str__(self) -> str:
        return f"PairwiseAlignment('{self.sequence1.name}' vs. '{self.sequence2.name}', l={len(self)}, ({self.match} |, {self.gap} -, {self.mismatch} x))"

    def show(self, n_lines: int=120, only_critical_chunks: bool=False) -> "PairwiseAlignment":
        """Print the alignment by chunks of n_lines columns and return self.

        arguments:
        n_lines (int): number of alignment columns printed per chunk (must be > 0)
        only_critical_chunks (bool): if True, print only chunks containing a mismatch or a gap
            in the second aligned sequence, and highlight those characters with ANSI red
        """
        assert n_lines > 0, f"ERROR in {self}.show(): n_lines={n_lines} should be > 0."
        print(self)
        l = len(self)
        i = 0
        while i < l:
            range_line = f"{i+1} - {min(i+n_lines, l)}"
            ali1_line = self.align1[i:i+n_lines]
            comp_line = self.comparator[i:i+n_lines]
            ali2_line = self.align2[i:i+n_lines]
            if only_critical_chunks:
                comp_line = comp_line.replace(self.MISMATCH_CHAR, f"\033[91m{self.MISMATCH_CHAR}\033[0m")
                ali2_line = ali2_line.replace(self.GAP_CHAR, f"\033[91m{self.GAP_CHAR}\033[0m")
            # The highlighted lines still contain the raw characters, so the membership tests stay valid
            if not only_critical_chunks or self.MISMATCH_CHAR in comp_line or self.GAP_CHAR in ali2_line:
                print(range_line)
                print(ali1_line)
                print(comp_line)
                print(ali2_line)
            i += n_lines
        return self

    # Methods ------------------------------------------------------------------
    def write(self, save_path: str) -> "PairwiseAlignment":
        """Save both aligned sequences to a '.fasta' file and return self."""
        save_path = os.path.abspath(save_path)
        assert save_path.endswith(".fasta"), f"ERROR in {self}.write(): save_path='{save_path}' sould be a '.fasta' file."
        assert os.path.isdir(os.path.dirname(save_path)), f"ERROR in {self}.write(): directory of save_path='{save_path}' does not exists."
        align_str = f">{self.sequence1.name}\n{self.align1}\n>{self.sequence2.name}\n{self.align2}\n"
        with open(save_path, "w") as fs:
            fs.write(align_str)
        return self

    def get_mapping(
            self,
            ids1: Union[None, List[Union[str, int]]]=None,
            ids2: Union[None, List[Union[str, int]]]=None,
            reversed: bool=False,
        ) -> Dict[Union[str, int], Union[str, int]]:
        """
        Return mapping of the alignment between residues of seq1 and seq2 that share a column.
        Columns where either sequence has a gap are left out of the mapping.
        * By default the ids are just consecutive integers starting at 1.

        args:
        ids1: overwrite ids for seq1 (default is [1, 2, 3, ...])
        ids2: overwrite ids for seq2 (default is [1, 2, 3, ...])
        reversed: if True, give mapping from seq2 to seq1
        """

        # Init ids
        if ids1 is None:
            ids1 = list(range(1, len(self.sequence1.sequence)+1))
        else:
            assert len(ids1) == len(self.sequence1), f"ERROR in {self}.get_mapping(): length of ids1={len(ids1)} does not match length of sequence1={len(self.sequence1)}."
        if ids2 is None:
            ids2 = list(range(1, len(self.sequence2.sequence)+1))
        else:
            assert len(ids2) == len(self.sequence2), f"ERROR in {self}.get_mapping(): length of ids2={len(ids2)} does not match length of sequence2={len(self.sequence2)}."

        # Manage reversed
        align1, align2 = self.align1, self.align2
        if reversed:
            align1, align2 = align2, align1
            ids1, ids2 = ids2, ids1

        # Generate mapping: i1 / i2 count the residues consumed so far in each sequence
        mapping = {}
        i1, i2 = 0, 0
        for aa1, aa2 in zip(align1, align2):
            if aa1 != self.GAP_CHAR:
                i1 += 1
            if aa2 != self.GAP_CHAR:
                i2 += 1
            if aa1 != self.GAP_CHAR and aa2 != self.GAP_CHAR:
                mapping[ids1[i1-1]] = ids2[i2-1]
        return mapping

    @classmethod
    def get_gaps_ranges(cls, align: str, tail_gaps: bool=True) -> List[List[int]]:
        """Return gaps ranges of alignment string.

        Each range is a two-element list [start, end]: 0-based index of the first
        gap character and index just after the last one (half-open interval).

        arguments:
        align (str): aligned sequence string (an empty string yields an empty list)
        tail_gaps (bool): if False, drop the range touching the left end and the one touching the right end
        """

        # Nothing to scan (and nothing to index below) for an empty string
        if not align:
            return []

        # Detect gaps
        gaps_ranges = []
        gap_start = None
        for i, aa in enumerate(align):
            if aa == cls.GAP_CHAR:
                # Open gap range
                if gap_start is None:
                    gap_start = i
            elif gap_start is not None:
                # Close gap range
                gaps_ranges.append([gap_start, i])
                gap_start = None

        # Manage right tail gap (range still open at the end of the string)
        if gap_start is not None:
            gaps_ranges.append([gap_start, len(align)])

        # Remove tail gaps if required
        if not tail_gaps:
            if align[0] == cls.GAP_CHAR:
                gaps_ranges = gaps_ranges[1:]
            if align[-1] == cls.GAP_CHAR:
                gaps_ranges = gaps_ranges[:-1]

        return gaps_ranges
251 |
252 | # Dependencies -----------------------------------------------------------------
def _count_tail_characters(input_sequence: str, count_char: str) -> Tuple[int, int]:
    """Return (left, right): lengths of the runs of count_char at the start and at the end of input_sequence.

    Both runs are measured independently, so a sequence made only of count_char
    yields its full length twice.
    """
    total = len(input_sequence)

    # Left tail: walk forward while the character matches
    left = 0
    while left < total and input_sequence[left] == count_char:
        left += 1

    # Right tail: walk backward while the character matches
    right = 0
    while right < total and input_sequence[total - 1 - right] == count_char:
        right += 1

    return left, right
268 |
--------------------------------------------------------------------------------
/rsalor/sequence/sequence.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import List, Union
5 | from rsalor.sequence import AminoAcid
6 | from rsalor.sequence import Mutation
7 |
8 | # Sequence ---------------------------------------------------------------------
class Sequence:
    """Container class for a single sequence (name, sequence and weight).

    Equality and hashing are based on the sequence string only (name and weight are ignored).

    usage:

    seq: Sequence = Sequence('seq1', 'MQIFVKTLTGKTI--T') \n
    seq_name: str = seq.name \n
    seq_str: str = seq.sequence \n
    seq.write('./fasta/seq1.fasta')
    """

    # Constants ----------------------------------------------------------------
    HEADER_START_CHAR = ">"
    GAP_CHAR = AminoAcid.GAP_ONE
    # Maps every known one-letter code (upper and lower case) to itself; anything else is treated as special
    AMINO_ACIDS_IDENTITY_MAP = {aa.one: aa.one for aa in AminoAcid.get_all()} | {aa.one.lower(): aa.one.lower() for aa in AminoAcid.get_all()}

    # Constructor --------------------------------------------------------------
    def __init__(self, name: str, sequence: str, weight: float=1.0, to_upper: bool=True, convert_special_characters: bool=True):
        """Constructor for a (protein) Sequence object.
        name (str) name of the sequence (one leading '>' is stripped if present)
        sequence (str) amino acid sequence as a string
        weight (float=1.0) weight of the sequence (in an MSA)
        to_upper (bool=True) if True, convert all lower case amino acids to upper cases (such as in '.a2m' format)
        convert_special_characters (bool=True) if True, convert all non-standard characters (like '.' or '_') to a gap '-' (such as in '.a2m' format)
        """
        # removeprefix() is a no-op when the prefix is absent, no need to test for it first
        name = name.removeprefix(self.HEADER_START_CHAR)
        if to_upper:
            sequence = sequence.upper()
        if convert_special_characters:
            gap = self.GAP_CHAR
            aa_map = self.AMINO_ACIDS_IDENTITY_MAP
            sequence = "".join([aa_map.get(aa, gap) for aa in sequence])
        self.name: str = name
        self.sequence: str = sequence
        self.weight: float = weight

    # Base properties ----------------------------------------------------------
    def __len__(self) -> int:
        return len(self.sequence)

    def __str__(self) -> str:
        MAX_PRINT_LEN = 15
        seq_str = self.sequence
        if len(seq_str) > MAX_PRINT_LEN:
            seq_str = f"{seq_str[0:MAX_PRINT_LEN]}..."
        name_str = self.name
        if len(name_str) > MAX_PRINT_LEN:
            name_str = f"{name_str[0:MAX_PRINT_LEN]}..."
        return f"Sequence('{name_str}', seq='{seq_str}', l={len(self)})"

    def __eq__(self, other: object) -> bool:
        """Compare by sequence string; objects without a 'sequence' attribute are not comparable."""
        other_sequence = getattr(other, "sequence", None)
        if other_sequence is None:
            # Let Python try the reflected comparison instead of raising AttributeError
            return NotImplemented
        return self.sequence == other_sequence

    def __ne__(self, other: object) -> bool:
        """Inverse of __eq__ (this is the special method Python calls for '!=')."""
        other_sequence = getattr(other, "sequence", None)
        if other_sequence is None:
            return NotImplemented
        return self.sequence != other_sequence

    # Legacy alias: '__neq__' is not a Python special method, kept only for explicit callers
    __neq__ = __ne__

    def __hash__(self) -> int:
        return hash(self.sequence)

    def __iter__(self):
        return iter(self.sequence)

    def __getitem__(self, id: int) -> str:
        return self.sequence[id]

    def __contains__(self, char: str) -> bool:
        return char in self.sequence

    # Base Methods -------------------------------------------------------------
    def n_gaps(self) -> int:
        """Return number of gaps in sequence."""
        return sum(1 for char in self.sequence if char == self.GAP_CHAR)

    def n_non_gaps(self) -> int:
        """Return number of non-gaps in sequence."""
        return sum(1 for char in self.sequence if char != self.GAP_CHAR)

    def gap_ratio(self) -> float:
        """Return gap ratio (raises ZeroDivisionError on an empty sequence)."""
        return self.n_gaps() / len(self)

    def contains_gaps(self) -> bool:
        """Return if sequence contains gaps."""
        return any(char == self.GAP_CHAR for char in self.sequence)

    def is_all_amino_acids(self) -> bool:
        """Return if every character of the sequence is accepted by AminoAcid.one_exists()."""
        return all(AminoAcid.one_exists(char) for char in self.sequence)

    def to_fasta_string(self) -> str:
        """Return string of the sequence in FASTA format."""
        return f"{self.HEADER_START_CHAR}{self.name}\n{self.sequence}\n"

    def mutation_is_compatible(self, mutation: Union[str, Mutation]) -> bool:
        """Return if mutation is compatible with the sequence.

        A mutation is compatible when its (1-based) position lies inside the sequence
        and its wild-type amino acid equals the character found at that position.
        """

        # Convert to Mutation type
        if isinstance(mutation, str):
            mutation = Mutation(mutation)

        # Verify if mutation position is in sequence
        if not (1 <= mutation.position <= len(self)):
            return False
        # Verify if wild-type amino acid corresponds to sequence
        return mutation.wt_aa.one == self.sequence[mutation.position-1]

    # IO Methods ---------------------------------------------------------------
    def write(self, fasta_path: str) -> "Sequence":
        """Save sequence in a FASTA file and return self."""

        # Guardians
        fasta_path = os.path.abspath(fasta_path)
        assert os.path.isdir(os.path.dirname(fasta_path)), f"ERROR in Sequence('{self.name}').write(): directory of '{fasta_path}' does not exists."
        assert fasta_path.endswith(".fasta"), f"ERROR in Sequence('{self.name}').write(): fasta_path='{fasta_path}' should end with '.fasta'."

        # Save FASTA and return self
        with open(fasta_path, "w") as fs:
            fs.write(self.to_fasta_string())
        return self

    # Mutate Methods -----------------------------------------------------------
    def trim(self, keep_positions: List[bool]) -> "Sequence":
        """Trim sequence in place (filter on positions) according to keep_positions (array of bool indicating which position to keep)."""

        # Guardians
        assert len(keep_positions) == len(self), f"ERROR in {self}.trim(): length of keep_positions ({len(keep_positions)}) does not match length of sequence ({len(self)})."

        # Trim and return self
        self.sequence = "".join([char for char, keep in zip(self.sequence, keep_positions) if keep])
        return self
--------------------------------------------------------------------------------
/rsalor/structure/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.structure.residue import Residue
2 | from rsalor.structure.structure import Structure
3 |
--------------------------------------------------------------------------------
/rsalor/structure/residue.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | from typing import Union
4 | from rsalor.sequence import AminoAcid
5 |
6 | # Main -------------------------------------------------------------------------
class Residue:
    """Container class for a PDB residue (chain, position, amino acid and optional RSA value).

    usage:
    res = Residue('A', '113', AminoAcid('K'))
    """

    # Constructor --------------------------------------------------------------
    def __init__(self, chain: str, position: str, amino_acid: AminoAcid, rsa: Union[None, float]=None):
        """Store the residue properties after checking chain and rsa.

        arguments:
        chain (str): chain identifier, exactly one character and not a blank
        position (str): residue position, kept as a string
        amino_acid (AminoAcid): amino acid of the residue
        rsa (Union[None, float]=None): RSA value, must be >= 0 when given
        """

        # Guardians
        chain_is_valid = len(chain) == 1 and chain != " "
        assert chain_is_valid, f"ERROR in Residue(): invalid chain='{chain}'."
        assert rsa is None or rsa >= 0.0, f"ERROR in Residue(): rsa='{rsa}' should be positive."

        # Set properties
        self.chain: str = chain
        self.position: str = position
        self.amino_acid: AminoAcid = amino_acid
        self.rsa: Union[None, float] = rsa

    # Properties ---------------------------------------------------------------
    @property
    def resid(self) -> str:
        """Residue identifier: chain followed by position."""
        chain, position = self.chain, self.position
        return chain + position

    def __str__(self) -> str:
        three_letter_code = self.amino_acid.three
        return f"Residue('{self.resid}', '{three_letter_code}', RSA={self.rsa})"
--------------------------------------------------------------------------------
/rsalor/structure/structure.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import Union, List, Dict, Literal
5 | from rsalor.sequence import AminoAcid
6 | from rsalor.structure import Residue
7 | from rsalor.sequence import Sequence
8 | from rsalor.rsa import RSASolver, RSABiopython, RSADSSP, RSAMuSiC
9 |
10 | # Execution --------------------------------------------------------------------
class Structure:
    """Structure object for parsing all Residues from ATOM lines and assign RSA (with biopython (Shrake & Rupley), DSSP or MuSiC).

    usage:
    structure = Structure('./my_pdb.pdb', 'A')
    """

    # Constants ----------------------------------------------------------------
    # Maps the solver name accepted by the constructor to the solver class to instantiate
    RSA_SOLVERS: Dict[str, RSASolver] = {
        "biopython": RSABiopython,
        "DSSP": RSADSSP,
        "MuSiC": RSAMuSiC,
    }

    # PDB record names handled by the parser (compared after stripping the padding of the record field)
    _ATOM_RECORDS = ("ATOM", "HETATM")
    _MODEL_RECORD = "MODEL"
    _TER_RECORD = "TER"

    # Constructor --------------------------------------------------------------
    def __init__(
            self,
            pdb_path: str,
            chain: str,
            rsa_solver: Literal["biopython", "DSSP", "MuSiC"]="biopython",
            rsa_solver_path: Union[None, str]=None,
            rsa_cache_path: Union[None, str]=None,
            verbose: bool=False,
        ):
        """Structure object for parsing all Residues from ATOM lines and assign RSA (with biopython, DSSP or MuSiC).

        arguments:
        pdb_path (str): path to PDB file
        chain (str): target chain in the PDB
        rsa_solver ('biopython'/'DSSP'/'MuSiC'): solver to use to compute RSA
        rsa_solver_path (Union[None, str]=None): path to solver executable
        rsa_cache_path (Union[None, str]=None): path to write/read to/from RSA values
        verbose (bool=False): set True for logs

        raises:
        AssertionError: on an invalid pdb_path, chain or rsa_solver
        ValueError: if the target chain has no residue in the PDB file
        """

        # Guardians
        assert os.path.isfile(pdb_path), f"ERROR in Structure(): pdb_path='{pdb_path}' file does not exist."
        assert pdb_path.endswith(".pdb"), f"ERROR in Structure(): pdb_path='{pdb_path}' should end with '.pdb'."
        assert len(chain) == 1 and chain != " ", f"ERROR in Structure(): chain='{chain}' should be a string of length 1 and not ' '."
        solver_list = list(self.RSA_SOLVERS.keys())
        assert rsa_solver in solver_list, f"ERROR in Structure(): rsa_solver='{rsa_solver}' should be in {solver_list}."

        # Init base properties
        self.pdb_path = pdb_path
        self.pdb_name = os.path.basename(self.pdb_path).removesuffix(".pdb")
        self.chain = chain
        self.name = f"{self.pdb_name}_{self.chain}"
        self.rsa_solver = rsa_solver
        self.rsa_solver_path = rsa_solver_path
        self.verbose = verbose

        # Parse structure
        self.residues: List[Residue] = []
        self.chain_residues: List[Residue] = []
        self.residues_map: Dict[str, Residue] = {}
        self._parse_structure()

        # Set sequence
        self.sequence = Sequence(f"{self.name} (PDB, ATOM-lines)", "".join(res.amino_acid.one for res in self.chain_residues))

        # Assign RSA to every parsed residue found in the solver output (all chains)
        solver: RSASolver = self.RSA_SOLVERS[rsa_solver]
        rsa_map = solver(self.rsa_solver_path, self.verbose).run(self.pdb_path, rsa_cache_path=rsa_cache_path)
        n_assigned_in_chain = 0
        for residue in self.residues:
            resid = residue.resid
            if resid in rsa_map:
                if residue.chain == self.chain:
                    n_assigned_in_chain += 1
                residue.rsa = rsa_map[resid]

        # Log
        if self.verbose:
            print(f" * {n_assigned_in_chain} / {len(self.chain_residues)} assigned RSA values for chain '{self.chain}'")


    # Base properties ----------------------------------------------------------
    def __str__(self) -> str:
        return f"Structure('{self.name}', l={len(self)})"

    def __len__(self) -> int:
        """Return the number of parsed residues (all chains)."""
        return len(self.residues)

    def __contains__(self, resid: str) -> bool:
        return resid in self.residues_map

    def __getitem__(self, id: int) -> Residue:
        return self.residues[id]

    def __iter__(self):
        return iter(self.residues)

    # Dependencies -------------------------------------------------------------
    def _parse_structure(self) -> None:
        """Parse residues data from PDB file.

        Fills self.residues / self.residues_map with one Residue per distinct
        chain + position found on ATOM / HETATM lines, then self.chain_residues
        with the residues of the target chain.

        raises:
        ValueError: if no residue of the target chain was found
        """

        # Init
        model_counter = 0
        current_chain = None
        closed_chains = set()

        # Parse PDB residues
        with open(self.pdb_path, "r", encoding="ISO-8859-1") as fs:
            for line in fs:

                # Record name is the first field of the line; strip its padding (spaces or
                # the newline of a short line) so the comparison does not depend on it
                record = line[0:6].strip()

                # Atom line (lines too short to hold a chain identifier are ignored)
                if record in self._ATOM_RECORDS and len(line) > 21:
                    current_chain = line[21]
                    if current_chain in closed_chains: # discard ATOM line if chain is closed
                        continue
                    position = line[22:27].replace(" ", "")
                    aa = AminoAcid.parse_three(line[17:20])
                    if aa.is_unknown(): # discard non amino acid ATOM lines
                        continue
                    resid = current_chain + position
                    # Only the first atom line of a residue creates it
                    if resid not in self.residues_map:
                        residue = Residue(current_chain, position, aa)
                        self.residues.append(residue)
                        self.residues_map[resid] = residue

                # Manage multiple models: consider only model 1
                elif record == self._MODEL_RECORD:
                    model_counter += 1
                    if model_counter > 1:
                        break

                # Manage closed chains: ATOMS that appears after the chain is closed are not part of the protein chain
                elif record == self._TER_RECORD:
                    if current_chain is not None:
                        closed_chains.add(current_chain)

        # Set residues list of target chain
        self.chain_residues = [res for res in self.residues if res.chain == self.chain]

        # No target chain error
        if len(self.chain_residues) == 0:
            error_log = f"ERROR in {self}._parse_structure(): target chain '{self.chain}' not found in PDB file."
            error_log += f"\n * pdb_path: '{self.pdb_path}'"
            error_log += f"\n * num total residues: {len(self.residues)}"
            error_log += f"\n * existing chains: {list(set([res.chain for res in self.residues]))}"
            raise ValueError(error_log)
--------------------------------------------------------------------------------
/rsalor/utils/CSV.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | import csv
5 | from typing import Union, Tuple, List, Dict, Callable
6 | import numpy as np
7 |
8 |
9 | # Main -------------------------------------------------------------------------
10 | class CSV:
11 | """
12 | Class to read/write a CSV file and manage it as a dataframe.
13 | * It never assumes a column or a cell type except when it is specified (all cells as :str by default).
14 | * Manages safety: impossible to have redundent values in header.
15 | """
16 |
17 | # Constants ----------------------------------------------------------------
18 | ALLOWED_EXTENTIONS = ["csv", "tsv"]
19 |
20 | # Constructor --------------------------------------------------------------
def __init__(
        self,
        header: Union[None, List[str]]=None,
        sep: str=";",
        name: str="DataFrame",
        print_warnings: bool=True,
    ):
    """Create an empty CSV (no entries) with the given header.

    arguments:
    header (Union[None, List[str]]=None): column names, None means no column
    sep (str=";"): separator, stored in the header object
    name (str="DataFrame"): name used in logs
    print_warnings (bool=True): if False, self.warning() prints nothing
    """

    # Base properties
    self.name = name
    self.print_warnings = print_warnings

    # Content
    # A fresh list is created per instance: a mutable default argument would be
    # one single list object handed to every CSV built without an explicit header
    self._header = Header([] if header is None else header, sep)
    self.entries = []
36 |
37 | # Basic properties ---------------------------------------------------------
38 | @property
39 | def sep(self) -> str:
40 | return self._header.sep
41 |
42 | def __len__(self) -> int:
43 | return len(self.entries)
44 |
45 | def __contains__(self, property_name: str) -> bool:
46 | return property_name in self._header
47 |
48 | def __getitem__(self, id: int) -> dict:
49 | return self.entries[id]
50 |
51 | def __iter__(self):
52 | return iter(self.entries)
53 |
54 | @property
55 | def n_rows(self) -> int:
56 | return len(self)
57 |
58 | @property
59 | def n_cols(self) -> int:
60 | return len(self._header)
61 |
62 | @property
63 | def shape(self) -> Tuple[int, int]:
64 | return (self.n_rows, self.n_cols)
65 |
66 | @property
67 | def df_size(self) -> int:
68 | return self.n_rows * self.n_cols
69 |
70 | def header(self) -> List[str]:
71 | return [p for p in self._header.properties]
72 |
73 | def __str__(self) -> str:
74 | return f"CSV('{self.name}', r={self.n_rows}, c={self.n_cols})"
75 |
76 | def warning(self, warning_str: str="") -> None:
77 | """Log a CSV Warning."""
78 | if self.print_warnings:
79 | print(f"WARNING in {self}{warning_str}")
80 |
81 | # Methods ------------------------------------------------------------------
82 | def set_sep(self, sep: str, safety_check: bool=True):
83 | """Set separator for the CSV (for .read and .write)"""
84 |
85 | if safety_check:
86 |
87 | # Computational time warning
88 | if self.df_size > 1000:
89 | self.warning(
90 | f".set_sep('{sep}'): could be computationally expensive when CSV object already contains many entries. " + \
91 | f"You can set 'safety_check' to False to skip coherence checks with separator."
92 | )
93 |
94 | # Guardians
95 | for entry in self.entries:
96 | for key, value in entry.items():
97 | assert sep not in str(value), f"ERROR in {self}.set_sep('{sep}'): sep contained in entry's value ('{key}': '{value}')."
98 |
99 | # Set
100 | self._header.set_sep(sep)
101 | return self
102 |
103 | def add_entry(self, entry: dict):
104 | entry = {prop: entry[prop] for prop in self._header}
105 | self.entries.append(entry)
106 | return self
107 |
108 | def add_entries(self, entries: List[dict]):
109 | for entry in entries:
110 | self.add_entry(entry)
111 | return self
112 |
113 | def add_col(self, property: str, values: list, allow_replacement=False):
114 | if property in self._header:
115 | assert allow_replacement, f"ERROR in {self}.add_col(): property='{property}' already exists and allow_replacement is set to False."
116 | else:
117 | self._header.add(property)
118 | assert len(values) == len(self), f"ERROR in {self}.add_col(): values length ({len(values)}) != CSV length ({len(self)})."
119 | for entry, value in zip(self.entries, values):
120 | entry[property] = value
121 | return self
122 |
123 | def add_empty_col(self, property: str, missing_value: str="XXX", allow_replacement=False):
124 | values = [missing_value for _ in self.entries]
125 | self.add_col(property, values, allow_replacement=allow_replacement)
126 | return self
127 |
128 | def add_csv(self, other_csv):
129 | """Merge other_csv entries with current CSV (keeps header of current CSV)."""
130 | for property_name in self.header():
131 | assert property_name in other_csv._header, f"ERROR in {self}.add_csv(): property='{property_name}' does not exists in other_csv ({other_csv})."
132 | for entry in other_csv:
133 | self.add_entry(entry)
134 | return self
135 |
136 | def remove_col(self, property: str):
137 | self._header.remove(property)
138 | for entry in self.entries:
139 | del entry[property]
140 | return self
141 |
142 | def rename_col(self, property_old: str, property_new: str):
143 | self._header.rename(property_old, property_new)
144 | for entry in self.entries:
145 | entry[property_new] = entry[property_old]
146 | del entry[property_old]
147 | return self
148 |
149 | def order_header(self, header_order: List[str]):
150 | self._header.order(header_order)
151 | return self
152 |
153 | def filter(self, keep_entry_function: Callable, do_print: bool=False, filter_name: str=""):
154 | """Filter entries in the CSV with a filter_function."""
155 | l1 = len(self)
156 | self.entries = [entry for entry in self.entries if keep_entry_function(entry)]
157 | l2 = len(self)
158 | if do_print:
159 | print(f"{self}: Filter('{filter_name}'): {l1} -> {l2}")
160 | return self
161 |
162 | def set_col_type(self, property_name: str, dt: type, default_value=None):
163 | assert property_name in self._header, f"ERROR in {self}.set_col_type(): property_name='{property_name}' does not exists."
164 | for entry in self.entries:
165 | entry[property_name] = to_type(entry[property_name], dt, default_value=default_value)
166 |
167 | # Get Methods --------------------------------------------------------------
168 | def get_col(self, property: str, dt: Union[None, type]=None, default_value=None, as_numpy: bool=False):
169 | """Get Column of CSV as array."""
170 | assert property in self, f"ERROR in {self}.get_array('{property}'): property does not exists."
171 | col_list = [entry[property] for entry in self.entries]
172 | if dt is not None:
173 | col_list = [to_type(el, dt, default_value=default_value) for el in col_list]
174 | if as_numpy:
175 | col_list = np.array(col_list)
176 | return col_list
177 |
178 | def get_row(self, id: int, dt: Union[None, type]=None, default_value=None, as_numpy: bool=False):
179 | """Get Raw of CSV as array"""
180 | entry = self[id]
181 | row_list = [entry[p] for p in self._header]
182 | if dt is not None:
183 | row_list = [to_type(el, dt, default_value=default_value) for el in row_list]
184 | if as_numpy:
185 | row_list = np.array(row_list)
186 | return row_list
187 |
def get_X(self, features: List[str]) -> np.ndarray:
    """Build the features matrix X (numpy) from the CSV.

    Each entry yields one row; each value is converted with float().
    All requested features must be present in the CSV.
    """
    for feature in features:
        assert feature in self, f"ERROR in {self}.get_X(): feature='{feature}' does not exists."
    rows = []
    for entry in self.entries:
        rows.append([float(entry[name]) for name in features])
    return np.array(rows)
196 |
def get_y(self, label: str) -> np.ndarray:
    """Build the label array y (numpy) from the CSV; values are converted with float()."""
    assert label in self, f"ERROR in {self}.get_y(): label='{label}' does not exists."
    values = []
    for entry in self.entries:
        values.append(float(entry[label]))
    return np.array(values)
201 |
def get_Xy(self, features: List[str], label: str) -> Tuple[np.ndarray, np.ndarray]:
    """Return the (X, y) pair built by get_X(features) and get_y(label)."""
    X = self.get_X(features)
    y = self.get_y(label)
    return X, y
205 |
@staticmethod
def hash_entry(entry: dict, hash_properties: List[str], sep: str="_") -> str:
    """Hash an entry (to :str) by joining the values of its hash_properties with sep.

    Values are passed through str() so that entries whose columns were
    converted to non-string types (e.g. by set_col_type) can still be hashed.
    """
    return sep.join(str(entry[prop]) for prop in hash_properties)
210 |
@staticmethod
def get_hash_entry(hash_properties: List[str], sep: str="_") -> Callable:
    """Generate a hash_entry function bound to hash_properties and sep.

    The returned function maps an entry to the sep-joined string of its
    hash_properties values; values are passed through str() so non-string
    columns can be hashed too.
    """
    def hash_entry_function(entry: dict) -> str:
        return sep.join(str(entry[prop]) for prop in hash_properties)
    return hash_entry_function
217 |
def get_map(self, hash_properties: List[str], sep: str="_", map_function: Union[None, Callable]=None) -> Dict[str, Dict]:
    """
    Obtain a map {hash(entry) -> entry} from CSV (redundencies not allowed).
    * if map_function is set, values of the map are defined as map_function(entry)
    * entries themselves are never modified
    """
    for prop in hash_properties:
        assert prop in self, f"ERROR in {self}.get_map(): property='{prop}' not in header."
    entries_map = {}
    for entry in self.entries:
        h = self.hash_entry(entry, hash_properties, sep=sep)
        assert h not in entries_map, f"ERROR in {self}.get_map({hash_properties}) redundency found for '{h}'."
        # Store the mapped value in the map itself (previously it was written
        # into the entry under key h, leaving the map values unmapped).
        entries_map[h] = entry if map_function is None else map_function(entry)
    return entries_map
234 |
def get_groups(self, hash_properties: List[str], sep: str="_", map_function: Union[None, Callable]=None) -> Dict[str, List[Dict]]:
    """
    Obtain a map for groups {hash(entry) -> [entries_list]} from CSV.
    * if map_function is set, values of the map are defined as [map_function(entry), ...]
    """
    for prop in hash_properties:
        # The message names this method (it previously pointed to 'to_map').
        assert prop in self, f"ERROR in {self}.get_groups(): property='{prop}' not in header."
    groups_map = {}
    for entry in self:
        h = self.hash_entry(entry, hash_properties, sep=sep)
        groups_map.setdefault(h, []).append(entry)
    if map_function is not None:
        groups_map = {h: [map_function(e) for e in group] for h, group in groups_map.items()}
    return groups_map
252 |
def copy(self):
    """Copy CSV object.

    The header is copied through its own copy() and every entry dict is
    shallow-copied, so adding/removing columns or entries on the copy does
    not affect the original. Values inside entries are shared.
    """
    new_csv = CSV()
    new_csv.name = self.name
    new_csv.print_warnings = self.print_warnings
    # Carry over the separator too: write()/read() rely on self.sep, and a
    # copy that silently fell back to the constructor's separator would no
    # longer match the copied header.
    new_csv.sep = self.sep
    new_csv._header = self._header.copy()
    new_csv.entries = [dict(entry) for entry in self.entries]
    return new_csv
264 |
def show(self, n_entries: int=5, min_colsize: int=3, max_colsize: int=20, max_linesize: int=200, round_digit: int=4, sep: str=" | ") -> None:
    """Print a summary of the CSV: its string form, the header and the first n_entries rows.

    Column widths are the widest rendered cell of each column, clamped to
    [min_colsize, max_colsize]. A trailing ' ...' line is printed when
    some entries are not shown.
    """
    n_shown = min(n_entries, len(self))
    lines = [self._header.properties]
    for row_id in range(n_shown):
        lines.append(self.get_row(row_id))

    col_sizes = []
    for i in range(len(self._header)):
        widest_cell = max(len(stringify_float(line[i], round_digit=round_digit)) for line in lines)
        col_sizes.append(max(min_colsize, min(widest_cell, max_colsize)))

    print(self)
    for line in lines:
        print_line(line, sizes=col_sizes, max_linesize=max_linesize, round_digit=round_digit, sep=sep)
    if len(self) > n_entries:
        print(" ...")
277 |
278 | # IO -----------------------------------------------------------------------
def write(self, output_path: str):
    """Save to file.

    Writes the header line followed by one line per entry, joined with
    self.sep; values are rendered with str(). No trailing newline is
    written. Returns self for chaining.

    Raises AssertionError when the extension is not among
    ALLOWED_EXTENTIONS, when the destination folder does not exist, or
    when a '.tsv' path is used with a separator other than '\\t'.
    """

    # Guardians
    output_path = os.path.abspath(output_path)
    # str.endswith accepts a tuple of suffixes. The previous form
    # `any([cond] for ...)` tested non-empty lists and therefore always passed.
    allowed_suffixes = tuple(f".{extention}" for extention in self.ALLOWED_EXTENTIONS)
    assert output_path.endswith(allowed_suffixes), f"ERROR in {self}.write('{output_path}'): extention sould be among {self.ALLOWED_EXTENTIONS})."
    assert os.path.isdir(os.path.dirname(output_path)), f"ERROR in {self}.write('{output_path}'): destination folder does not exists."
    if output_path.endswith(".tsv"):
        assert self.sep == "\t", f"ERROR in {self}.write('{output_path}'): if extention is '.tsv', separator should be '\\t' however sep='{self.sep}'."

    # Stringify
    properties = self._header.properties
    str_lines = [self.sep.join(properties)]
    str_lines.extend(
        self.sep.join(str(entry[prop]) for prop in properties)
        for entry in self.entries
    )

    # Write
    with open(output_path, "w") as fs:
        fs.write("\n".join(str_lines))
    return self
301 |
def read(self, input_path: str, col_types: Union[None, Dict[str, type]]=None, col_default: Union[None, dict]=None):
    """Read from file.

    * input_path: path to the file; its extension must be among ALLOWED_EXTENTIONS.
    * col_types: optional {column name -> type}; those columns are converted with set_col_type().
    * col_default: optional {column name -> default value} used when a conversion fails.

    Sets the name (file name without its last extension), the header and
    the entries, then returns self. Raises AssertionError on a bad
    extension, a missing or empty file, or a row whose length does not
    match the header.
    """

    # Avoid mutable default arguments
    col_types = {} if col_types is None else col_types
    col_default = {} if col_default is None else col_default

    # Guardians
    # str.endswith accepts a tuple of suffixes. The previous form
    # `any([cond] for ...)` tested non-empty lists and therefore always passed.
    allowed_suffixes = tuple(f".{extention}" for extention in self.ALLOWED_EXTENTIONS)
    assert input_path.endswith(allowed_suffixes), f"ERROR in {self}.read('{input_path}'): extention sould be among {self.ALLOWED_EXTENTIONS})."
    assert os.path.isfile(input_path), f"ERROR in {self}.read('{input_path}'): input_path file does not exists."

    # Set name
    file_name = os.path.basename(input_path)
    name = ".".join(file_name.split(".")[:-1])
    self.name = name

    # Parse csv from file
    with open(input_path, newline='') as csvfile:
        csv_lines = list(csv.reader(csvfile, delimiter=self.sep))
    assert len(csv_lines) > 0, f"ERROR in {self}.read('{input_path}'): file is empty (no header line)."

    # Set CSV header
    header = csv_lines[0]
    if len(header) <= 1:
        self.warning(f".read('{input_path}'): header contains {len(header)} values. Maybe sep='{self.sep}' parameter in incorrect.")
    self._header = Header(header, self.sep)

    # Set CSV entries
    self.entries = []
    n_entries = len(csv_lines) - 1
    for i, line in enumerate(csv_lines[1:]):
        assert len(line) == len(header), f"ERROR in {self}.read('{input_path}'): number of elements ({len(line)}) in entry ({i+1}/{n_entries}) does not match the header ({len(header)})."
        self.entries.append(dict(zip(header, line)))

    # Set column types if required
    for col_name, dt in col_types.items():
        col_default_value = col_default.get(col_name, None)
        self.set_col_type(col_name, dt, default_value=col_default_value)

    return self
336 |
337 | # Dependencies -----------------------------------------------------------------
338 |
class Header:
    """
    Container for the Header of a CSV object.
    -> ordered list with no repetitions allowed and a separator of length = 1.

    Attributes:
        sep: the separator; no property name may contain it.
        properties: ordered list of property (column) names.
        properties_set: same names as a set, used for membership tests.
    """

    # Constructor --------------------------------------------------------------
    def __init__(self, properties: List[str], sep: str):

        # Init
        self.sep = ""
        self.properties = []
        self.properties_set = set()

        # Set header values (each property is validated by add())
        self.set_sep(sep)
        for property in properties:
            self.add(property)

    # Basic properties ---------------------------------------------------------
    def __getitem__(self, id: int) -> str:
        return self.properties[id]

    def __iter__(self):
        return iter(self.properties)

    def __contains__(self, property_name: str) -> bool:
        return property_name in self.properties_set

    def __len__(self) -> int:
        return len(self.properties)

    def __str__(self) -> str:
        return f"CSV.Header(l={len(self)})"

    def show(self):
        """Print the header; the properties list is cut with ', ...' once it gets long."""
        MAX_CHAR = 80
        properties_str = ""
        # Iterating (instead of indexing properties[0]) keeps an empty header printable.
        for i, property in enumerate(self.properties):
            if i == 0:
                properties_str = f"'{property}'"
                continue
            if len(properties_str) + len(property) > MAX_CHAR:
                properties_str += ", ..."
                break
            properties_str += f", '{property}'"
        print(f"CSV.Header([{properties_str}], len={len(self)}, sep='{self.sep}')")
        return self

    def idof(self, property_name: str) -> int:
        """Return the position of property_name in the header."""
        assert property_name in self, f"ERROR in {self}.idof(): property_name='{property_name}' not in header."
        return self.properties.index(property_name)

    # Methods ------------------------------------------------------------------
    def set_sep(self, sep: str):
        """Set the separator; it must be a single character absent from all properties."""
        assert len(sep) == 1, f"ERROR in {self}.set_sep(): sep='{sep}' should be of length 1."
        for property in self:
            assert sep not in property, f"ERROR in {self}.set_sep(): sep='{sep}' is contained in property '{property}'."
        self.sep = sep
        return self

    def add(self, property_name: str):
        """Append a new property; it must not contain sep nor already exist."""
        assert self.sep not in property_name, f"ERROR in {self}.add('{property_name}'): property contains sep='{self.sep}'."
        assert property_name not in self, f"ERROR in {self}.add('{property_name}'): property already exists."
        self.properties.append(property_name)
        self.properties_set.add(property_name)
        return self

    def remove(self, property_name: str):
        """Remove an existing property."""
        assert property_name in self, f"ERROR in {self}.remove('{property_name}'): property does not exists."
        self.properties.remove(property_name)
        self.properties_set.remove(property_name)
        return self

    def rename(self, property_old: str, property_new: str):
        """Rename property_old to property_new, keeping its position."""
        assert property_old != property_new, f"ERROR in {self}.rename(): old property and new property have the same value '{property_old}'."
        assert property_old in self, f"ERROR in {self}.rename(): old property '{property_old}' is not in header."
        assert property_new not in self, f"ERROR in {self}.rename(): new property '{property_new}' already in header."
        assert self.sep not in property_new, f"ERROR in {self}.rename(): new property '{property_new}' contains sep='{self.sep}'."
        id = self.idof(property_old)
        self.properties[id] = property_new
        self.properties_set.add(property_new)
        self.properties_set.remove(property_old)
        return self

    def order(self, header_order: List[str]):
        """Move the properties of header_order to the front (in that order); the others keep their relative order."""
        for property in header_order:
            assert property in self, f"ERROR in {self}.order(): property '{property}' not in header."
        ordered_properties_set = set(header_order)
        # A repeated name would otherwise be duplicated in the header,
        # breaking the 'no repetitions' invariant of this container.
        assert len(ordered_properties_set) == len(header_order), f"ERROR in {self}.order(): header_order contains repeated properties."
        unordered_properties = [property for property in self.properties if property not in ordered_properties_set]
        self.properties = list(header_order) + unordered_properties
        return self

    def copy(self):
        """Return an independent Header with the same properties and separator."""
        return Header(list(self.properties), self.sep)
433 |
434 | # Dependency: Utils Funcions ---------------------------------------------------
def to_type(input, dt:type, default_value=None):
    """Convert input to type dt. If default_value is set, returns default_value when convertion fails.

    Raises ValueError when the conversion fails and no default_value is
    given (default_value=None means 'no default'). Only regular exceptions
    are treated as conversion failures; KeyboardInterrupt and SystemExit
    propagate.
    """
    try:
        return dt(input)
    except Exception as error:
        if default_value is None:
            raise ValueError(f"ERROR in CSV().to_type(): input='{input}' not convertable to {dt}. Please correct input or set a default_value.") from error
        return default_value
444 |
def print_line(
    line_list,
    sep: str=" | ", dots_str: str="...",
    size: int=20, sizes: Union[None, List[int]]=None, max_linesize: int=200,
    round_digit: int=4,
) -> None:
    """Print a line from a table (dataframe) in a standardized way.

    * line_list: the cells of the line.
    * sep: separator placed around every cell.
    * dots_str: marker used for truncated cells and for columns that do not fit.
    * size / sizes: common cell width, or one width per cell (sizes wins when set).
    * max_linesize: columns stop being printed once the line would exceed this budget.
    * round_digit: forwarded to format_string() for float rendering.
    """
    if sizes is None:
        sizes = [size for _ in line_list]
    line_str = ""
    unprinted_cols = False
    for element, col_size in zip(line_list, sizes):
        line_new_col = sep + format_string(element, col_size, dots_str=dots_str, round_digit=round_digit)
        # Keep room for the trailing separator and the dots marker.
        if len(line_str) + len(line_new_col) > max_linesize - (len(sep) + len(dots_str)):
            unprinted_cols = True
            break
        line_str += line_new_col
    if unprinted_cols:
        # Honour dots_str here too (it used to be a hard-coded "...").
        line_str += sep + dots_str
    line_str += sep
    # Drop the first and last characters (outer padding of the default " | " separator).
    print(line_str[1:-1])
465 |
def format_string(input, size: int=20, filler: str=" ", dots_str: str="...", round_digit: int=4) -> str:
    """Format a value as a string of exactly `size` characters.

    The value is rendered with stringify_float(); shorter strings are
    right-padded with filler, longer ones are cut and end with dots_str.
    """
    input_str = stringify_float(input, round_digit=round_digit)
    if len(input_str) > size:
        if size <= len(dots_str):
            # No room for any content: a negative slice bound would otherwise
            # keep most of the string and overflow the requested size.
            return dots_str[:max(size, 0)]
        return input_str[:size-len(dots_str)] + dots_str
    return input_str + filler*(size - len(input_str))
473 |
def stringify_float(input, round_digit: int=4) -> str:
    """Render a value as a string; floats are rounded and aligned.

    Floats are rounded to round_digit decimals, right-padded with zeros up
    to round_digit decimals, and non-negative values get a leading space so
    they line up with negative ones. Any other type is rendered with str().
    """
    if not isinstance(input, float):
        return str(input)
    str_float = str(round(input, round_digit))
    # Zero-pad plain decimal notation only: 'nan', 'inf' and exponent forms
    # such as '1e+16' would be corrupted by appended zeros (e.g. 'nan0').
    if "." in str_float and "e" not in str_float:
        n_digits = len(str_float.split(".")[-1])
        str_float = str_float + ("0"*(round_digit-n_digits))
    if str_float[0] != "-":
        str_float = " " + str_float
    return str_float
484 |
--------------------------------------------------------------------------------
/rsalor/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.utils.utils import is_convertable_to, memory_str, time_str, find_file
2 | from rsalor.utils.CSV import CSV
3 | from rsalor.utils.logger import Logger
4 | from rsalor.utils.ali_to_fasta import ali_to_fasta
--------------------------------------------------------------------------------
/rsalor/utils/ali_to_fasta.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from os import remove
5 | from typing import Union
6 | from Bio import SeqIO
7 |
8 | # MSA files processing functions -----------------------------------------------
def ali_to_fasta(input_path: str, output_path: str, delete_input: bool=False) -> Union[None, str]:
    """Convert '.ali' (Stockholm format) file to a '.fasta' file.
    * Then, deletes input '.ali' file if required.
    * Returns output_path or None if execution failed (the reason is printed).
    Source: https://stackoverflow.com/questions/24156578/using-bio-seqio-to-write-single-line-fasta
    """

    # Common prefix of every failure report
    error_log = (
        "ERROR in ali_to_fasta(): "
        f"\n * input_path : '{input_path}'"
        f"\n * output_path : '{output_path}'\n"
    )

    def _abort(reason: str, error=None) -> None:
        """Print the failure report (and the caught error, if any)."""
        print(f"{error_log} -> {reason}")
        if error is not None:
            print(error)
        return None

    # Guardians
    if not os.path.isfile(input_path):
        return _abort("input file does not exists.")
    if not is_nonempty_file(input_path):
        return _abort("input file is empty.")
    if not input_path.endswith(".ali"):
        return _abort("input file should end with '.ali'.")
    if not output_path.endswith(".fasta"):
        return _abort("output file should end with '.fasta'.")

    # Run convertion
    try:
        records = SeqIO.parse(input_path, "stockholm")
    except Exception as error:
        return _abort("input file parsing failed.", error)
    try:
        SeqIO.FastaIO.FastaWriter(output_path, wrap=None).write_file(records)
    except Exception as error:
        return _abort("file convertion + writing failed.", error)

    # Detect errors
    if not is_nonempty_file(output_path):
        return _abort("converted output file is empty.")

    # Delete initial '.ali' file if required
    if delete_input and os.path.isfile(input_path):
        remove(input_path)

    # Return
    return output_path
59 |
60 | # Dependency -------------------------------------------------------------------
def is_nonempty_file(input_path: str) -> bool:
    """Tell whether 'input_path' is an existing file whose first line is not empty."""
    if os.path.isfile(input_path):
        with open(input_path, "r") as fs:
            first_line = fs.readline()
        return first_line != ""
    return False
--------------------------------------------------------------------------------
/rsalor/utils/logger.py:
--------------------------------------------------------------------------------
1 |
2 | # Logger -----------------------------------------------------------------------
class Logger:
    """Minimalistic console logger with colored prefixes."""

    # Constants (ANSI escape sequences) ----------------------------------------
    HEADER = '\033[95m'
    OKBLUE = '\033[94m'
    OKCYAN = '\033[96m'
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Constructor --------------------------------------------------------------
    def __init__(
        self,
        verbose: bool,
        disable_warnings: bool=True,
        step_prefix: str="STEP",
        warning_prefix: str="WARNING",
        error_prefix: str="ERROR",
        step_note: str="",
        warning_note: str="",
        error_note: str="",
    ):
        """Minimalistic logger:
        * manage verbose and disable_warnings
        * add colored prefixes to logs
        """
        self.verbose = verbose
        self.disable_warnings = disable_warnings
        self._step_prefix, self._step_note = step_prefix, step_note
        self._warning_prefix, self._warning_note = warning_prefix, warning_note
        self._error_prefix, self._error_note = error_prefix, error_note

    # Helpers ------------------------------------------------------------------
    def _colored(self, color: str, prefix: str, note: str) -> str:
        """Wrap prefix in the given color code and append the (uncolored) note."""
        return color + prefix + self.ENDC + note

    # Methods ------------------------------------------------------------------
    @property
    def STEP_PREFIX(self) -> str:
        return self._colored(self.OKGREEN, self._step_prefix, self._step_note)

    @property
    def WARNING_PREFIX(self) -> str:
        return self._colored(self.WARNING, self._warning_prefix, self._warning_note)

    @property
    def CRITICAL_WARNING_PREFIX(self) -> str:
        return self._colored(self.FAIL, self._warning_prefix, self._warning_note)

    @property
    def ERROR_PREFIX(self) -> str:
        return self._colored(self.FAIL, self._error_prefix, self._error_note)

    def log(self, log_str: str) -> None:
        """Print log_str as is, only in verbose mode."""
        if not self.verbose:
            return
        print(log_str)

    def step(self, log_str: str) -> None:
        """Print log_str with the step prefix, only in verbose mode."""
        if not self.verbose:
            return
        print(f"{self.STEP_PREFIX}: {log_str}")

    def warning(self, log_str: str, critical: bool=False) -> None:
        """Print log_str with the warning prefix, unless warnings are disabled."""
        if self.disable_warnings:
            return
        prefix = self.CRITICAL_WARNING_PREFIX if critical else self.WARNING_PREFIX
        print(f"{prefix}: {log_str}")

    def error(self, log_str: str) -> None:
        """Print log_str with the error prefix (always printed)."""
        print(self.error_str(log_str))

    def error_str(self, log_str: str) -> str:
        """Return log_str with the error prefix, without printing it."""
        return f"{self.ERROR_PREFIX}: {log_str}"
--------------------------------------------------------------------------------
/rsalor/utils/utils.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from shutil import which
5 | from typing import List, Union
6 |
7 |
8 | # Base functions ---------------------------------------------------------------
def is_convertable_to(input_object, input_type) -> bool:
    """Return if input_object is convertable to input_type.

    The conversion is attempted as input_type(input_object); any regular
    exception means 'not convertable'. KeyboardInterrupt and SystemExit are
    not swallowed.
    """
    try:
        input_type(input_object)
    except Exception:
        return False
    return True
16 |
def memory_str(n_bytes: int) -> str:
    """Return a human readable string for a memory size measure (input in bytes).

    Uses decimal units (1 kB = 1000 B). A value exactly equal to a unit is
    expressed in that unit (1000 -> '1.000 kB').
    """
    if n_bytes >= 1000**3:
        return f"{n_bytes / 1000**3:.3f} GB"
    if n_bytes >= 1000**2:
        return f"{n_bytes / 1000**2:.3f} MB"
    if n_bytes >= 1000:
        return f"{n_bytes / 1000:.3f} kB"
    return f"{n_bytes} B"
27 |
def time_str(n_sec: float) -> str:
    """Return a human readable string for a time measure (input in seconds).

    A value exactly equal to a unit is expressed in that unit
    (60 -> '1.000 min.').
    """
    minute, hour, day = 60, 60*60, 60*60*24
    if n_sec >= day:
        return f"{n_sec / day:.3f} d."
    if n_sec >= hour:
        return f"{n_sec / hour:.3f} h."
    if n_sec >= minute:
        return f"{n_sec / minute:.3f} min."
    return f"{n_sec:.3f} sec."
38 |
def find_file(path_list: List[str], is_software: bool, name: str, description: Union[str, None]=None, verbose: bool=False,) -> str:
    """Find first existing file among path_list.

    * Candidates are first tried as paths to existing files.
    * If none exists and is_software is set, the base name of each
      candidate is looked up as an executable in the system PATH.
    * Raises ValueError (listing everything that was tried, plus the
      optional description) when nothing is found.
    """

    # First pass: candidates taken as paths to files
    output_path = next((path for path in path_list if os.path.isfile(path)), None)
    if output_path is not None and verbose:
        print(f" * Set path for [{name}] (AS PATH TO A FILE): '{output_path}'")

    # Second pass: candidates taken as commands available in PATH
    if output_path is None and is_software:
        for candidate_path in path_list:
            resolved_path = which(os.path.basename(candidate_path))
            if resolved_path is None:
                continue
            output_path = resolved_path
            if verbose:
                print(f" * set path for [{name}] (AS EXECUTABLE): '{output_path}'")
            break

    # Return first found valid path
    if output_path is not None:
        return output_path

    # Nothing found: assemble the error report
    instance_name = "software" if is_software else "file"
    report = [
        f"\nERROR in find_file(): no valid path found for {instance_name} '{name}':",
        "\nPath to file not found among: ",
    ]
    report.extend(f"\n - '{candidate_path}'" for candidate_path in path_list)
    if is_software:
        report.append("\nCommand not found in the system PATH among: ")
        report.extend(f"\n - '{os.path.basename(candidate_path)}'" for candidate_path in path_list)
        report.append(f"\n -> Please install software '{name}' and provide the path to its executable file or add it to system PATH.")
    if description is not None:
        report.append(f"\nDescription: \n{description}")
    raise ValueError("".join(report))
91 |
92 |
--------------------------------------------------------------------------------
/rsalor/weights/CMakeLists.txt:
--------------------------------------------------------------------------------
# Build script for the C++ weights backend: compiles computeWeightsBackend.cpp
# and msa.cpp into one shared library.

# Specify the minimum CMake version required
cmake_minimum_required(VERSION 3.10)
project(computeWeightsBackend)

# Specify C++ standard (C++11, enforced: no fallback to an older standard)
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED True)

# Include directories (the 'include' folder next to this file)
include_directories(include)

# Source files
set(SRC_FILES
    computeWeightsBackend.cpp
    msa.cpp
)

# Shared library output
add_library(computeWeightsBackend SHARED ${SRC_FILES})

# Set the library base name to "_computeWeightsBackend" and force a ".so" suffix.
# NOTE(review): the target prefix is left at its default; presumably this yields
# "lib_computeWeightsBackend.so" on Unix-like systems - confirm it matches the
# pattern the Python wrapper searches for.
set_target_properties(computeWeightsBackend PROPERTIES
    OUTPUT_NAME "_computeWeightsBackend"
    SUFFIX ".so"
)

# Compiler options (optional: add any optimization/debug flags here)
# NOTE(review): these flags use GCC/Clang syntax - confirm the supported toolchains.
target_compile_options(computeWeightsBackend PRIVATE -fPIC -Ofast)

# Link libraries (add more if needed, e.g., -lpthread)
target_link_libraries(computeWeightsBackend PRIVATE)
--------------------------------------------------------------------------------
/rsalor/weights/__init__.py:
--------------------------------------------------------------------------------
1 | from rsalor.weights.compute_weights import compute_weights, read_weights, write_weights
--------------------------------------------------------------------------------
/rsalor/weights/computeWeightsBackend.cpp:
--------------------------------------------------------------------------------
1 | #include "include/msa.h"
2 |
3 | extern "C" float* computeWeightsBackend(
4 | const char* msa_path,
5 | unsigned int const msa_len,
6 | unsigned int const msa_depth,
7 | float seqid,
8 | bool count_target_sequence,
9 | unsigned int num_threads,
10 | bool verbose
11 | )
12 | {
13 |
14 | // Init MSA
15 | MSA msa(
16 | msa_path,
17 | msa_len,
18 | msa_depth,
19 | seqid,
20 | count_target_sequence,
21 | num_threads,
22 | verbose
23 | );
24 |
25 | // Check depth consistency
26 | unsigned int observed_msa_depth = msa.getDepth();
27 | if(observed_msa_depth != msa_depth) {
28 | std::cerr << "ERROR in computeWeights() (C++ backend): input msa_depth do not match to computed msa depth." << std::endl;
29 | std::cerr << " * msa_path: " << msa_path << std::endl;
30 | std::cerr << " * input msa_depth: " << msa_depth << std::endl;
31 | std::cerr << " * observed msa_depth: " << observed_msa_depth << std::endl;
32 | throw std::runtime_error("Invalid msa_depth argument");
33 | }
34 |
35 | // Allocate memory to the weights pointer because it will be passed to python
36 | float* weight_ptr = (float*)malloc(msa_depth*sizeof(float));
37 | auto weights_ptr_local = msa.getWeightsPointer();
38 | for(int i = 0; i < msa_depth; i++) { // Copy content from local
39 | weight_ptr[i]= weights_ptr_local[i];
40 | }
41 | return weight_ptr;
42 |
43 | }
44 |
// Release a weights buffer returned by computeWeightsBackend().
// That buffer is allocated with malloc, so it must be released with free():
// releasing it with delete[] is undefined behavior. A null pointer is ignored.
extern "C" void freeWeights(void* weights_ptr) {
    float* weights_ptr_casted = static_cast<float*>(weights_ptr);
    if(weights_ptr_casted != nullptr) {
        free(weights_ptr_casted);
    }
}
--------------------------------------------------------------------------------
/rsalor/weights/compute_weights.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | import os.path
4 | from typing import List
5 | import glob
6 | import numpy as np
7 | import ctypes
8 |
9 |
10 | # Main -------------------------------------------------------------------------
def compute_weights(
        msa_path: str,
        msa_len: int,
        msa_depth: int,
        seqid: float=0.80,
        count_target_sequence: bool=True,
        num_threads: int=1,
        verboses: bool=False,
    ) -> np.ndarray:
    """Compute weights for all sequences of an MSA.
    Use C++ backend for time-performance. Implementation inspired from python package 'pycofitness'.

    Arguments:
        msa_path (str): path to msa '.fasta' file
        msa_len (int): length of the MSA (length of target sequence)
        msa_depth (int): depth of the MSA (number of sequences in the MSA)
        seqid (float): sequence identity threshold to consider two sequences as similar (default=0.80)
        count_target_sequence (bool): count target sequence in weights computations
        num_threads (int): number of threads (CPUs) used by C++ backend (default=1)
        verboses (bool): set True to log steps of execution (default=False)

    Return:
        weights (np.ndarray of float32, length msa_depth)

    Raises:
        AssertionError: on invalid arguments.
        ValueError: if the compiled backend library is not found or if it
            returns a NULL weights pointer.
    """

    # Guardians
    assert msa_path.endswith(".fasta"), f"ERROR in compute_weights(): msa_path='{msa_path}' should end with '.fasta'."
    assert os.path.exists(msa_path), f"ERROR in compute_weights('{msa_path}'): msa_path='{msa_path}' files does not exist."
    assert 0.0 < seqid < 1.0, f"ERROR in compute_weights('{msa_path}'): seqid={seqid} (for clustering to compute weights) should be in [0, 1] excluded."
    assert num_threads > 0, f"ERROR in compute_weights('{msa_path}'): num_threads={num_threads} should be stricktly positive."

    # Find C++ computeWeightsBackend compiled executable file
    path_prefix = os.path.join(os.path.dirname(__file__), "lib_computeWeightsBackend*")
    backend_so_paths = glob.glob(path_prefix)
    if not backend_so_paths:
        error_log = "ERROR in compute_weights(): C++ computeWeightsBackend '.so' library path not found.\n"
        error_log += f" * Unable to find C++ computeWeightsBackend '.so' library path in '{path_prefix}'\n"
        error_log += " * Please install the pip package or compile the C++ code."
        raise ValueError(error_log)
    BACKEND_SO_PATH = backend_so_paths[0]

    # Init C++ bridge
    computeWeightsBackend = ctypes.CDLL(BACKEND_SO_PATH)
    computeWeightsFunction = computeWeightsBackend.computeWeightsBackend
    computeWeightsFunction.argtypes = (
        ctypes.c_char_p, # msa_path
        ctypes.c_uint,   # msa_len
        ctypes.c_uint,   # msa_depth
        ctypes.c_float,  # seqid
        ctypes.c_bool,   # count_target_sequence
        ctypes.c_uint,   # num_threads
        ctypes.c_bool,   # verboses
    )
    computeWeightsFunction.restype = ctypes.POINTER(ctypes.c_float * msa_depth)
    freeWeights = computeWeightsBackend.freeWeights
    # Declare the argument explicitly so the pointer is never passed with the
    # default (int) conversion.
    freeWeights.argtypes = (ctypes.c_void_p,)
    freeWeights.restype = None

    # Run backend
    weights_ptr = computeWeightsFunction(
        msa_path.encode('utf-8'),
        msa_len,
        msa_depth,
        seqid,
        count_target_sequence,
        num_threads,
        verboses,
    )
    if not weights_ptr:
        raise ValueError(f"ERROR in compute_weights('{msa_path}'): C++ backend returned a NULL weights pointer.")

    # Copy to a numpy array, then free the backend buffer even if the copy fails
    try:
        weights = np.fromiter(weights_ptr.contents, dtype=np.float32, count=msa_depth)
    finally:
        freeWeights(ctypes.cast(weights_ptr, ctypes.c_void_p))

    # Return
    return weights
92 |
93 |
def write_weights(weights: List[float], weights_path: str) -> None:
    """Write weights list to a file (one weight per line, no trailing newline).

    Raises AssertionError if the destination directory does not exist or
    if weights is empty.
    """

    # Guardians
    # abspath() makes a bare file name resolve to the current directory;
    # its dirname would otherwise be '' and fail the isdir() check.
    weights_dir = os.path.dirname(os.path.abspath(weights_path))
    assert os.path.isdir(weights_dir), f"ERROR in write_weights(): directory of weights_path='{weights_path}' does not exist."
    assert len(weights) > 0, "ERROR in write_weights(): weigths list can not be of length zero."

    # Write
    weights_str = "\n".join(str(w) for w in weights)
    with open(weights_path, "w") as fs:
        fs.write(weights_str)
105 |
106 |
def read_weights(weights_path: str) -> List[float]:
    """Read weights list from a file (one weight per line).

    Blank lines and lines starting with '#' are skipped.
    Raises AssertionError if the file does not exist, and ValueError if a
    line can not be parsed as a float or if no weight is found.
    """

    # Guardians (for input)
    assert os.path.isfile(weights_path), f"ERROR in read_weights(): weights_path='{weights_path}' file does not exist."

    # Read from file
    with open(weights_path, "r") as fs:
        lines = fs.readlines()

    # Parse
    weights: List[float] = []
    for i, line in enumerate(lines):
        # Strip first: a length test on the raw line would drop a
        # one-character weight on a last line without trailing newline.
        content = line.strip()
        if not content or content.startswith("#"):
            continue
        try:
            weights.append(float(content))
        except ValueError as error:
            error_log = f"ERROR in read_weights(): failed to parse line {i+1} / {len(lines)} as a float."
            error_log += f"\n * weights_path='{weights_path}'"
            error_log += f"\n * line='{content}'"
            raise ValueError(error_log) from error
    if len(weights) == 0:
        raise ValueError(f"ERROR in read_weights(): no parsable weights line found in weights_path='{weights_path}'.")
    return weights
--------------------------------------------------------------------------------
/rsalor/weights/include/msa.h:
--------------------------------------------------------------------------------
1 | #ifndef MSA_H
2 | #define MSA_H
3 |
4 | #include
5 | #include
6 | #include
7 | #include
8 | #include
9 | #include
10 | #include
11 | #include // For uint8_t type
12 | #include // Standard C++ multi-threading
13 | //#include // To time code execution
14 |
// MSA: holds the parameters of a multiple sequence alignment together with
// its sequences (in integer form) and the per-sequence weights.
// NOTE(review): the template arguments of the std::vector declarations below
// (and the #include targets above) are missing in this copy of the file -
// restore them from the original source before compiling.
class MSA {
protected:
    const char* msa_path;           // path to the MSA file
    unsigned int msa_len;           // length of the MSA
    unsigned int msa_depth;         // depth of the MSA
    float seqid;                    // sequence identity threshold
    bool count_target_sequence;
    unsigned int num_threads;
    bool verbose;                   // print progress messages when true
    std::vector> seqs_int_form;     // sequences in integer form, filled from readSequences()
    std::vector weights;            // weights, filled from computeWeights()

public:

    // Constructor
    MSA(
        const char* msa_path,
        unsigned int msa_len,
        unsigned int msa_depth,
        float seqid,
        bool count_target_sequence,
        unsigned int num_threads,
        bool verbose
    );

    // Methods
    std::vector> readSequences();
    std::vector computeWeights();
    void countClustersInRange(
        const std::vector& range_indices,
        std::vector& thread_counts,
        const unsigned int start_loop
    );

    // Getters
    float* getWeightsPointer();
    unsigned int getDepth();
    unsigned int getLength();
    float getNeff();

};
56 | #endif
--------------------------------------------------------------------------------
/rsalor/weights/msa.cpp:
--------------------------------------------------------------------------------
1 |
2 | // Header ----------------------------------------------------------------------
3 | #include "include/msa.h"
4 |
5 | // MSA: Constructor ------------------------------------------------------------
6 | MSA::MSA(
7 | const char* m_msa_path,
8 | unsigned int const m_msa_len,
9 | unsigned int const m_msa_depth,
10 | float m_seqid,
11 | bool m_count_target_sequence,
12 | unsigned int m_num_threads,
13 | bool m_verbose
14 | ):
15 | msa_path(m_msa_path),
16 | msa_len(m_msa_len),
17 | msa_depth(m_msa_depth),
18 | seqid(m_seqid),
19 | count_target_sequence(m_count_target_sequence),
20 | num_threads(m_num_threads),
21 | verbose(m_verbose)
22 | {
23 | // Read MSA
24 | if(this->verbose) {
25 | std::cout << " - RSALOR (C++ backend): read sequences from file." << std::endl;
26 | }
27 | this->seqs_int_form = readSequences();
28 |
29 | // Compute weights
30 | if(this->verbose) {
31 | std::cout << " - RSALOR (C++ backend): compute sequences weights." << std::endl;
32 | }
33 | this->weights = this->computeWeights();
34 | }
35 |
36 | // Parse MSA sequences from file ----------------------------------------------
37 | std::vector> MSA::readSequences()
38 | {
39 |
40 | // Init residues mapping to int
41 | std::unordered_map res_mapping;
42 | res_mapping['A'] = 0; res_mapping['C'] = 1; res_mapping['D'] = 2;
43 | res_mapping['E'] = 3; res_mapping['F'] = 4; res_mapping['G'] = 5;
44 | res_mapping['H'] = 6; res_mapping['I'] = 7; res_mapping['K'] = 8;
45 | res_mapping['L'] = 9; res_mapping['M'] = 10; res_mapping['N'] = 11;
46 | res_mapping['P'] = 12; res_mapping['Q'] = 13; res_mapping['R'] = 14;
47 | res_mapping['S'] = 15; res_mapping['T'] = 16; res_mapping['V'] = 17;
48 | res_mapping['W'] = 18; res_mapping['Y'] = 19; res_mapping['-'] = 20;
49 | res_mapping['.'] = 20; res_mapping['~'] = 20; res_mapping['B'] = 20;
50 | res_mapping['J'] = 20; res_mapping['O'] = 20; res_mapping['U'] = 20;
51 | res_mapping['X'] = 20; res_mapping['Z'] = 20;
52 |
53 | // Init
54 | std::vector> seqs_int_form;
55 | std::ifstream msa_file_stream(this->msa_path);
56 | std::string current_line;
57 |
58 | // Check file streaming
59 | if(msa_file_stream.fail()){
60 | std::cerr << "ERROR in MSA (C++ backend): Unable to open file." << this->msa_path << std::endl;
61 | throw std::runtime_error("Unable to open file containing the MSA data\n");
62 | }
63 |
64 | // Loop on lines of the file
65 | while(std::getline(msa_file_stream, current_line)){
66 | if(!current_line.empty() && current_line[0] != '>') { // Skip header and empty lines
67 | std::vector current_seq_int;
68 | current_seq_int.reserve(this->msa_len); // optimize by putting the vector in the correct size which is known
69 | for (char c : current_line) {
70 | current_seq_int.push_back(res_mapping.at(toupper(c)));
71 | }
72 | seqs_int_form.push_back(current_seq_int);
73 | }
74 | }
75 |
76 | // Return
77 | return seqs_int_form;
78 | }
79 |
80 | // Assign weights for all sequences based on clusters --------------------------
81 |
82 | // Compute sequences weight
83 | std::vector MSA::computeWeights(){
84 |
85 | // Init counts (all threads)
86 | std::vector counts(this->msa_depth, 1);
87 |
88 | // Count or ignore first sequence for weights computations by starting loop at 0 or 1
89 | unsigned int start_loop = this->count_target_sequence ? 0 : 1;
90 |
91 | // Initialize the per-thread counts vectors
92 | std::vector> thread_counts(
93 | num_threads, std::vector(this->msa_depth, 0)
94 | );
95 |
96 | // Separate indices in chunks for each thread
97 | // * Trick: Since we only loop on half (i, j)-matrix (j < i), first i iterations will stop much earlier than last,
98 | // so we distribute i indices evenly across threads, so they all terminate approximatively at the same time
99 | std::vector> threads_indices(num_threads);
100 | for (unsigned int i = start_loop; i < this->msa_depth; ++i) {
101 | unsigned int thread_id = i % num_threads;
102 | threads_indices[thread_id].push_back(i);
103 | }
104 |
105 | // Manage multi-threading
106 | std::vector threads;
107 | for (unsigned int t = 0; t < num_threads; ++t) {
108 | threads.emplace_back( // ok here some magic
109 | [this, &threads_indices, &thread_counts, t, start_loop]() {
110 | countClustersInRange(threads_indices[t], thread_counts[t], start_loop); // compute cluster by chunks
111 | });
112 | }
113 | for (auto& thread : threads) {
114 | thread.join();
115 | }
116 |
117 | // Merge thread counts into global counts
118 | for (const auto& thread_count : thread_counts) {
119 | for (unsigned int i = 0; i < this->msa_depth; ++i) {
120 | counts[i] += thread_count[i];
121 | }
122 | }
123 |
124 | // Convert counts to weights
125 | std::vector weights(this->msa_depth);
126 | for(unsigned int i = 0; i < this->msa_depth; ++i){
127 | weights[i] = 1.f/ static_cast(counts[i]);
128 | }
129 |
130 | // Remove first sequences weight (that was initally assigned to 1.0)
131 | if(!this->count_target_sequence) {
132 | weights[0] = 0.f;
133 | }
134 |
135 | // Return
136 | return weights;
137 | }
138 |
139 | void MSA::countClustersInRange(
140 | const std::vector& range_indices,
141 | std::vector& range_counts,
142 | const unsigned int start_loop
143 | )
144 | {
145 | // Init
146 | unsigned int num_identical_residues;
147 | unsigned int identical_residues_thr = static_cast(this->seqid * this->msa_len);
148 |
149 | // Loop on range
150 | for (auto i : range_indices) {
151 | const auto& seq_i = this->seqs_int_form[i];
152 | // Loop on other sequences j < i (half matrix because (i, i)=(j, i))
153 | for (unsigned int j = start_loop; j < i; ++j) {
154 | const auto& seq_j = this->seqs_int_form[j];
155 |
156 | // Compute seqid(i, j)
157 | num_identical_residues = 0;
158 | for (unsigned int site = 0; site < this->msa_len; ++site) {
159 | num_identical_residues += seq_i[site] == seq_j[site];
160 | }
161 |
162 | // Update if (i, j) in same cluster
163 | if (num_identical_residues > identical_residues_thr) {
164 | ++range_counts[i];
165 | ++range_counts[j];
166 | }
167 | }
168 | }
169 | }
170 |
171 | // Getter ----------------------------------------------------------------------
172 | float* MSA::getWeightsPointer() {
173 | return weights.data();
174 | }
175 |
176 | // Getters
177 | unsigned int MSA::getDepth() {
178 | return this->msa_depth;
179 | }
180 |
181 | unsigned int MSA::getLength() {
182 | return this->msa_len;
183 | }
184 |
185 | float MSA::getNeff() {
186 | return std::accumulate(this->weights.begin(), this->weights.end(), 0.f);
187 | }
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 |
2 | # Imports ----------------------------------------------------------------------
3 | from setuptools import setup, find_packages, Extension
4 | #from setuptools.command.build_ext import build_ext
5 |
6 |
7 | # Extensions -------------------------------------------------------------------
# C++ backend that has to be compiled when the package is built
_backend_sources = [  # .cpp files
    'rsalor/weights/computeWeightsBackend.cpp',
    'rsalor/weights/msa.cpp',
]
_backend_include_dirs = [  # directories containing the .h files
    'rsalor/weights/include',
]
compute_weights_ext = Extension(
    name='rsalor.weights.lib_computeWeightsBackend',
    sources=_backend_sources,
    include_dirs=_backend_include_dirs,
    language='c++',
    extra_compile_args=['-std=c++11', '-O3'],  # language standard and optimization level
    extra_link_args=['-O3'],
)
22 |
23 |
24 | # Setup ------------------------------------------------------------------------
# Read the long description with an explicit encoding and a context manager:
# a bare open().read() never closes the file and decodes with the platform
# default encoding, which can fail on systems where it is not UTF-8.
with open("README.md", encoding="utf-8") as readme_file:
    readme_text = readme_file.read()

setup(
    name="rsalor",
    version="1.1.1",
    author="Matsvei Tsishyn",
    author_email="matsvei.tsishyn@protonmail.com",
    description="Combines structural data (Relative Solvent Accessibility, RSA) and evolutionary data (Log Odd Ratio, LOR from MSA) to evaluate missense mutations in proteins.",
    long_description=readme_text,
    long_description_content_type="text/markdown",
    url="https://github.com/3BioCompBio/RSALOR",
    python_requires=">=3.9",
    packages=find_packages(),
    install_requires=[
        #'llvmlite>0.30.0',
        'numpy',
        'biopython>=1.75',
    ],
    # Compiled C++ backend used to compute the sequence weights
    ext_modules=[compute_weights_ext],
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: C++",
        "Programming Language :: C",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Topic :: Scientific/Engineering :: Bio-Informatics",
    ],
    #entry_points={
    #    "console_scripts":[
    #        "rsalor=rsalor.main:run_mutation",
    #    ],
    #},
)
--------------------------------------------------------------------------------