├── .gitignore ├── LICENSE ├── README.md ├── environment.yml ├── setup.cfg ├── setup.py ├── src └── ab_characterisation │ ├── __init__.py │ ├── cli.py │ ├── developability_tools │ ├── __init__.py │ ├── sequence_liabilities │ │ ├── __init__.py │ │ ├── definitions.py │ │ ├── main.py │ │ ├── outputs.py │ │ ├── scanner_classes.py │ │ └── scanners.py │ ├── sequence_properties │ │ ├── calculations.py │ │ ├── main.py │ │ └── outputs.py │ ├── tap │ │ ├── __init__.py │ │ ├── definitions.py │ │ ├── main.py │ │ ├── metrics │ │ │ ├── __init__.py │ │ │ ├── base_calculator.py │ │ │ ├── hydrophobic_patches.py │ │ │ ├── negative_patches.py │ │ │ ├── positive_patches.py │ │ │ ├── sfvcsp.py │ │ │ └── total_cdr_length.py │ │ ├── outputs.py │ │ ├── psa_executables │ │ │ ├── psa │ │ │ └── psa_mac │ │ └── structure_annotation.py │ └── utils │ │ ├── input_handling.py │ │ └── outputs.py │ ├── filter_steps.py │ ├── pipeline_orchestration.py │ ├── rosetta_steps.py │ ├── sequence_steps.py │ ├── structure_steps.py │ └── utils │ ├── __init__.py │ ├── anarci_region_definition_utils.py │ ├── anarci_utils.py │ ├── chimerax_utils.py │ ├── data_classes.py │ ├── rosetta_templates │ ├── rosetta_metrics_ab_only.sh │ ├── rosetta_metrics_ab_only.xml │ ├── rosetta_metrics_complex.sh │ └── rosetta_metrics_complex.xml │ └── rosetta_utils.py └── tests ├── data ├── test_complex_reference.pdb └── test_pipeline.csv └── integration ├── chimera_test_script.py └── test_pipeline.py /.gitignore: -------------------------------------------------------------------------------- 1 | # These are some examples of commonly ignored file patterns. 2 | # You should customize this list as applicable to your project. 3 | # Learn more about .gitignore: 4 | # https://www.atlassian.com/git/tutorials/saving-changes/gitignore 5 | 6 | # Node artifact files 7 | node_modules/ 8 | dist/ 9 | 10 | # Compiled Java class files 11 | *.class 12 | 13 | # Compiled Python bytecode 14 | *.py[cod] 15 | 16 | # Log files 17 | *.log 18 | 19 | # Package files 20 | *.jar 21 | 22 | # Maven 23 | target/ 24 | dist/ 25 | 26 | # JetBrains IDE 27 | .idea/ 28 | 29 | # Unit test reports 30 | TEST*.xml 31 | 32 | # Generated by MacOS 33 | .DS_Store 34 | 35 | # Generated by Windows 36 | Thumbs.db 37 | 38 | # Applications 39 | *.app 40 | *.exe 41 | *.war 42 | 43 | # Large media files 44 | *.mp4 45 | *.tiff 46 | *.avi 47 | *.flv 48 | *.mov 49 | *.wmv 50 | 51 | 52 | .cache/ 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Exscientia 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Antibody characterisation pipeline 2 | 3 | ## Overview 4 | This repository contains code to run the antibody characterisation pipeline described in "Computational design of 5 | developable therapeutic antibodies: efficient traversal of binder landscapes and rescue of escape mutations" 6 | (see citation below). 7 | 8 | ## Installation 9 | ### Environment setup 10 | Use the `environment.yml` file to create a conda environment 11 | ```shell 12 | conda env create -f environment.yml -n ab-characterisation 13 | conda activate ab-characterisation 14 | ``` 15 | ### Dependencies 16 | Ensure that you have working installs of the following: 17 | 18 | 1) Rosetta: https://www.rosettacommons.org/demos/latest/tutorials/install_build/install_build 19 | 20 | 2) ChimeraX: https://www.cgl.ucsf.edu/chimerax/download.html 21 | 22 | a) ensure that you can run the basic ChimeraX script in tests/data: 23 | ``` 24 | ChimeraX --script tests/data/chimera_test_script.py --nogui 25 | ``` 26 | b) set environment variable `DEBIAN_FRONTEND="noninteractive"` 27 | 28 | 3) ANARCI: https://github.com/oxpig/ANARCI. Note: On MacOS machines, install the hmmer dependency via brew, otherwise via conda. 29 | 30 | 4) Ensure you have the correct licences for all linked software. 31 | 32 | ## Testing your installation 33 | You can test the installation of the environment using `pytest`. 34 | For this, first set the Rosetta base directory as an environment variable, for example like this: 35 | ```shell 36 | export ROSETTA_BASE=/path/to/rosetta/rosetta.binary.linux.release-315 37 | ``` 38 | Then run pytest 39 | ```shell 40 | pytest 41 | ``` 42 | Which will run an end-to-end example run of the pipeline on a set of 4 antibody sequences (note that depending on your 43 | setup this may take 1h). 44 | 45 | ## Running the pipeline 46 | With the conda environment active, the pipeline can be run as follows: 47 | ```shell 48 | ab-characterisation --input-file tests/data/test_pipeline.csv --rosetta-base-dir $ROSETTA_BASE 49 | ``` 50 | (assuming ROSETTA_BASE to have been set as described above). 51 | 52 | If you want to multiprocess the pipeline, instead run as 53 | ```shell 54 | mpiexec -n N_PROCESSES ab-characterisation --input-file tests/data/test_pipeline.csv --rosetta-base-dir $ROSETTA_BASE 55 | ``` 56 | 57 | ``` 58 | Usage: ab-characterisation [OPTIONS] 59 | 60 | ╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────╮ 61 | │ * --input-file TEXT Input .csv file, containing sequence_name, heavy_sequence, light_sequence and │ 62 | │ reference_complex columns. [required] │ 63 | │ --chimera-resolution FLOAT Resolution of the map used for alignment within ChimeraX. [default: 6.0] │ 64 | │ --output-dir TEXT Directory to which output files are written. │ 65 | │ [default: ./ab_characterisation_output] │ 66 | │ --rosetta-replicates INTEGER How many replicates to run for Rosetta characterisation steps. [default: 1] │ 67 | │ * --rosetta-base-dir TEXT Base directory for the Roestta software suite, │ 68 | │ e.g. 
/path/to/rosetta/rosetta.binary.linux.release-315 [required] │ 69 | │ --top-n INTEGER Top N candidate antibodies to provide from the provided .csv file of │ 70 | │ antibodies [default: 10] │ 71 | │ --help Show this message and exit. │ 72 | ╰──────────────────────────────────────────────── 73 | ``` 74 | ## Acknowledgements 75 | The antibody characterisation pipeline was developed by researchers and engineers at Exscientia: 76 | 77 | - Frederic Dreyer 78 | 79 | - Constantin Schneider 80 | 81 | - Aleksandr Kovaltsuk 82 | 83 | - Daniel Cutting 84 | 85 | - Matthew J. Byrne 86 | 87 | - Daniel A. Nissley 88 | 89 | - Newton Wahome 90 | 91 | - Henry Kenlay 92 | 93 | - Claire Marks 94 | 95 | - David Errington 96 | 97 | - Richard J. Gildea 98 | 99 | - David Damerell 100 | 101 | - Pedro Tizei 102 | 103 | - Wilawan Bunjobpol 104 | 105 | - Sachin Surade 106 | 107 | - Douglas E. V. Pires 108 | 109 | - Charlotte M. Deane 110 | 111 | ## Citation 112 | If you use this code in your research, please cite the following paper: 113 | 114 | ``` 115 | @article{Computational_design_of_developable_therapeutic_antibodies, 116 | author = {Dreyer, Fr{\'e}d{\'e}ric A. and Schneider, Constantin and Kovaltsuk, Aleksandr and Cutting, Daniel and Byrne, Matthew J. and Nissley, Daniel A. and Wahome, Newton and Kenlay, Henry and Marks, Claire and Errington, David and Gildea, Richard J. and Damerell, David and Tizei, Pedro and Bunjobpol, Wilawan and Darby, John F. and Drulyte, Ieva and Hurdiss, Daniel L. and Surade, Sachin and Pires, Douglas E. V. and Deane, Charlotte M.}, 117 | title = {Computational design of developable therapeutic antibodies: efficient traversal of binder landscapes and rescue of escape mutations}, 118 | year = {2024}, 119 | doi = {10.1101/2024.10.03.616038}, 120 | eprint = {https://www.biorxiv.org/content/early/2024/10/04/2024.10.03.616038.full.pdf}, 121 | journal = {bioRxiv} 122 | } 123 | ``` 124 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | channels: 2 | - conda-forge 3 | - bioconda 4 | - pytorch 5 | dependencies: 6 | - python=3.10 7 | - pip 8 | - typer 9 | - pandas 10 | - loguru 11 | - biopython 12 | - openmm 13 | - pdbfixer 14 | - pytorch=2.1.0 15 | - mpi4py 16 | - pytest 17 | - pip: 18 | - Immunebuilder 19 | - -e . 
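# Note: Rosetta, ChimeraX and ANARCI are required by the pipeline but are not
# installed by this environment; see the "Dependencies" section of the README.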
20 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = ab-characterisation 3 | description = AB-characterisation 4 | version = 1.0.0 5 | long_description = file: README.md 6 | long_description_content_type = text/markdown 7 | classifiers = 8 | Intended Audience :: Science/Research 9 | Natural Language :: English 10 | Operating System :: OS Independent 11 | Programming Language :: Python 12 | Programming Language :: Python :: 3 :: Only 13 | Programming Language :: Python :: 3.10 14 | Typing :: Typed 15 | 16 | [options] 17 | packages = find_namespace: 18 | package_dir = 19 | =src 20 | python_requires = >= 3.10 21 | include_package_data = True 22 | install_requires = 23 | ImmuneBuilder 24 | 25 | [options.packages.find] 26 | where = src 27 | [options.entry_points] 28 | console_scripts = 29 | ab-characterisation = ab_characterisation.cli:app 30 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | """Legacy file required for editable installs on pip<21.3.1""" 2 | from setuptools import setup 3 | 4 | setup() 5 | -------------------------------------------------------------------------------- /src/ab_characterisation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/__init__.py -------------------------------------------------------------------------------- /src/ab_characterisation/cli.py: -------------------------------------------------------------------------------- 1 | from mpi4py import MPI 2 | from pathlib import Path 3 | import typer 4 | 5 | from ab_characterisation.pipeline_orchestration import RunConfig, pipeline 6 | 7 | app = typer.Typer( 8 | name="ab-characterisation-pipeline", 9 | add_completion=False, 10 | ) 11 | 12 | 13 | @app.command() 14 | def run_pipeline( 15 | input_file: str = typer.Option(..., help='Input .csv file, containing sequence_name, heavy_sequence, light_sequence ' 16 | 'and reference_complex columns.'), 17 | chimera_resolution: float = typer.Option(6.0, help='Resolution of the map used for alignment within ChimeraX.'), 18 | output_dir: str = typer.Option("./ab_characterisation_output", help='Directory to which output files are written.'), 19 | rosetta_replicates: int = typer.Option(1, help='How many replicates to run for Rosetta characterisation steps.'), 20 | rosetta_base_dir: str = typer.Option(..., help='Base directory for the Roestta software suite, e.g. 
' 21 | '/path/to/rosetta/rosetta.binary.linux.release-315'), 22 | top_n: int = typer.Option(10, help='Top N candidate antibodies to provide from the provided .csv file of antibodies'), 23 | no_complex_analysis: bool = typer.Option(False, help='If provided, the pipeline does not perform antibody-antigen ' 24 | 'complex generation and analysis.') 25 | ): 26 | output_dir = Path(output_dir) 27 | config = RunConfig( 28 | chimera_map_resolution=chimera_resolution, 29 | input_file=input_file, 30 | output_directory=output_dir, 31 | rosetta_base_directory=rosetta_base_dir, 32 | top_n=top_n, 33 | rosetta_replicates=rosetta_replicates, 34 | exclude_complex_analysis=no_complex_analysis, 35 | ) 36 | comm = MPI.COMM_WORLD 37 | rank = comm.Get_rank() 38 | size = comm.Get_size() 39 | pipeline(config, mpi_rank=rank, mpi_size=size) 40 | 41 | 42 | if __name__ == "__main__": 43 | app() 44 | 45 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/__init__.py -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_liabilities/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/sequence_liabilities/__init__.py -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_liabilities/definitions.py: -------------------------------------------------------------------------------- 1 | custom_regions = { 2 | "verniers": { 3 | "H": [ 4 | (2, " "), 5 | (28, " "), 6 | (29, " "), 7 | (54, " "), 8 | (55, " "), 9 | (78, " "), 10 | (88, " "), 11 | (105, " "), 12 | (106, " "), 13 | (118, " "), 14 | ], 15 | "L": [ 16 | (4, " "), 17 | (27, " "), 18 | (28, " "), 19 | (29, " "), 20 | (30, " "), 21 | (31, " "), 22 | (32, " "), 23 | (33, " "), 24 | (34, " "), 25 | (35, " "), 26 | (36, " "), 27 | (41, " "), 28 | (42, " "), 29 | (52, " "), 30 | (53, " "), 31 | (55, " "), 32 | (84, " "), 33 | (94, " "), 34 | (118, " "), 35 | ], 36 | }, 37 | } 38 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_liabilities/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | 6 | from ab_characterisation.developability_tools.utils.input_handling import get_numbering 7 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import SequenceLiability 8 | from ab_characterisation.developability_tools.sequence_liabilities.scanners import (asparagine_deamidation_scanner, 9 | aspartic_acid_isomeration_scanner, 10 | cd11c_cd18_binding_scanner, fragmentation_scanner, 11 | integrin_binding_scanner, lysine_glycation_scanner, 12 | methionine_oxidation_scanner, 13 | n_linked_glycosylation_scanner, 14 | n_terminal_glutamate_scanner, 15 | tryptophan_oxidation_scanner, unpaired_cysteine_scanner) 16 | 17 | logger.remove() 18 | logger.add(sys.stderr, format="{message}") 19 | 20 
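# Liability scanners applied by scan_single(), in the order in which they are run.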
| 21 | scanner_list = [ 22 | unpaired_cysteine_scanner, 23 | n_linked_glycosylation_scanner, 24 | methionine_oxidation_scanner, 25 | tryptophan_oxidation_scanner, 26 | asparagine_deamidation_scanner, 27 | aspartic_acid_isomeration_scanner, 28 | lysine_glycation_scanner, 29 | integrin_binding_scanner, 30 | cd11c_cd18_binding_scanner, 31 | fragmentation_scanner, 32 | n_terminal_glutamate_scanner, 33 | ] 34 | 35 | 36 | def scan_single( 37 | heavy_sequence: Optional[str], light_sequence: Optional[str], quiet: bool = False 38 | ) -> list[SequenceLiability]: 39 | """ 40 | Scans the sequence of an antibody for potential liabilities. 41 | 42 | Args: 43 | heavy_sequence: the amino acid sequence of the antibody heavy chain 44 | light_sequence: the amino acid sequence of the antibody light chain 45 | 46 | Returns: 47 | a list of identified sequence liabilities. 48 | """ 49 | 50 | numbering_dict = {} 51 | if heavy_sequence: 52 | numbering_dict["H"] = get_numbering(heavy_sequence, "H") 53 | if light_sequence: 54 | numbering_dict["L"] = get_numbering(light_sequence, "L") 55 | 56 | liabilities = [] 57 | for scanner in scanner_list: 58 | liabilities += scanner.scan(numbering_dict, quiet=quiet) 59 | 60 | return liabilities 61 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_liabilities/outputs.py: -------------------------------------------------------------------------------- 1 | from loguru import logger 2 | 3 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import \ 4 | SequenceLiability 5 | from ab_characterisation.developability_tools.utils.outputs import write_file 6 | 7 | 8 | def display_results(liabilities: list[SequenceLiability]) -> None: 9 | """ 10 | Nicely prints the identified liabilities to the terminal. 11 | 12 | Args: 13 | liabilities: the list of identified sequence liabilities. 14 | """ 15 | color = "red" if liabilities else "green" 16 | logger.opt(colors=True).info( 17 | f"\n<{color}>{len(liabilities)} liabilities were found" 18 | ) 19 | for lia in liabilities: 20 | logger.opt(colors=True).info( 21 | f"{lia.liability_type} - residue motif {lia.motif}, position(s) {lia.positions_string}" 22 | ) 23 | return 24 | 25 | 26 | def write_liabilities_to_csv( 27 | liabilities: list[SequenceLiability], filepath: str 28 | ) -> None: 29 | """ 30 | Writes a list of identified sequence liabilities to a file in .csv format. 31 | 32 | Args: 33 | liabilities: the list of identified sequence liabilities 34 | filepath: the path to the output file. Can be an S3 path. 
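Note: in this repository the bundled write_file helper writes to the local filesystem, creating parent directories as needed.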
35 | """ 36 | outstr = "Liability,Motif,Positions\n" 37 | for liability in liabilities: 38 | outstr += f"{liability.liability_type},{liability.motif},{liability.positions_string}\n" 39 | 40 | write_file(outstr, filepath) 41 | 42 | return 43 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_liabilities/scanner_classes.py: -------------------------------------------------------------------------------- 1 | import re 2 | from abc import ABC, abstractmethod 3 | from dataclasses import dataclass 4 | from typing import List, Optional 5 | 6 | from loguru import logger 7 | 8 | from ab_characterisation.developability_tools.sequence_liabilities.definitions import \ 9 | custom_regions 10 | from ab_characterisation.utils.anarci_utils import Accept 11 | 12 | 13 | @dataclass 14 | class Position: 15 | chain: str 16 | number: int 17 | ins_code: str 18 | 19 | def to_string(self) -> str: 20 | if self.ins_code != " ": 21 | return f"{self.chain}{self.number}{self.ins_code}" 22 | return f"{self.chain}{self.number}" 23 | 24 | 25 | @dataclass 26 | class SequenceLiability: 27 | liability_type: str 28 | motif: str 29 | positions: list[Position] 30 | 31 | @property 32 | def positions_string(self) -> str: 33 | return "-".join([pos.to_string() for pos in self.positions]) 34 | 35 | 36 | @dataclass 37 | class BaseScannerDataclassMixin: 38 | name: str 39 | description: str 40 | 41 | 42 | class BaseScanner(ABC, BaseScannerDataclassMixin): 43 | name: str 44 | description: str 45 | 46 | @abstractmethod 47 | def scan( 48 | self, 49 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]], 50 | quiet: bool = False, 51 | ) -> List[SequenceLiability]: 52 | """ 53 | Scans an input sequence for liabilities. 54 | 55 | Args: 56 | numbering_dict: a dictionary of ANARCI numberings 57 | e.g. {"H": [((1, ' '), 'E'), ((2, ' '), 'L'), ((3, ' '), 'K'), ...], 58 | "L": [((1, ' '), 'D'), ((2, ' '), 'V'), ((3, ' '), 'L'), ...]} 59 | 60 | Returns: 61 | a list of identified liabilities. 
62 | """ 63 | 64 | 65 | @dataclass 66 | class RegexScanner(BaseScanner): 67 | regions: list[str] 68 | regex_search_string: str 69 | ignored_positions: Optional[list[tuple[int, str]]] = None 70 | 71 | def __post_init__(self) -> None: 72 | self.regex_pattern = re.compile(self.regex_search_string) 73 | 74 | def _get_acceptor(self, chain: str) -> Accept: 75 | acceptor = Accept(numbering_scheme="imgt", definition="imgt") 76 | for region in self.regions: 77 | if region in custom_regions: 78 | acceptor.add_positions(custom_regions[region][chain], chain) 79 | else: 80 | acceptor.add_regions([region]) 81 | return acceptor 82 | 83 | def scan( 84 | self, 85 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]], 86 | quiet: bool = False, 87 | ) -> List[SequenceLiability]: 88 | identified = [] 89 | for chain, numbering in numbering_dict.items(): 90 | acceptor = self._get_acceptor(chain) 91 | 92 | sequence = "".join([res[1] for res in numbering if res[1] != "-"]) 93 | numbers = [res[0] for res in numbering if res[1] != "-"] 94 | 95 | for match in self.regex_pattern.finditer(sequence): 96 | start, end = match.start(), match.end() 97 | identified_positions = numbers[start:end] 98 | 99 | # Check if any of the residues identified should be ignored; skip if so 100 | if self.ignored_positions: 101 | if set(identified_positions).intersection(self.ignored_positions): 102 | continue 103 | 104 | # Check if the first residue of the set identified belongs to a region of interest 105 | if acceptor.accept(identified_positions[0], chain): 106 | identified.append( 107 | SequenceLiability( 108 | liability_type=self.name, 109 | motif=match.group(), 110 | positions=[ 111 | Position(chain=chain, number=pos[0], ins_code=pos[1]) 112 | for pos in identified_positions 113 | ], 114 | ) 115 | ) 116 | 117 | if not quiet: 118 | color = "red" if identified else "green" 119 | logger.opt(colors=True).info( 120 | f"<{color}>{self.name}: identified <{color}>{len(identified)} liabilities" 121 | ) 122 | 123 | return identified 124 | 125 | 126 | class NTerminalGlutamateScanner(BaseScanner): 127 | # This does not look for a consecutive pattern like the other liabilities 128 | # Checks for E residues at the start of each chain instead 129 | def scan( 130 | self, 131 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]], 132 | quiet: bool = False, 133 | ) -> List[SequenceLiability]: 134 | if "H" not in numbering_dict or "L" not in numbering_dict: 135 | if not quiet: 136 | logger.opt(colors=True).warning( 137 | f"{self.name}: both H and L chain sequences are required for this check; skipping" 138 | ) 139 | return [] 140 | 141 | heavy_dict: dict[tuple[int, str], str] = dict(numbering_dict["H"]) 142 | light_dict: dict[tuple[int, str], str] = dict(numbering_dict["L"]) 143 | 144 | identified = [] 145 | if ( 146 | heavy_dict.get((1, " "), None) == "E" 147 | and light_dict.get((1, " "), None) == "E" 148 | ): 149 | identified = [ 150 | SequenceLiability( 151 | liability_type=self.name, 152 | motif="EE", 153 | positions=[ 154 | Position(chain="H", number=1, ins_code=" "), 155 | Position(chain="L", number=1, ins_code=" "), 156 | ], 157 | ) 158 | ] 159 | 160 | if not quiet: 161 | color = "red" if identified else "green" 162 | logger.opt(colors=True).info( 163 | f"<{color}>{self.name}: identified <{color}>{len(identified)} liabilities" 164 | ) 165 | 166 | return identified 167 | -------------------------------------------------------------------------------- 
/src/ab_characterisation/developability_tools/sequence_liabilities/scanners.py: -------------------------------------------------------------------------------- 1 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import ( 2 | NTerminalGlutamateScanner, RegexScanner) 3 | 4 | unpaired_cysteine_scanner = RegexScanner( 5 | name="Unpaired cysteine", 6 | description="Checks for C residues in locations other than positions 23 and 104", 7 | regions=["fv"], 8 | regex_search_string="C", 9 | ignored_positions=[(23, " "), (104, " ")], 10 | ) 11 | 12 | n_linked_glycosylation_scanner = RegexScanner( 13 | name="N-linked glycosylation", 14 | description="Checks for an N residue followed by any residue apart from P, followed by S or T", 15 | regions=["fv"], 16 | regex_search_string="N[^P][ST]", 17 | ) 18 | 19 | methionine_oxidation_scanner = RegexScanner( 20 | name="Methionine oxidation", 21 | description="Checks for M residues in CDRs or Vernier zones", 22 | regions=["cdrs", "verniers"], 23 | regex_search_string="M", 24 | ) 25 | 26 | tryptophan_oxidation_scanner = RegexScanner( 27 | name="Tryptophan oxidation", 28 | description="Checks for W residues in CDRs or Vernier zones", 29 | regions=["cdrs", "verniers"], 30 | regex_search_string="W", 31 | ) 32 | 33 | asparagine_deamidation_scanner = RegexScanner( 34 | name="Asparagine deamidation", 35 | description="Checks for residue pairs NG, NS, or NT in CDRs or Vernier zones", 36 | regions=["cdrs", "verniers"], 37 | regex_search_string="N[GST]", 38 | ) 39 | 40 | aspartic_acid_isomeration_scanner = RegexScanner( 41 | name="Aspartic acid isomeration", 42 | description="Checks for residue pairs DG, DS, DT, DD, or DH in CDRs or Vernier zones", 43 | regions=["cdrs", "verniers"], 44 | regex_search_string="D[GSTDH]", 45 | ) 46 | 47 | lysine_glycation_scanner = RegexScanner( 48 | name="Lysine isomeration", 49 | description="Checks for residue pairs KE, KD, EK, or ED in CDRs or Vernier zones", 50 | regions=["cdrs", "verniers"], 51 | regex_search_string="KE|KD|EK|ED", 52 | ) 53 | 54 | integrin_binding_scanner = RegexScanner( 55 | name="Integrin binding", 56 | description="Checks for residue triplets RGD, RYD, or LDV within the Fv", 57 | regions=["fv"], 58 | regex_search_string="RGD|RYD|LDV", 59 | ) 60 | 61 | cd11c_cd18_binding_scanner = RegexScanner( 62 | name="CD11c/CD18 binding", 63 | description="Checks for residue triple GPR within the Fv", 64 | regions=["fv"], 65 | regex_search_string="GPR", 66 | ) 67 | 68 | fragmentation_scanner = RegexScanner( 69 | name="Fragmentation", 70 | description="Checks for residue pair DP in the CDRs or Vernier zones", 71 | regions=["cdrs", "verniers"], 72 | regex_search_string="DP", 73 | ) 74 | 75 | n_terminal_glutamate_scanner = NTerminalGlutamateScanner( 76 | name="N-terminal glutamate", 77 | description="Checks for glutamate residues at the N-termini of both heavy and light chains", 78 | ) 79 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_properties/calculations.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import numpy as np 4 | from Bio.SeqUtils.ProtParam import ProteinAnalysis 5 | 6 | 7 | class PropertyCalculator: 8 | def __init__(self, sequence: str) -> None: 9 | self.sequence = sequence 10 | self.protein_analysis = ProteinAnalysis(self.sequence) 11 | 12 | def calculate_aromaticity(self) -> float: 13 | return 
self.protein_analysis.aromaticity() # type: ignore 14 | 15 | def calculate_charge_at_ph(self, ph: float) -> float: 16 | return self.protein_analysis.charge_at_pH(ph) # type: ignore 17 | 18 | def calculate_flexibility(self) -> dict: 19 | flexibility_scores = self.protein_analysis.flexibility() 20 | return { 21 | "residue_scores": flexibility_scores, 22 | "mean": np.mean(flexibility_scores), 23 | "stdev": np.std(flexibility_scores), 24 | "min": min(flexibility_scores), 25 | "max": max(flexibility_scores), 26 | } 27 | 28 | def calculate_gravy(self) -> float: 29 | return self.protein_analysis.gravy() # type: ignore 30 | 31 | def calculate_instability_index(self) -> float: 32 | return self.protein_analysis.instability_index() # type: ignore 33 | 34 | def calculate_isoelectric_point(self) -> float: 35 | return self.protein_analysis.isoelectric_point() # type: ignore 36 | 37 | def calculate_properties(self) -> dict: 38 | """ 39 | Calculates several properties from the sequence of an antibody. 40 | Returns: 41 | a dictionary of calculated properties. 42 | """ 43 | return { 44 | "sequence": self.sequence, 45 | "aromaticity": self.calculate_aromaticity(), 46 | "charge_pH_6": self.calculate_charge_at_ph(6), 47 | "charge_pH_7.4": self.calculate_charge_at_ph(7.4), 48 | "flexibility": self.calculate_flexibility(), 49 | "gravy": self.calculate_gravy(), 50 | "instability_index": self.calculate_instability_index(), 51 | "isoelectric_point": self.calculate_isoelectric_point(), 52 | } 53 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_properties/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from ab_characterisation.developability_tools.sequence_properties.calculations import \ 6 | PropertyCalculator 7 | from ab_characterisation.developability_tools.sequence_properties.outputs import \ 8 | write_properties_to_json 9 | from ab_characterisation.developability_tools.utils.input_handling import parse_fasta 10 | from loguru import logger 11 | 12 | logger.remove() 13 | logger.add(sys.stderr, format="{message}") 14 | 15 | 16 | def calculate_properties( 17 | heavy_sequence: Optional[str], light_sequence: Optional[str] 18 | ) -> dict[str, dict]: 19 | results = {} 20 | for chain, sequence in {"heavy": heavy_sequence, "light": light_sequence}.items(): 21 | if sequence: 22 | results[chain] = PropertyCalculator(sequence).calculate_properties() 23 | 24 | return results 25 | 26 | 27 | def property_calculator( 28 | heavy_sequence: Optional[str], 29 | light_sequence: Optional[str], 30 | outfile: Optional[str] = None, 31 | ) -> dict[str, dict]: 32 | """ 33 | Main function to calculate sequence-based properties for a single antibody. 34 | Writes the results to a file in .csv format. 35 | 36 | Args: 37 | heavy_sequence: the amino acid sequence of the antibody heavy chain 38 | light_sequence: the amino acid sequence of the antibody light chain 39 | outfile: the output path where results should be written. If a path is not given, results will only be printed 40 | to the terminal. 
41 | """ 42 | property_dict = calculate_properties(heavy_sequence, light_sequence) 43 | 44 | if outfile: 45 | write_properties_to_json(property_dict, outfile) 46 | 47 | return property_dict 48 | 49 | 50 | def property_calculator_fasta( 51 | fasta_file: str, 52 | outdir: Optional[str], 53 | quiet: bool = False, 54 | ) -> dict[str, dict[str, dict]]: 55 | """ 56 | Function to scan a set of antibody sequences in a fasta file for liabilities. 57 | Writes the results to a series of files (one per antibody) in .csv format. 58 | 59 | Args: 60 | fasta_file: the amino acid sequences of the antibodies to be scanned in fasta format. Each antibody should be a 61 | separate fasta entry, with the heavy and light chains being separated by a forward slash. E.g.: 62 | 63 | >antibody1 64 | HEAVYSEQUENCE/LIGHTSEQUENCE 65 | >antibody2 66 | HEAVYSEQUENCE/LIGHTSEQUENCE 67 | >nanobody1 68 | HEAVYSEQUENCE/- 69 | ... 70 | 71 | outdir: Path to the directory where results should be written. Individual files will be named according to the 72 | IDs in the fasta file. 73 | """ 74 | if outdir: 75 | dirpath = Path(outdir) 76 | dirpath.mkdir(parents=True, exist_ok=True) 77 | else: 78 | dirpath = Path(".") 79 | 80 | sequences = parse_fasta(fasta_file) 81 | results = {} 82 | for antibody_id, seqs in sequences.items(): 83 | if not quiet: 84 | logger.info(f"Calculating properties for {antibody_id}") 85 | 86 | property_dict = calculate_properties(seqs["H"], seqs["L"]) 87 | filepath = dirpath / f"{antibody_id}_properties.json" 88 | write_properties_to_json(property_dict, str(filepath)) 89 | results[antibody_id] = property_dict 90 | 91 | return results 92 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/sequence_properties/outputs.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from ab_characterisation.developability_tools.utils.outputs import write_file 4 | from loguru import logger 5 | 6 | 7 | def write_properties_to_json(property_dict: dict, filepath: str) -> None: 8 | """ 9 | Writes the calculated property dict to a file in .json format. 10 | 11 | Args: 12 | property_dict: the dictionary of calculated properties 13 | filepath: the path to the output file. Can be an S3 path. 
14 | """ 15 | outstr = json.dumps(property_dict) 16 | write_file(outstr, filepath) 17 | 18 | return 19 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/__init__.py -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/definitions.py: -------------------------------------------------------------------------------- 1 | colour_dict = {"RED": "red", "AMBER": "fg #ff6100", "GREEN": "fg #14b853"} 2 | 3 | imgt_cdr_definitions = { 4 | 1: range(27, 39), 5 | 2: range(56, 66), 6 | 3: range(105, 118), 7 | } 8 | 9 | # Two residues on either side of the IMGT CDRs 10 | anchor_residues = [25, 26, 39, 40, 54, 55, 66, 67, 103, 104, 118, 119] 11 | 12 | 13 | # Salt bridge donor/acceptor atom types 14 | donors = {"LYS": ["NZ"], "ARG": ["NH1", "NH2"]} 15 | acceptors = {"ASP": ["OD1", "OD2"], "GLU": ["OE1", "OE2"]} 16 | 17 | 18 | # Kyte and Doolittle hydrophobicity scale, values normalised to values between 1 and 2 19 | normalised_hydrophobicities = { 20 | "ILE": 2.0, 21 | "VAL": 1.9666666666666666, 22 | "LEU": 1.9222222222222223, 23 | "PHE": 1.8111111111111111, 24 | "CYS": 1.7777777777777777, 25 | "MET": 1.7111111111111112, 26 | "ALA": 1.7, 27 | "GLY": 1.4555555555555555, 28 | "THR": 1.4222222222222223, 29 | "SER": 1.4111111111111112, 30 | "TRP": 1.4, 31 | "TYR": 1.3555555555555556, 32 | "PRO": 1.3222222222222222, 33 | "HIS": 1.1444444444444444, 34 | "GLU": 1.1111111111111112, 35 | "GLN": 1.1111111111111112, 36 | "ASP": 1.1111111111111112, 37 | "ASN": 1.1111111111111112, 38 | "LYS": 1.0666666666666667, 39 | "ARG": 1.0, 40 | } 41 | 42 | # Charges at pH 7.4 43 | residue_charges = { 44 | "ALA": 0.0, 45 | "ARG": 1.0, 46 | "ASN": 0.0, 47 | "ASP": -1.0, 48 | "CYS": 0.0, 49 | "GLN": 0.0, 50 | "GLU": -1.0, 51 | "GLY": 0.0, 52 | "HIS": 0.1, 53 | "ILE": 0.0, 54 | "LEU": 0.0, 55 | "LYS": 1.0, 56 | "MET": 0.0, 57 | "PHE": 0.0, 58 | "PRO": 0.0, 59 | "SER": 0.0, 60 | "THR": 0.0, 61 | "TRP": 0.0, 62 | "TYR": 0.0, 63 | "VAL": 0.0, 64 | } 65 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/main.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from typing import Optional 3 | 4 | from loguru import logger 5 | 6 | from ab_characterisation.developability_tools.tap.definitions import colour_dict 7 | from ab_characterisation.developability_tools.tap.metrics import ( 8 | HydrophobicPatchScoreCalculator, NegativePatchScoreCalculator, 9 | PositivePatchScoreCalculator, SFvCSPCalculator, TotalCDRLengthCalculator) 10 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import MetricResult 11 | from ab_characterisation.developability_tools.tap.outputs import write_output_file 12 | from ab_characterisation.developability_tools.tap.structure_annotation import \ 13 | StructureAnnotator 14 | 15 | logger.remove() 16 | logger.add(sys.stderr, format="{message}") 17 | 18 | 19 | def run_tap( 20 | modelfile: str, 21 | outfile: Optional[str], 22 | quiet: bool = False, 23 | ) -> list[MetricResult]: 24 | """ 25 | Main function to calculate TAP metrics for a pre-generated ABodyBuilder2 model. 
26 | Writes the results to a file in .csv format. 27 | 28 | Args: 29 | modelfile: the path to the input model .pdb file. Can be an S3 path - in this case the file will be temporarily 30 | downloaded before TAP is run. 31 | This should be a model created by ABodyBuilder2, and should be already IMGT numbered. 32 | outfile: the output path where results should be written. 33 | quiet: suppresses all log messages if set to True. 34 | """ 35 | 36 | structure = StructureAnnotator().load_and_annotate_structure(modelfile) 37 | 38 | # Calculate the 5 metrics 39 | results = [] 40 | for calculator in [ 41 | HydrophobicPatchScoreCalculator, 42 | NegativePatchScoreCalculator, 43 | PositivePatchScoreCalculator, 44 | SFvCSPCalculator, 45 | TotalCDRLengthCalculator, 46 | ]: 47 | results.append(calculator(quiet=quiet).calculate(structure)) # type: ignore 48 | 49 | if outfile: 50 | write_output_file(results, outfile) 51 | 52 | return results 53 | 54 | 55 | def list_metrics() -> list[dict]: 56 | """ 57 | Returns (and prints) a list of the metrics and their green/amber region definitions. 58 | """ 59 | metrics = [] 60 | for calculator in [ 61 | HydrophobicPatchScoreCalculator, 62 | NegativePatchScoreCalculator, 63 | PositivePatchScoreCalculator, 64 | SFvCSPCalculator, 65 | TotalCDRLengthCalculator, 66 | ]: 67 | metric = calculator() # type: ignore 68 | green_str = "; ".join( 69 | [ 70 | str(region[0]) + " to " + str(region[1]) 71 | for region in metric.green_flag_regions 72 | ] 73 | ) 74 | amber_str = "; ".join( 75 | [ 76 | str(region[0]) + " to " + str(region[1]) 77 | for region in metric.amber_flag_regions 78 | ] 79 | ) 80 | logger.opt(colors=True).info(f"TAP METRIC {metric.name}:") 81 | logger.opt(colors=True).info( 82 | f"<{colour_dict['GREEN']}>GREEN region: {green_str}" 83 | ) 84 | logger.opt(colors=True).info( 85 | f"<{colour_dict['AMBER']}>AMBER region: {amber_str}\n" 86 | ) 87 | 88 | metrics.append( 89 | { 90 | "name": metric.name, 91 | "green_flag_regions": metric.green_flag_regions, 92 | "amber_flag_regions": metric.amber_flag_regions, 93 | } 94 | ) 95 | 96 | return metrics 97 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/__init__.py: -------------------------------------------------------------------------------- 1 | from ab_characterisation.developability_tools.tap.metrics.hydrophobic_patches import \ 2 | HydrophobicPatchScoreCalculator 3 | from ab_characterisation.developability_tools.tap.metrics.negative_patches import \ 4 | NegativePatchScoreCalculator 5 | from ab_characterisation.developability_tools.tap.metrics.positive_patches import \ 6 | PositivePatchScoreCalculator 7 | from ab_characterisation.developability_tools.tap.metrics.sfvcsp import SFvCSPCalculator 8 | from ab_characterisation.developability_tools.tap.metrics.total_cdr_length import \ 9 | TotalCDRLengthCalculator 10 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/base_calculator.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from dataclasses import dataclass 3 | 4 | from Bio import PDB 5 | from loguru import logger 6 | 7 | from ab_characterisation.developability_tools.tap.definitions import colour_dict 8 | 9 | 10 | @dataclass 11 | class MetricResult: 12 | metric_name: str 13 | calculated_value: float 14 | flag: str 15 | 16 | 17 | class BaseMetricCalculator(ABC): 18 
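# Concrete calculators set name, green_flag_regions and amber_flag_regions in __init__;
# get_flag() returns GREEN or AMBER when a value falls inside those ranges and RED otherwise.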
| @abstractmethod 19 | def __init__(self, quiet: bool = False) -> None: 20 | self.quiet: bool = quiet 21 | self.name: str = "" 22 | self.green_flag_regions: list[tuple[float, float]] = [] 23 | self.amber_flag_regions: list[tuple[float, float]] = [] 24 | 25 | def get_flag(self, value: float) -> str: 26 | """ 27 | Assigns either a green, amber, or red flag to a value depending on the defined regions 28 | (which were established by calculating the same metrics on structural models of known therapeutics). 29 | 30 | Args: 31 | value: the value calculated for the query antibody for this metric 32 | 33 | Returns: 34 | a string representing the flag colour. 35 | """ 36 | for minval, maxval in self.amber_flag_regions: 37 | if minval <= value <= maxval: 38 | return "AMBER" 39 | 40 | for minval, maxval in self.green_flag_regions: 41 | if minval <= value <= maxval: 42 | return "GREEN" 43 | 44 | # If the calculated value lies outside the defined green and amber regions, it is assigned a red flag 45 | return "RED" 46 | 47 | def log_result(self, result: MetricResult) -> None: 48 | """Logs a message to the terminal summarising the result of the metric calculation.""" 49 | if not self.quiet: 50 | colour = colour_dict[result.flag] 51 | logger.opt(colors=True).info( 52 | f"METRIC {result.metric_name} = <{colour}>{result.calculated_value:.2f} ({result.flag})" 53 | ) 54 | 55 | @abstractmethod 56 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 57 | pass 58 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/hydrophobic_patches.py: -------------------------------------------------------------------------------- 1 | from Bio import PDB 2 | 3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import ( 4 | BaseMetricCalculator, MetricResult) 5 | 6 | 7 | class HydrophobicPatchScoreCalculator(BaseMetricCalculator): 8 | def __init__(self, quiet: bool = False) -> None: 9 | self.quiet = quiet 10 | self.name = "Hydrophobic Patch Score" 11 | self.green_flag_regions = [(137.61, 200.71)] 12 | self.amber_flag_regions = [(106.44, 137.61), (200.71, 225.85)] 13 | 14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 15 | """ 16 | Calculates the 'patches of surface hydrophobicity' score (PSH) and assigns a flag colour. 17 | Considers residues that are in the CDR vicinity only. 18 | The input structure must have been annotated using the 19 | ab_characterisation.developability_tools.tap.structure_annotation module. 
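The score sums hydrophobicity_i * hydrophobicity_j / d_ij**2 over ordered pairs of distinct CDR-vicinity residues, where d_ij is the minimum heavy-atom distance and only neighbouring residues (within 7.5 A) contribute.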
20 | """ 21 | cdr_vicinity = [ 22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity 23 | ] 24 | 25 | score = 0 26 | for res1 in cdr_vicinity: 27 | for res2 in cdr_vicinity: 28 | if res1 == res2: 29 | continue 30 | 31 | distance = res1.neighbours.get(res2.get_full_id(), None) 32 | if not distance: 33 | continue 34 | 35 | score += (res1.hydrophobicity * res2.hydrophobicity) / distance**2 36 | 37 | flag = self.get_flag(score) 38 | 39 | result = MetricResult( 40 | metric_name=self.name, 41 | calculated_value=score, 42 | flag=flag, 43 | ) 44 | 45 | self.log_result(result) 46 | return result 47 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/negative_patches.py: -------------------------------------------------------------------------------- 1 | from Bio import PDB 2 | 3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import ( 4 | BaseMetricCalculator, MetricResult) 5 | 6 | 7 | class NegativePatchScoreCalculator(BaseMetricCalculator): 8 | def __init__(self, quiet: bool = False) -> None: 9 | self.quiet = quiet 10 | self.name = "Negative Patch Score" 11 | self.green_flag_regions = [(0, 1.67)] 12 | self.amber_flag_regions = [(1.67, 3.50)] 13 | 14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 15 | """ 16 | Calculates the 'patches of negative charge' score (PNC) and assigns a flag colour. 17 | Considers residues that are in the CDR vicinity only. 18 | The input structure must have been annotated using the 19 | ab_characterisation.developability_tools.tap.structure_annotation module. 20 | """ 21 | cdr_vicinity = [ 22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity 23 | ] 24 | 25 | score = 0 26 | for res1 in cdr_vicinity: 27 | for res2 in cdr_vicinity: 28 | if res1 == res2: 29 | continue 30 | 31 | if res1.charge >= 0 or res2.charge >= 0: 32 | continue 33 | 34 | distance = res1.neighbours.get(res2.get_full_id(), None) 35 | if not distance: 36 | continue 37 | 38 | score += (abs(res1.charge) * abs(res2.charge)) / distance**2 39 | 40 | flag = self.get_flag(score) 41 | 42 | result = MetricResult( 43 | metric_name=self.name, 44 | calculated_value=score, 45 | flag=flag, 46 | ) 47 | 48 | self.log_result(result) 49 | return result 50 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/positive_patches.py: -------------------------------------------------------------------------------- 1 | from Bio import PDB 2 | 3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import ( 4 | BaseMetricCalculator, MetricResult) 5 | 6 | 7 | class PositivePatchScoreCalculator(BaseMetricCalculator): 8 | def __init__(self, quiet: bool = False) -> None: 9 | self.quiet = quiet 10 | self.name = "Positive Patch Score" 11 | self.green_flag_regions = [(0, 1.19)] 12 | self.amber_flag_regions = [(1.19, 3.58)] 13 | 14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 15 | """ 16 | Calculates the 'patches of positive charge' score (PPC) and assigns a flag colour. 17 | Considers residues that are in the CDR vicinity only. 18 | The input structure must have been annotated using the 19 | ab_characterisation.developability_tools.tap.structure_annotation module. 
20 | """ 21 | cdr_vicinity = [ 22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity 23 | ] 24 | 25 | score = 0 26 | for res1 in cdr_vicinity: 27 | for res2 in cdr_vicinity: 28 | if res1 == res2: 29 | continue 30 | 31 | if res1.charge <= 0 or res2.charge <= 0: 32 | continue 33 | 34 | distance = res1.neighbours.get(res2.get_full_id(), None) 35 | if not distance: 36 | continue 37 | 38 | score += (abs(res1.charge) * abs(res2.charge)) / distance**2 39 | 40 | flag = self.get_flag(score) 41 | 42 | result = MetricResult( 43 | metric_name=self.name, 44 | calculated_value=score, 45 | flag=flag, 46 | ) 47 | 48 | self.log_result(result) 49 | return result 50 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/sfvcsp.py: -------------------------------------------------------------------------------- 1 | from Bio import PDB 2 | 3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import ( 4 | BaseMetricCalculator, MetricResult) 5 | 6 | 7 | class SFvCSPCalculator(BaseMetricCalculator): 8 | def __init__(self, quiet: bool = False) -> None: 9 | self.quiet = quiet 10 | self.name = "SFvCSP" 11 | self.green_flag_regions = [(-4.20, 100000)] 12 | self.amber_flag_regions = [(-20.50, -4.20)] 13 | 14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 15 | """ 16 | Calculates the 'structural Fv charge symmetry parameter' (SFvCSP) and assigns a flag colour. 17 | Considers surface residues only. 18 | The input structure must have been annotated using the 19 | ab_characterisation.developability_tools.tap.structure_annotation module. 20 | """ 21 | h_charge = sum( 22 | res.charge 23 | for res in annotated_structure[0]["H"].get_residues() 24 | if res.is_surface 25 | ) 26 | l_charge = sum( 27 | res.charge 28 | for res in annotated_structure[0]["L"].get_residues() 29 | if res.is_surface 30 | ) 31 | sfvcsp = h_charge * l_charge 32 | flag = self.get_flag(sfvcsp) 33 | 34 | result = MetricResult( 35 | metric_name=self.name, 36 | calculated_value=sfvcsp, 37 | flag=flag, 38 | ) 39 | 40 | self.log_result(result) 41 | return result 42 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/metrics/total_cdr_length.py: -------------------------------------------------------------------------------- 1 | from Bio import PDB 2 | 3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import ( 4 | BaseMetricCalculator, MetricResult) 5 | 6 | 7 | class TotalCDRLengthCalculator(BaseMetricCalculator): 8 | def __init__(self, quiet: bool = False) -> None: 9 | self.quiet = quiet 10 | self.name = "Total IMGT CDR Length" 11 | self.green_flag_regions = [(43, 55)] 12 | self.amber_flag_regions = [(37, 43), (55, 63)] 13 | 14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult: 15 | """ 16 | Calculates the total number of CDR residues and assigns a flag colour. 17 | Uses the IMGT CDR definition. 18 | The input structure must have been annotated using the 19 | ab_characterisation.developability_tools.tap.structure_annotation module. 
20 | """ 21 | cdr_residues = [ 22 | res for res in annotated_structure[0].get_residues() if res.is_cdr 23 | ] 24 | total_cdr_length = len(cdr_residues) 25 | flag = self.get_flag(total_cdr_length) 26 | 27 | result = MetricResult( 28 | metric_name=self.name, 29 | calculated_value=total_cdr_length, 30 | flag=flag, 31 | ) 32 | 33 | self.log_result(result) 34 | return result 35 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/outputs.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import MetricResult 5 | from ab_characterisation.developability_tools.utils.outputs import write_file 6 | 7 | 8 | def write_output_file(results: list[MetricResult], outfile: str) -> None: 9 | """ 10 | Writes the TAP results to an output file in csv format. 11 | 12 | Args: 13 | results: the list of metric results 14 | outfile: the path to where the results should be written. 15 | """ 16 | outstr = "Metric,Value,Flag\n" 17 | for res in results: 18 | outstr += f"{res.metric_name},{res.calculated_value:.2f},{res.flag}\n" 19 | 20 | write_file(outstr, outfile) 21 | 22 | return 23 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/psa_executables/psa: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/psa_executables/psa -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/psa_executables/psa_mac: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/psa_executables/psa_mac -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/tap/structure_annotation.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | from dataclasses import dataclass, field 4 | from pathlib import Path 5 | from typing import Optional 6 | 7 | from Bio import PDB 8 | from Bio.PDB.NeighborSearch import NeighborSearch 9 | 10 | from ab_characterisation.developability_tools.tap.definitions import ( 11 | acceptors, anchor_residues, donors, imgt_cdr_definitions, 12 | normalised_hydrophobicities, residue_charges) 13 | 14 | 15 | class PSAError(Exception): 16 | """Raised when something has gone awry when running psa to calculate surface areas.""" 17 | 18 | 19 | @dataclass 20 | class AnnotatedResidue(PDB.Residue.Residue): # type: ignore 21 | salt_bridge_partner: Optional[tuple] = None 22 | neighbours: dict = field(default_factory=dict) 23 | cdr_number: int = field(init=False) 24 | relative_surface_area: float = field(init=False) 25 | hydrophobicity: float = field(init=False) 26 | charge: float = field(init=False) 27 | in_cdr_vicinity: bool = field(init=False) 28 | 29 | def __eq__(self, other): # type: ignore 30 | """Direct copy from Biopython Entity""" 31 | if isinstance(other, type(self)): 32 | if self.parent is None: 33 | return self.id == other.id 34 | 
return self.full_id[1:] == other.full_id[1:] 35 | return NotImplemented 36 | 37 | def __hash__(self) -> int: 38 | """Direct copy from Biopython Entity""" 39 | return hash(self.full_id) 40 | 41 | @property 42 | def is_cdr(self) -> bool: 43 | """Whether the residue is part of a CDR (IMGT definition) or not.""" 44 | return self.cdr_number > 0 45 | 46 | @property 47 | def is_anchor(self) -> bool: 48 | """ 49 | Whether the residue an anchor residue to a CDR (IMGT definition) or not. 50 | Anchor residues are defined as the two residues on each side of the CDR. 51 | """ 52 | return self.id[1] in anchor_residues 53 | 54 | @property 55 | def is_surface(self) -> bool: 56 | """ 57 | Uses the relative surface area as calculated by psa to determine whether the residue is on the surface or not. 58 | Surface residues have a relative sidechain surface area of 7.5 or above. 59 | """ 60 | return self.relative_surface_area >= 7.5 61 | 62 | @property 63 | def res_number(self) -> str: 64 | """Returns a formatted string containing the residue number and insertion code, if present.""" 65 | return f"{self.id[1]}{self.id[2]}".strip() 66 | 67 | @property 68 | def is_donor(self) -> bool: 69 | """Whether the residue is a salt bridge donor residue type.""" 70 | return self.resname in donors 71 | 72 | @property 73 | def is_acceptor(self) -> bool: 74 | """Whether the residue is a salt bridge acceptor residue type.""" 75 | return self.resname in acceptors 76 | 77 | 78 | @dataclass 79 | class StructureAnnotator: 80 | """ 81 | Class containing methods for structural annotation of properties required by TAP": 82 | - relative surface area 83 | - minimum distances between neighbouring residues 84 | - CDR vicinity (surface residues within 4 A of CDRs/anchors) 85 | - salt bridges (donor/acceptor atoms within 3.2 A) 86 | - hydrophobicity 87 | - charge 88 | """ 89 | 90 | neighbour_cutoff: float = 7.5 91 | salt_bridge_cutoff: float = 3.2 92 | vicinity_cutoff: float = 4.0 93 | psa_path: Path = field(init=False) 94 | cdr_lookup_dict: dict[tuple[str, int], int] = field(init=False) 95 | 96 | def __post_init__(self) -> None: 97 | lookup_dict = {} 98 | for chain in "HL": 99 | for cdr, residue_range in imgt_cdr_definitions.items(): 100 | for res in residue_range: 101 | lookup_dict[(chain, res)] = cdr 102 | self.cdr_lookup_dict = lookup_dict 103 | 104 | psa_version = "psa_mac" if sys.platform == "darwin" else "psa" 105 | self.psa_path = ( 106 | Path(__file__).resolve().parent / f"psa_executables/{psa_version}" 107 | ) 108 | 109 | @staticmethod 110 | def _convert_residues(structure: PDB.Structure.Structure) -> None: 111 | """Converts normal Biopython residues to our annotated version with extra properties/methods.""" 112 | for res in structure[0].get_residues(): 113 | res.__class__ = AnnotatedResidue 114 | res.neighbours = {} 115 | return 116 | 117 | def _run_psa(self, structure_path: str) -> list[str]: 118 | """Runs the psa executable on the .pdb file to get surface accessibility information.""" 119 | if self.psa_path.exists() is False: 120 | raise PSAError("psa executable was not found.") 121 | 122 | result, error = subprocess.Popen( 123 | [str(self.psa_path), "-t", structure_path], 124 | stdout=subprocess.PIPE, 125 | stderr=subprocess.PIPE, 126 | ).communicate() 127 | if not result: 128 | raise PSAError(error.decode()) 129 | 130 | psa_output = result.decode().split("\n") 131 | return psa_output 132 | 133 | def _annotate_sasa( 134 | self, structure: PDB.Structure.Structure, structure_path: str 135 | ) -> None: 136 | """Runs psa and 
extracts surface accessibility information from its output.""" 137 | # Run psa and get the relevant lines from the output 138 | psa_output = self._run_psa(structure_path) 139 | residue_lines = [line for line in psa_output if line.startswith("ACCESS")] 140 | 141 | # Check that the number of residues in the psa output is the same as the number of residues in our structure 142 | all_residues = list(structure[0].get_residues()) 143 | if len(all_residues) != len(residue_lines): 144 | raise PSAError("PSA output contained the wrong number of residues.") 145 | 146 | # Iterate through residues and annotate the structure 147 | for res, psa_line in zip(all_residues, residue_lines): 148 | # Check we are on the correct residue with the correct type 149 | psa_residue_number = psa_line[6:12].strip() 150 | if res.res_number != psa_residue_number: 151 | raise PSAError( 152 | f"Residue number mismatch: {res.res_number} != {psa_residue_number}" 153 | ) 154 | psa_residue_type = psa_line[14:17] 155 | if res.resname != psa_residue_type: 156 | raise PSAError( 157 | f"Expected type {res.resname} for residue {res.parent}{res.res_number}; got {psa_residue_type}" 158 | ) 159 | 160 | res.relative_surface_area = float(psa_line[61:67]) 161 | return 162 | 163 | @staticmethod 164 | def _get_minimum_distance(res1: AnnotatedResidue, res2: AnnotatedResidue) -> float: 165 | """Calculates the minimum distance between the heavy atoms of two residues.""" 166 | min_dist = 100.0 167 | for atom1 in res1.get_atoms(): 168 | for atom2 in res2.get_atoms(): 169 | dist = atom1 - atom2 170 | if dist < min_dist: 171 | min_dist = dist 172 | return min_dist 173 | 174 | def _get_neighbours(self, structure: PDB.Structure.Structure) -> None: 175 | """For each residue in the structure, gets a list of neighbouring residues and their minimum distance""" 176 | # Quickly get list of residue pairs that are less than 7.5A apart 177 | all_heavy_atoms = [ 178 | atom for atom in structure[0].get_atoms() if atom.element != "H" 179 | ] 180 | residue_pairs = NeighborSearch(atom_list=all_heavy_atoms).search_all( 181 | self.neighbour_cutoff, level="R" 182 | ) 183 | 184 | for res1, res2 in residue_pairs: 185 | if res1 == res2: 186 | continue 187 | min_dist = self._get_minimum_distance(res1, res2) 188 | res1.neighbours[res2.get_full_id()] = min_dist 189 | res2.neighbours[res1.get_full_id()] = min_dist 190 | return 191 | 192 | def _cdr_lookup(self, chain_id: str, residue_number: int) -> int: 193 | """ 194 | Returns the number of the CDR a residue is part of from its residue number. 195 | Returns zero if the residue is not part of a CDR. 
196 | """ 197 | return self.cdr_lookup_dict.get((chain_id, residue_number), 0) 198 | 199 | def _annotate_cdrs(self, structure: PDB.Structure.Structure) -> None: 200 | """Annotates residues with their CDR number (0 if not in a CDR)""" 201 | for res in structure[0].get_residues(): 202 | chain_id = res.parent.id 203 | res_number = res.id[1] 204 | res.cdr_number = self._cdr_lookup(chain_id, res_number) 205 | 206 | def _annotate_cdr_vicinity(self, structure: PDB.Structure.Structure) -> None: 207 | """Finds and annotates which residues are on the surface and less than 4A away from the CDRs/anchors.""" 208 | surface_cdrs_and_anchors = [] 209 | for res in structure[0].get_residues(): 210 | if res.is_surface and (res.is_cdr or res.is_anchor): 211 | res.in_cdr_vicinity = True 212 | surface_cdrs_and_anchors.append(res) 213 | else: 214 | res.in_cdr_vicinity = False 215 | 216 | for res in surface_cdrs_and_anchors: 217 | res.in_cdr_vicinity = True 218 | for neighbour_id, distance in res.neighbours.items(): 219 | res2 = structure[0][neighbour_id[2]][neighbour_id[3]] 220 | if distance < self.vicinity_cutoff and res2.is_surface: 221 | res2.in_cdr_vicinity = True 222 | return 223 | 224 | def _annotate_salt_bridges(self, structure: PDB.Structure.Structure) -> None: 225 | """Identifies which residues form salt bridges based on distance.""" 226 | donor_atoms = [ 227 | atom 228 | for atom in structure[0].get_atoms() 229 | if atom.id in donors.get(atom.parent.resname, []) 230 | ] 231 | acceptor_atoms = [ 232 | atom 233 | for atom in structure[0].get_atoms() 234 | if atom.id in acceptors.get(atom.parent.resname, []) 235 | ] 236 | residue_pairs = NeighborSearch( 237 | atom_list=donor_atoms + acceptor_atoms 238 | ).search_all(self.salt_bridge_cutoff, level="R") 239 | 240 | for res1, res2 in residue_pairs: 241 | # Ignore if already part of a salt bridge 242 | if res1.salt_bridge_partner or res2.salt_bridge_partner: 243 | continue 244 | if res1.is_surface and res2.is_surface: 245 | if (res1.is_donor and res2.is_acceptor) or ( 246 | res2.is_donor and res1.is_acceptor 247 | ): 248 | res1.salt_bridge_partner = res2.get_full_id() 249 | res2.salt_bridge_partner = res1.get_full_id() 250 | return 251 | 252 | def _annotate_hydrophobicity(self, structure: PDB.Structure.Structure) -> None: 253 | """ 254 | Annotates residues with their normalised (between 1 and 2) hydrophobicity values. 255 | If the residue forms part of a salt bridge, it is assigned the hydrophobicity of glycine. 256 | """ 257 | for res in structure[0].get_residues(): 258 | if res.salt_bridge_partner: 259 | res.hydrophobicity = normalised_hydrophobicities["GLY"] 260 | else: 261 | res.hydrophobicity = normalised_hydrophobicities[res.resname] 262 | return 263 | 264 | def _annotate_charge(self, structure: PDB.Structure.Structure) -> None: 265 | """ 266 | Annotates residues with their charges. 267 | If the residue forms part of a salt bridge, it is assigned a charge of zero. 268 | """ 269 | for res in structure[0].get_residues(): 270 | if res.salt_bridge_partner: 271 | res.charge = 0 272 | else: 273 | res.charge = residue_charges[res.resname] 274 | return 275 | 276 | def load_and_annotate_structure( 277 | self, structure_path: str 278 | ) -> PDB.Structure.Structure: 279 | """ 280 | Loads an antibody structure from the provided file, and annotates the residues for later use in TAP metric 281 | calculations. 282 | Assumes the structure is already IMGT-numbered!! 283 | 284 | Args: 285 | structure_path: the path to the structure that is to be annotated. 
286 | 287 | Returns: 288 | the annotated structure (Biopython Structure entity, with residues converted to an AnnotatedResidue type). 289 | """ 290 | structure = PDB.PDBParser(QUIET=True).get_structure( 291 | "input_structure", structure_path 292 | ) 293 | self._convert_residues(structure) 294 | self._annotate_sasa(structure, structure_path) 295 | self._get_neighbours(structure) 296 | self._annotate_cdrs(structure) 297 | self._annotate_cdr_vicinity(structure) 298 | self._annotate_salt_bridges(structure) 299 | self._annotate_hydrophobicity(structure) 300 | self._annotate_charge(structure) 301 | return structure 302 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/utils/input_handling.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | from typing import Optional 3 | 4 | from anarci.anarci import number 5 | from Bio import SeqIO 6 | 7 | 8 | class InputError(Exception): 9 | pass 10 | 11 | 12 | def get_numbering( 13 | sequence: str, expected_type: str 14 | ) -> list[tuple[tuple[int, str], str]]: 15 | """ 16 | Uses ANARCI to number an input sequence. 17 | 18 | Args: 19 | sequence: the amino acid sequence of the antibody chain 20 | expected_type: H or L (if the ANARCI annotation does not match this an error will be raised) 21 | 22 | Returns: 23 | the ANARCI residue numbering, e.g. [((1, ' '), 'E'), ((2, ' '), 'L'), ... ] 24 | 25 | """ 26 | anarci_result: tuple[list[tuple[tuple[int, str], str]], str] = number(sequence) 27 | numbering, chain_type = anarci_result 28 | if numbering: 29 | if chain_type == expected_type: 30 | return numbering 31 | raise InputError( 32 | f"Incorrect chain type: expected {expected_type}, got {chain_type}" 33 | ) 34 | raise InputError(f"ANARCI failed to number {expected_type} sequence") 35 | 36 | 37 | def parse_fasta(fasta_file: str) -> dict[str, dict[str, Optional[str]]]: 38 | if not Path(fasta_file).exists(): 39 | raise InputError(f"Fasta file {fasta_file} does not exist.") 40 | 41 | sequences: dict[str, dict[str, Optional[str]]] = {} 42 | with open(fasta_file) as handle: 43 | for record in SeqIO.parse(handle, "fasta"): 44 | if '/' not in str(record.seq): 45 | raise InputError( 46 | f"Antibody fasta sequences need to be formatted as HEAVY/LIGHT, an entry in {fasta_file} does not " 47 | f"contain /" 48 | ) 49 | 50 | heavy, light = str(record.seq).split("/") 51 | sequences[record.id] = { 52 | "H": heavy if heavy not in ["-", ""] else None, 53 | "L": light if light not in ["-", ""] else None, 54 | } 55 | 56 | return sequences 57 | -------------------------------------------------------------------------------- /src/ab_characterisation/developability_tools/utils/outputs.py: -------------------------------------------------------------------------------- 1 | import tempfile 2 | from pathlib import Path 3 | 4 | 5 | def write_file(contents: str, filepath: str) -> None: 6 | """Writes an output file to the given location with the given contents.""" 7 | outpath = Path(filepath) 8 | outpath.parent.mkdir(parents=True, exist_ok=True) 9 | with outpath.open("w") as openf: 10 | openf.write(contents) 11 | -------------------------------------------------------------------------------- /src/ab_characterisation/filter_steps.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from loguru import logger 4 | import numpy as np 5 | import pandas as pd 6 | from numpy import
typing as npt 7 | from scipy.stats import multivariate_normal 8 | 9 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig 10 | from ab_characterisation.utils.rosetta_utils import aggregate_rosetta_metrics 11 | 12 | 13 | def sequence_liability_filter(biol_data: BiologicsData, config: RunConfig) -> bool: 14 | """ 15 | 16 | Args: 17 | biol_data: 18 | config: 19 | 20 | Returns: 21 | 22 | """ 23 | for liability in biol_data.sequence_liabilities: 24 | if liability.liability_type in config.dq_sequence_liabilities: 25 | return True 26 | return False 27 | 28 | 29 | def tap_filter(biol_data: BiologicsData, config: RunConfig) -> bool: 30 | """ 31 | 32 | Args: 33 | biol_data: 34 | config: 35 | 36 | Returns: 37 | 38 | """ 39 | for tap_metric in biol_data.tap_flags: 40 | if tap_metric.flag == "RED": 41 | return True 42 | return False 43 | 44 | 45 | def rosetta_antibody_filter(biol_data: BiologicsData, config: RunConfig) -> bool: 46 | """ 47 | 48 | Args: 49 | biol_data: 50 | config: 51 | 52 | Returns: 53 | 54 | """ 55 | rosetta_antibody_data = aggregate_rosetta_metrics(biol_data.rosetta_output_ab_only) 56 | if rosetta_antibody_data.dG_separated.iloc[0] < 5: 57 | return False 58 | return True 59 | 60 | 61 | def find_top_n( 62 | biol_data_ls: list[BiologicsData], config: RunConfig 63 | ) -> list[BiologicsData]: 64 | pd_row_ls = [] 65 | out_ls = [] 66 | for biol_data in biol_data_ls: 67 | if biol_data.discarded_by is None: 68 | rosetta_complex_scores = aggregate_rosetta_metrics( 69 | biol_data.rosetta_output_complex, metrics=["dG_separated", "total_score"] 70 | ).iloc[0] 71 | pd_row_ls.append(rosetta_complex_scores) 72 | metric_df = pd.concat(pd_row_ls) 73 | top_indices = find_top_candidates( 74 | metric_df.total_score, 75 | metric_df.dG_separated, 76 | config.top_n, 77 | scale_factor=1, 78 | fit_without_outliers=True, 79 | ) 80 | valid_candidate_idx = -1 81 | for biol_data in biol_data_ls: 82 | if biol_data.discarded_by is None: 83 | valid_candidate_idx += 1 84 | if valid_candidate_idx in top_indices: 85 | biol_data.rank = list(top_indices).index(valid_candidate_idx) 86 | else: 87 | biol_data.discarded_by = "Not in top N" 88 | out_ls.append(biol_data) 89 | return out_ls 90 | 91 | 92 | def find_top_candidates( 93 | total_score: npt.ArrayLike, 94 | dG_separated: npt.ArrayLike, 95 | n: int, 96 | scale_factor: float = 1, 97 | total_score_max: Optional[float] = None, 98 | dG_separated_max: Optional[float] = None, 99 | fit_without_outliers: bool = True, 100 | ) -> np.ndarray: 101 | """ 102 | Fit multivariate gaussian (centered on median rather than mean) and then select best points 103 | according to lowest probability of being drawn subject to bounds. 104 | Args: 105 | total_score: Total score for candidates to select 106 | dG_separated: dG_seperated of candidates to select 107 | n: N candidates to select 108 | scale_factor: Scale total score of data points by this factor AFTER fitting multivariate 109 | total_score_max: Maximum total score of selected candidates, if not specified use median 110 | dG_separated_max: Maximum dG_separated of selected candidates, if not specified use medians 111 | fit_without_outliers: Ignore points that are 1.5 IQR above/below the upper/lower quartile 112 | when fitting the gaussian. 
113 | 114 | Returns: 115 | 116 | """ 117 | data = np.stack([np.array(total_score), np.array(dG_separated)], axis=1) 118 | 119 | if fit_without_outliers: 120 | # Define outliers (don't fit gaussian on these) 121 | ts_q1 = np.quantile(total_score, 0.25) 122 | ts_q3 = np.quantile(total_score, 0.75) 123 | ts_IQR = ts_q3 - ts_q1 124 | ts_lower_bound = ts_q1 - ts_IQR * 1.5 125 | ts_upper_bound = ts_q3 + ts_IQR * 1.5 126 | 127 | dGs_q1 = np.quantile(dG_separated, 0.25) 128 | dGs_q3 = np.quantile(dG_separated, 0.75) 129 | dGs_IQR = dGs_q3 - dGs_q1 130 | dGs_lower_bound = dGs_q1 - dGs_IQR * 1.5 131 | dGs_upper_bound = dGs_q3 + dGs_IQR * 1.5 132 | 133 | outlier_mask = ( 134 | (ts_lower_bound < data[:, 0]) 135 | & (data[:, 0] < ts_upper_bound) 136 | & (dGs_lower_bound < data[:, 1]) 137 | & (data[:, 1] < dGs_upper_bound) 138 | ) 139 | 140 | median = np.median(data[outlier_mask], axis=0) 141 | cov = np.cov(data[outlier_mask], rowvar=0) 142 | else: 143 | median = np.median(data, axis=0) 144 | cov = np.cov(data, rowvar=0) 145 | multivar_f = multivariate_normal(mean=median, cov=cov, allow_singular=True) 146 | xmax = total_score_max if total_score_max is not None else median[0] 147 | ymax = dG_separated_max if dG_separated_max is not None else median[1] 148 | centroid = np.array([xmax, ymax]) 149 | mask = np.all((data < centroid), axis=1) 150 | data[:, 0] = scale_factor * (data[:, 0] - median[0]) + median[0] 151 | top_idx = np.argsort(multivar_f.pdf(data[mask]))[:n] 152 | indices = np.where(mask == True)[0] 153 | return indices[top_idx] 154 | -------------------------------------------------------------------------------- /src/ab_characterisation/pipeline_orchestration.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import typing as t 3 | 4 | import pandas as pd 5 | from loguru import logger 6 | from mpi4py import MPI 7 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig, save_output 8 | 9 | from ab_characterisation.filter_steps import ( 10 | find_top_n, rosetta_antibody_filter, sequence_liability_filter, tap_filter 11 | ) 12 | from ab_characterisation.rosetta_steps import rosetta_antibody_step, rosetta_complex_step 13 | from ab_characterisation.sequence_steps import sequence_liability_check 14 | from ab_characterisation.structure_steps import run_abb2, run_chimerax_superposition, run_tap 15 | 16 | 17 | def get_objects(config: RunConfig) -> list[BiologicsData]: 18 | """ 19 | 20 | Args: 21 | config: 22 | 23 | Returns: 24 | 25 | """ 26 | data_objects = [] 27 | df = pd.read_csv(config.input_file) 28 | for idx, row in df.iterrows(): 29 | data_objects.append( 30 | BiologicsData( 31 | heavy_sequence=row.heavy_sequence, 32 | light_sequence=row.light_sequence, 33 | name=row.sequence_name, 34 | target_complex_reference=row.reference_complex, 35 | ) 36 | ) 37 | return data_objects 38 | 39 | 40 | def filtering_step( 41 | input_data: list[BiologicsData], 42 | step_name: str, 43 | criterion_function: t.Callable, 44 | config: RunConfig, 45 | ) -> list[BiologicsData]: 46 | """ 47 | General framework for a step that performs filtering of the input data, labelling datapoints as discarded if they 48 | fail to pass a filter criterion. 
49 | Args: 50 | input_data: 51 | step_name: 52 | criterion_function: Function mapping BiologicsData -> bool 53 | config: 54 | 55 | Returns: 56 | list of BiologicsData objects 57 | """ 58 | output_data: list[BiologicsData] = [] 59 | filter_count = 0 60 | 61 | for biol_data in input_data: 62 | if biol_data.discarded_by is None: 63 | filtered = criterion_function(biol_data, config) 64 | if filtered: 65 | biol_data.discarded_by = step_name 66 | filter_count += 1 67 | output_data.append(biol_data) 68 | 69 | logger.info(f"{filter_count} datapoints discarded during step {step_name}.") 70 | return output_data 71 | 72 | 73 | def computation_step( 74 | input_data: list[BiologicsData], computation_function: t.Callable, config: RunConfig 75 | ) -> list[BiologicsData]: 76 | """ 77 | General framework for a step that performs computation on the input data, manipulating one or more of the dataclass 78 | fields. 79 | 80 | Args: 81 | input_data: 82 | computation_function: Function mapping BiologicsData -> BiologicsData, modifying the dataclass fields with the 83 | results of the computation 84 | config: 85 | 86 | Returns: 87 | list of BiologicsData objects 88 | """ 89 | 90 | comm = MPI.COMM_WORLD 91 | rank = comm.Get_rank() 92 | size = comm.Get_size() 93 | 94 | # Calculate the chunk size for each process 95 | chunk_size = len(input_data) // size 96 | remainder = len(input_data) % size 97 | 98 | # Calculate the range for the current process 99 | local_start = rank * chunk_size + min(rank, remainder) 100 | local_end = local_start + chunk_size + (1 if rank < remainder else 0) 101 | 102 | # Perform the local computation 103 | local_results: list[BiologicsData] = [] 104 | for biol_data in input_data[local_start:local_end]: 105 | if biol_data.discarded_by is None: 106 | biol_data = computation_function(biol_data, config) 107 | local_results.append(biol_data) 108 | 109 | # Gather the local results at the root process 110 | all_results = comm.gather(local_results, root=0) 111 | 112 | # Combine the results into a single list 113 | output_data: list[BiologicsData] = [] 114 | if rank == 0: 115 | for result_list in all_results: 116 | output_data.extend(result_list) 117 | output_data = comm.bcast(output_data, root=0) 118 | return output_data 119 | 120 | 121 | def pipeline(config: RunConfig, mpi_rank: int, mpi_size: int) -> None: 122 | """ 123 | 124 | Args: 125 | config: 126 | 127 | Returns: 128 | 129 | """ 130 | logger.remove() 131 | if mpi_rank == 0: 132 | logger.add( 133 | sys.stdout, 134 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}", 135 | level="INFO", 136 | ) 137 | else: 138 | logger.add( 139 | sys.stdout, 140 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}", 141 | level="WARNING", 142 | ) 143 | biologics_objects = get_objects(config) 144 | 145 | logger.info("Identifying sequence liabilities") 146 | biologics_objects = computation_step( 147 | biologics_objects, sequence_liability_check, config 148 | ) 149 | logger.info("Filtering by sequence liabilities") 150 | biologics_objects = filtering_step( 151 | biologics_objects, 152 | step_name="liabilities", 153 | criterion_function=sequence_liability_filter, 154 | config=config, 155 | ) 156 | 157 | logger.info("Running ABB2") 158 | biologics_objects = computation_step(biologics_objects, run_abb2, config) 159 | logger.info("Running TAP") 160 | biologics_objects = computation_step(biologics_objects, run_tap, config) 161 | logger.info("Filtering TAP") 162 | biologics_objects = filtering_step(biologics_objects, "tap", tap_filter, config) 163 
| logger.info("Running antibody-only Rosetta analysis") 164 | biologics_objects = computation_step( 165 | biologics_objects, rosetta_antibody_step, config 166 | ) 167 | logger.info("Running filtering based on antibody-only Rosetta analysis") 168 | biologics_objects = filtering_step( 169 | biologics_objects, "rosetta_antibody", rosetta_antibody_filter, config 170 | ) 171 | if not config.exclude_complex_analysis: 172 | logger.info("Running ChimeraX complex generation") 173 | biologics_objects = computation_step( 174 | biologics_objects, run_chimerax_superposition, config 175 | ) 176 | logger.info("Running Rosetta complex analysis") 177 | biologics_objects = computation_step( 178 | biologics_objects, rosetta_complex_step, config 179 | ) 180 | 181 | if mpi_rank == 0: 182 | logger.info("Identifying top N candidates") 183 | biologics_objects = find_top_n(biologics_objects, config) 184 | save_output(biol_data_ls=biologics_objects, config=config) 185 | -------------------------------------------------------------------------------- /src/ab_characterisation/rosetta_steps.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import subprocess 3 | import tempfile 4 | from pathlib import Path 5 | 6 | import pandas as pd 7 | 8 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig 9 | 10 | 11 | def generic_rosetta_step( 12 | biol_data: BiologicsData, 13 | variables: dict[str, str], 14 | template: str, 15 | config: RunConfig, 16 | step_name: str, 17 | replicates: int = 1, 18 | ) -> pd.DataFrame: 19 | """ 20 | Args: 21 | biol_data: 22 | variables: 23 | template: 24 | config: 25 | step_name: 26 | replicates: 27 | 28 | Returns: 29 | 30 | """ 31 | outputs = [] 32 | for replicate in range(replicates): 33 | with tempfile.TemporaryDirectory() as temp_dir: 34 | bash_template_path = ( 35 | Path(__file__).parent / "utils" / "rosetta_templates" / f"{template}.sh" 36 | ) 37 | xml_template_path = ( 38 | Path(__file__).parent 39 | / "utils" 40 | / "rosetta_templates" 41 | / f"{template}.xml" 42 | ) 43 | 44 | with open(bash_template_path) as inf_sh, open( 45 | Path(temp_dir) / f"{template}.sh", "w" 46 | ) as outf_sh: 47 | for line in inf_sh: 48 | for key, value in variables.items(): 49 | line = line.replace(key, value) 50 | outf_sh.write(line) 51 | 52 | shutil.copy(xml_template_path, Path(temp_dir) / f"{template}.xml") 53 | 54 | with open( 55 | config.output_directory 56 | / "logs" 57 | / f"{biol_data.name}_rosetta_{step_name}_{replicate}.log", 58 | "w", 59 | ) as outf: 60 | subprocess.run( 61 | ["bash", f"{template}.sh"], cwd=temp_dir, stdout=outf, stderr=outf 62 | ) 63 | output = pd.read_csv( 64 | Path(temp_dir) / "score.sc", delim_whitespace=True, skiprows=1 65 | ) 66 | output["replicate"] = replicate 67 | outputs.append(output) 68 | return pd.concat(outputs) 69 | 70 | 71 | def rosetta_antibody_step(biol_data: BiologicsData, config: RunConfig) -> BiologicsData: 72 | """ 73 | 74 | Args: 75 | biol_data: 76 | config: 77 | 78 | Returns: 79 | 80 | """ 81 | variables = { 82 | "": str(biol_data.antibody_structure), 83 | "": config.rosetta_base_directory, 84 | } 85 | result_df = generic_rosetta_step( 86 | biol_data, 87 | variables, 88 | "rosetta_metrics_ab_only", 89 | config, 90 | step_name="ab_only", 91 | replicates=config.rosetta_replicates, 92 | ) 93 | biol_data.rosetta_output_ab_only = result_df 94 | result_df.to_csv( 95 | config.output_directory 96 | / "rosetta_output" 97 | / f"{biol_data.name}_rosetta_ab_only.csv" 98 | ) 99 | return biol_data 
100 | 101 | 102 | def rosetta_complex_step(biol_data: BiologicsData, config: RunConfig) -> BiologicsData: 103 | """ 104 | 105 | Args: 106 | biol_data: 107 | config: 108 | 109 | Returns: 110 | 111 | """ 112 | variables = { 113 | "": biol_data.chimerax_complex_structure, 114 | "": config.rosetta_base_directory, 115 | } 116 | result_df = generic_rosetta_step( 117 | biol_data, 118 | variables, 119 | "rosetta_metrics_complex", 120 | config, 121 | step_name="complex", 122 | replicates=config.rosetta_replicates, 123 | ) 124 | biol_data.rosetta_output_complex = result_df 125 | result_df.to_csv( 126 | config.output_directory 127 | / "rosetta_output" 128 | / f"{biol_data.name}_rosetta_complex.csv" 129 | ) 130 | return biol_data 131 | -------------------------------------------------------------------------------- /src/ab_characterisation/sequence_steps.py: -------------------------------------------------------------------------------- 1 | from ab_characterisation.developability_tools.sequence_liabilities.main import scan_single 2 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig 3 | 4 | 5 | def sequence_liability_check( 6 | input_data: BiologicsData, config: RunConfig 7 | ) -> BiologicsData: 8 | """ 9 | 10 | Args: 11 | input_data: 12 | 13 | Returns: 14 | 15 | """ 16 | liabilities = scan_single( 17 | input_data.heavy_sequence, input_data.light_sequence, quiet=True 18 | ) 19 | input_data.sequence_liabilities = liabilities 20 | return input_data 21 | -------------------------------------------------------------------------------- /src/ab_characterisation/structure_steps.py: -------------------------------------------------------------------------------- 1 | from ImmuneBuilder import ABodyBuilder2 2 | 3 | from ab_characterisation.developability_tools.tap.main import run_tap as tap 4 | from ab_characterisation.utils.chimerax_utils import ChimeraInput, ChimeraOutput, run_chimerax 5 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig 6 | 7 | 8 | def run_abb2(biol_data: BiologicsData, config: RunConfig) -> BiologicsData: 9 | """ 10 | Run ABodyBuilder2 on input data, save output and output path. 
11 | Args: 12 | biol_data: 13 | config: 14 | 15 | Returns: 16 | 17 | """ 18 | predictor = ABodyBuilder2() 19 | 20 | sequences = {"H": biol_data.heavy_sequence, "L": biol_data.light_sequence} 21 | 22 | antibody = predictor.predict(sequences) 23 | antibody.save( 24 | str(config.output_directory / "antibody_models" / f"{biol_data.name}_model.pdb") 25 | ) 26 | biol_data.antibody_structure = ( 27 | config.output_directory / "antibody_models" / f"{biol_data.name}_model.pdb" 28 | ).resolve() 29 | return biol_data 30 | 31 | 32 | def run_tap(biol_data: BiologicsData, config: RunConfig) -> BiologicsData: 33 | """ 34 | 35 | Args: 36 | biol_data: 37 | config: 38 | 39 | Returns: 40 | 41 | """ 42 | results = tap(biol_data.antibody_structure, outfile=None, quiet=True) 43 | biol_data.tap_flags = results 44 | return biol_data 45 | 46 | 47 | def run_chimerax_superposition( 48 | biol_data: BiologicsData, config: RunConfig 49 | ) -> BiologicsData: 50 | """ 51 | 52 | Args: 53 | biol_data: 54 | config: 55 | 56 | Returns: 57 | 58 | """ 59 | chimera_input = ChimeraInput( 60 | name=biol_data.name, 61 | template=biol_data.target_complex_reference, 62 | query_ab=biol_data.antibody_structure, 63 | template_ab_chains=biol_data.target_complex_antibody_chains, 64 | map_resolution=config.chimera_map_resolution, 65 | query_ab_chains="HL", 66 | template_ag_chains=biol_data.target_complex_antigen_chains, 67 | output_file=str( 68 | ( 69 | config.output_directory 70 | / "complex_structures" 71 | / f"{biol_data.name}_complex.pdb" 72 | ).resolve() 73 | ), 74 | ) 75 | 76 | chimera_output = run_chimerax(chimera_input, config) 77 | if chimera_output.success: 78 | biol_data.chimerax_complex_structure = chimera_output.output_file 79 | else: 80 | biol_data.discarded_by = "ChimeraX failure" 81 | 82 | return biol_data 83 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/utils/__init__.py -------------------------------------------------------------------------------- /src/ab_characterisation/utils/anarci_region_definition_utils.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from dataclasses import dataclass, field # pylint: disable=C0302 3 | from typing import Dict, List, Tuple 4 | 5 | 6 | @dataclass 7 | class ExtractedRegions: 8 | """ 9 | Class to record numbered antibody sequence with region annotation 10 | """ 11 | 12 | region_numbering: List[Tuple[Tuple[int, str], str, str]] = field( 13 | init=False, repr=False, default_factory=list 14 | ) 15 | region_sequences: Dict[str, str] = field( 16 | init=False, repr=True, default_factory=lambda: defaultdict(str) 17 | ) 18 | 19 | def add_residue( 20 | self, current_regions: str, amino_acid: str, residue: Tuple[int, str] 21 | ) -> None: 22 | """ 23 | Function to record a numbered residue with region annotation 24 | Args: 25 | current_regions: name of the region e.g. cdrh1 26 | amino_acid: one letter amino acid letter 27 | residue: antibody numbered position e.g. (110, " ") 28 | 29 | Returns: 30 | 31 | """ 32 | if self.region_numbering and self.region_numbering[-1][0][0] > residue[0]: 33 | raise AssertionError( 34 | f"Incorrect numbering. 
Previous residue cannot come" 35 | f" after the added one: {self.region_numbering[-1][0]} and {residue}" 36 | ) 37 | self.region_numbering.append((residue, amino_acid, current_regions)) 38 | self.region_sequences[current_regions] += ( 39 | amino_acid if amino_acid != "-" else "" 40 | ) 41 | 42 | 43 | def define_imgt_regions() -> Dict[str, str]: 44 | """ 45 | Antibody region definition according to imgt scheme 46 | """ 47 | heavy_region = ( 48 | "1" * 26 + "2" * 12 + "3" * 17 + "4" * 10 + "5" * 39 + "6" * 13 + "7" * 11 49 | ) 50 | light_region = heavy_region 51 | assert len(heavy_region) == 128 52 | 53 | return {"H": heavy_region, "L": light_region} 54 | 55 | 56 | def define_chothia_regions() -> Dict[str, str]: 57 | light_region = ( 58 | "1" * 23 + "2" * 17 + "3" * 15 + "4" * 14 + "5" * 35 + "6" * 13 + "7" * 11 59 | ) 60 | 61 | heavy_region = ( 62 | "1" * 26 + "2" * 11 + "3" * 19 + "4" * 8 + "5" * 42 + "6" * 11 + "7" * 11 63 | ) 64 | for reg in [light_region, heavy_region]: 65 | assert len(reg) == 128 66 | 67 | return {"H": heavy_region, "L": light_region} 68 | 69 | 70 | def define_kabat_regions() -> Dict[str, str]: 71 | light_region = ( 72 | "1" * 23 + "2" * 17 + "3" * 15 + "4" * 14 + "5" * 35 + "6" * 13 + "7" * 11 73 | ) 74 | heavy_region = ( 75 | "1" * 35 + "2" * 5 + "3" * 14 + "4" * 20 + "5" * 32 + "6" * 11 + "7" * 11 76 | ) 77 | 78 | for reg in [light_region, heavy_region]: 79 | assert len(reg) == 128 80 | 81 | return {"H": heavy_region, "L": light_region} 82 | 83 | 84 | def define_contact_regions() -> Dict[str, str]: 85 | light_region = ( 86 | "1" * 35 + "2" * 7 + "3" * 9 + "4" * 17 + "5" * 36 + "6" * 12 + "7" * 12 87 | ) 88 | heavy_region = ( 89 | "1" * 30 + "2" * 10 + "3" * 11 + "4" * 15 + "5" * 38 + "6" * 12 + "7" * 12 90 | ) 91 | for reg in [light_region, heavy_region]: 92 | assert len(reg) == 128 93 | 94 | return {"H": heavy_region, "L": light_region} 95 | 96 | 97 | def define_north_regions() -> Dict[str, str]: 98 | light_region = ( 99 | "1" * 23 + "2" * 17 + "3" * 14 + "4" * 15 + "5" * 35 + "6" * 13 + "7" * 11 100 | ) 101 | heavy_region = ( 102 | "1" * 23 + "2" * 17 + "3" * 14 + "4" * 12 + "5" * 38 + "6" * 13 + "7" * 11 103 | ) 104 | for reg in [light_region, heavy_region]: 105 | assert len(reg) == 128 106 | 107 | return {"H": heavy_region, "L": light_region} 108 | 109 | 110 | _regions = {} 111 | for scheme, define_function in [ 112 | ("imgt", define_imgt_regions), 113 | ("chothia", define_chothia_regions), 114 | ("kabat", define_kabat_regions), 115 | ("north", define_north_regions), 116 | ("contact", define_contact_regions), 117 | ]: 118 | _regions[scheme] = define_function() 119 | 120 | # For internal use only. These are not direct conversions and are handled heuristically. 
121 | _index_to_imgt_state = { 122 | ("chothia", "H"): { 123 | 1: 0, 124 | 2: 1, 125 | 3: 2, 126 | 4: 3, 127 | 5: 4, 128 | 6: 6, 129 | 7: 7, 130 | 8: 8, 131 | 9: 9, 132 | 10: 10, 133 | 11: 11, 134 | 12: 12, 135 | 13: 13, 136 | 14: 14, 137 | 15: 15, 138 | 16: 16, 139 | 17: 17, 140 | 18: 18, 141 | 19: 19, 142 | 20: 20, 143 | 21: 21, 144 | 22: 22, 145 | 23: 23, 146 | 24: 24, 147 | 25: 25, 148 | 26: 26, 149 | 27: 27, 150 | 28: 28, 151 | 29: 29, 152 | 30: 30, 153 | 31: 35, 154 | 32: 36, 155 | 33: 37, 156 | 34: 38, 157 | 35: 39, 158 | 36: 40, 159 | 37: 41, 160 | 38: 42, 161 | 39: 43, 162 | 40: 44, 163 | 41: 45, 164 | 42: 46, 165 | 43: 47, 166 | 44: 48, 167 | 45: 49, 168 | 46: 50, 169 | 47: 51, 170 | 48: 52, 171 | 49: 53, 172 | 50: 54, 173 | 51: 55, 174 | 52: 59, 175 | 53: 60, 176 | 54: 61, 177 | 55: 62, 178 | 56: 63, 179 | 57: 64, 180 | 58: 65, 181 | 59: 66, 182 | 60: 67, 183 | 61: 68, 184 | 62: 69, 185 | 63: 70, 186 | 64: 72, 187 | 65: 73, 188 | 66: 74, 189 | 67: 75, 190 | 68: 76, 191 | 69: 77, 192 | 70: 78, 193 | 71: 79, 194 | 72: 80, 195 | 73: 81, 196 | 74: 82, 197 | 75: 83, 198 | 76: 84, 199 | 77: 85, 200 | 78: 86, 201 | 79: 87, 202 | 80: 88, 203 | 81: 89, 204 | 82: 93, 205 | 83: 94, 206 | 84: 95, 207 | 85: 96, 208 | 86: 97, 209 | 87: 98, 210 | 88: 99, 211 | 89: 100, 212 | 90: 101, 213 | 91: 102, 214 | 92: 103, 215 | 93: 104, 216 | 94: 105, 217 | 95: 106, 218 | 96: 107, 219 | 97: 108, 220 | 98: 109, 221 | 99: 110, 222 | 100: 114, 223 | 101: 115, 224 | 102: 116, 225 | 103: 117, 226 | 104: 118, 227 | 105: 119, 228 | 106: 120, 229 | 107: 121, 230 | 108: 122, 231 | 109: 123, 232 | 110: 124, 233 | 111: 125, 234 | 112: 126, 235 | 113: 127, 236 | }, 237 | ("kabat", "H"): { 238 | 1: 0, 239 | 2: 1, 240 | 3: 2, 241 | 4: 3, 242 | 5: 4, 243 | 6: 6, 244 | 7: 7, 245 | 8: 8, 246 | 9: 9, 247 | 10: 10, 248 | 11: 11, 249 | 12: 12, 250 | 13: 13, 251 | 14: 14, 252 | 15: 15, 253 | 16: 16, 254 | 17: 17, 255 | 18: 18, 256 | 19: 19, 257 | 20: 20, 258 | 21: 21, 259 | 22: 22, 260 | 23: 23, 261 | 24: 24, 262 | 25: 25, 263 | 26: 26, 264 | 27: 27, 265 | 28: 28, 266 | 29: 29, 267 | 30: 30, 268 | 31: 31, 269 | 32: 32, 270 | 33: 33, 271 | 34: 34, 272 | 35: 35, 273 | 36: 40, 274 | 37: 41, 275 | 38: 42, 276 | 39: 43, 277 | 40: 44, 278 | 41: 45, 279 | 42: 46, 280 | 43: 47, 281 | 44: 48, 282 | 45: 49, 283 | 46: 50, 284 | 47: 51, 285 | 48: 52, 286 | 49: 53, 287 | 50: 54, 288 | 51: 55, 289 | 52: 59, 290 | 53: 60, 291 | 54: 61, 292 | 55: 62, 293 | 56: 63, 294 | 57: 64, 295 | 58: 65, 296 | 59: 66, 297 | 60: 67, 298 | 61: 68, 299 | 62: 69, 300 | 63: 70, 301 | 64: 72, 302 | 65: 73, 303 | 66: 74, 304 | 67: 75, 305 | 68: 76, 306 | 69: 77, 307 | 70: 78, 308 | 71: 79, 309 | 72: 80, 310 | 73: 81, 311 | 74: 82, 312 | 75: 83, 313 | 76: 84, 314 | 77: 85, 315 | 78: 86, 316 | 79: 87, 317 | 80: 88, 318 | 81: 89, 319 | 82: 93, 320 | 83: 94, 321 | 84: 95, 322 | 85: 96, 323 | 86: 97, 324 | 87: 98, 325 | 88: 99, 326 | 89: 100, 327 | 90: 101, 328 | 91: 102, 329 | 92: 103, 330 | 93: 104, 331 | 94: 105, 332 | 95: 106, 333 | 96: 107, 334 | 97: 108, 335 | 98: 109, 336 | 99: 110, 337 | 100: 114, 338 | 101: 115, 339 | 102: 116, 340 | 103: 117, 341 | 104: 118, 342 | 105: 119, 343 | 106: 120, 344 | 107: 121, 345 | 108: 122, 346 | 109: 123, 347 | 110: 124, 348 | 111: 125, 349 | 112: 126, 350 | 113: 127, 351 | }, 352 | ("imgt", "H"): { 353 | 1: 0, 354 | 2: 1, 355 | 3: 2, 356 | 4: 3, 357 | 5: 4, 358 | 6: 5, 359 | 7: 6, 360 | 8: 7, 361 | 9: 8, 362 | 10: 9, 363 | 11: 10, 364 | 12: 11, 365 | 13: 12, 366 | 14: 13, 367 | 15: 14, 368 | 16: 15, 369 | 17: 16, 370 | 
18: 17, 371 | 19: 18, 372 | 20: 19, 373 | 21: 20, 374 | 22: 21, 375 | 23: 22, 376 | 24: 23, 377 | 25: 24, 378 | 26: 25, 379 | 27: 26, 380 | 28: 27, 381 | 29: 28, 382 | 30: 29, 383 | 31: 30, 384 | 32: 31, 385 | 33: 32, 386 | 34: 33, 387 | 35: 34, 388 | 36: 35, 389 | 37: 36, 390 | 38: 37, 391 | 39: 38, 392 | 40: 39, 393 | 41: 40, 394 | 42: 41, 395 | 43: 42, 396 | 44: 43, 397 | 45: 44, 398 | 46: 45, 399 | 47: 46, 400 | 48: 47, 401 | 49: 48, 402 | 50: 49, 403 | 51: 50, 404 | 52: 51, 405 | 53: 52, 406 | 54: 53, 407 | 55: 54, 408 | 56: 55, 409 | 57: 56, 410 | 58: 57, 411 | 59: 58, 412 | 60: 59, 413 | 61: 60, 414 | 62: 61, 415 | 63: 62, 416 | 64: 63, 417 | 65: 64, 418 | 66: 65, 419 | 67: 66, 420 | 68: 67, 421 | 69: 68, 422 | 70: 69, 423 | 71: 70, 424 | 72: 71, 425 | 73: 72, 426 | 74: 73, 427 | 75: 74, 428 | 76: 75, 429 | 77: 76, 430 | 78: 77, 431 | 79: 78, 432 | 80: 79, 433 | 81: 80, 434 | 82: 81, 435 | 83: 82, 436 | 84: 83, 437 | 85: 84, 438 | 86: 85, 439 | 87: 86, 440 | 88: 87, 441 | 89: 88, 442 | 90: 89, 443 | 91: 90, 444 | 92: 91, 445 | 93: 92, 446 | 94: 93, 447 | 95: 94, 448 | 96: 95, 449 | 97: 96, 450 | 98: 97, 451 | 99: 98, 452 | 100: 99, 453 | 101: 100, 454 | 102: 101, 455 | 103: 102, 456 | 104: 103, 457 | 105: 104, 458 | 106: 105, 459 | 107: 106, 460 | 108: 107, 461 | 109: 108, 462 | 110: 109, 463 | 111: 110, 464 | 112: 111, 465 | 113: 112, 466 | 114: 113, 467 | 115: 114, 468 | 116: 115, 469 | 117: 116, 470 | 118: 117, 471 | 119: 118, 472 | 120: 119, 473 | 121: 120, 474 | 122: 121, 475 | 123: 122, 476 | 124: 123, 477 | 125: 124, 478 | 126: 125, 479 | 127: 126, 480 | 128: 127, 481 | }, 482 | ("chothia", "L"): { 483 | 1: 0, 484 | 2: 1, 485 | 3: 2, 486 | 4: 3, 487 | 5: 4, 488 | 6: 5, 489 | 7: 6, 490 | 8: 7, 491 | 9: 8, 492 | 10: 9, 493 | 11: 10, 494 | 12: 11, 495 | 13: 12, 496 | 14: 13, 497 | 15: 14, 498 | 16: 15, 499 | 17: 16, 500 | 18: 17, 501 | 19: 18, 502 | 20: 19, 503 | 21: 20, 504 | 22: 21, 505 | 23: 22, 506 | 24: 23, 507 | 25: 24, 508 | 26: 25, 509 | 27: 26, 510 | 28: 27, 511 | 29: 28, 512 | 30: 35, 513 | 31: 36, 514 | 32: 37, 515 | 33: 38, 516 | 34: 39, 517 | 35: 40, 518 | 36: 41, 519 | 37: 42, 520 | 38: 43, 521 | 39: 44, 522 | 40: 45, 523 | 41: 46, 524 | 42: 47, 525 | 43: 48, 526 | 44: 49, 527 | 45: 50, 528 | 46: 51, 529 | 47: 52, 530 | 48: 53, 531 | 49: 54, 532 | 50: 55, 533 | 51: 56, 534 | 52: 57, 535 | 53: 65, 536 | 54: 66, 537 | 55: 67, 538 | 56: 68, 539 | 57: 69, 540 | 58: 70, 541 | 59: 72, 542 | 60: 73, 543 | 61: 74, 544 | 62: 75, 545 | 63: 76, 546 | 64: 77, 547 | 65: 78, 548 | 66: 81, 549 | 67: 82, 550 | 68: 83, 551 | 69: 84, 552 | 70: 85, 553 | 71: 86, 554 | 72: 87, 555 | 73: 88, 556 | 74: 89, 557 | 75: 90, 558 | 76: 91, 559 | 77: 92, 560 | 78: 93, 561 | 79: 94, 562 | 80: 95, 563 | 81: 96, 564 | 82: 97, 565 | 83: 98, 566 | 84: 99, 567 | 85: 100, 568 | 86: 101, 569 | 87: 102, 570 | 88: 103, 571 | 89: 104, 572 | 90: 105, 573 | 91: 106, 574 | 92: 107, 575 | 93: 108, 576 | 94: 109, 577 | 95: 114, 578 | 96: 115, 579 | 97: 116, 580 | 98: 117, 581 | 99: 118, 582 | 100: 119, 583 | 101: 120, 584 | 102: 121, 585 | 103: 122, 586 | 104: 123, 587 | 105: 124, 588 | 106: 125, 589 | 107: 126, 590 | 108: 127, 591 | }, 592 | ("martin", "H"): { 593 | 1: 0, 594 | 2: 1, 595 | 3: 2, 596 | 4: 3, 597 | 5: 4, 598 | 6: 5, 599 | 7: 6, 600 | 8: 8, 601 | 9: 9, 602 | 10: 10, 603 | 11: 11, 604 | 12: 12, 605 | 13: 13, 606 | 14: 14, 607 | 15: 15, 608 | 16: 16, 609 | 17: 17, 610 | 18: 18, 611 | 19: 19, 612 | 20: 20, 613 | 21: 21, 614 | 22: 22, 615 | 23: 23, 616 | 24: 24, 617 | 25: 25, 618 | 26: 26, 619 | 
27: 27, 620 | 28: 28, 621 | 29: 29, 622 | 30: 30, 623 | 31: 35, 624 | 32: 36, 625 | 33: 37, 626 | 34: 38, 627 | 35: 39, 628 | 36: 40, 629 | 37: 41, 630 | 38: 42, 631 | 39: 43, 632 | 40: 44, 633 | 41: 45, 634 | 42: 46, 635 | 43: 47, 636 | 44: 48, 637 | 45: 49, 638 | 46: 50, 639 | 47: 51, 640 | 48: 52, 641 | 49: 53, 642 | 50: 54, 643 | 51: 55, 644 | 52: 59, 645 | 53: 60, 646 | 54: 61, 647 | 55: 62, 648 | 56: 63, 649 | 57: 64, 650 | 58: 65, 651 | 59: 66, 652 | 60: 67, 653 | 61: 68, 654 | 62: 69, 655 | 63: 70, 656 | 64: 72, 657 | 65: 73, 658 | 66: 74, 659 | 67: 75, 660 | 68: 76, 661 | 69: 77, 662 | 70: 78, 663 | 71: 79, 664 | 72: 83, 665 | 73: 84, 666 | 74: 85, 667 | 75: 86, 668 | 76: 87, 669 | 77: 88, 670 | 78: 89, 671 | 79: 90, 672 | 80: 91, 673 | 81: 92, 674 | 82: 93, 675 | 83: 94, 676 | 84: 95, 677 | 85: 96, 678 | 86: 97, 679 | 87: 98, 680 | 88: 99, 681 | 89: 100, 682 | 90: 101, 683 | 91: 102, 684 | 92: 103, 685 | 93: 104, 686 | 94: 105, 687 | 95: 106, 688 | 96: 107, 689 | 97: 108, 690 | 98: 109, 691 | 99: 110, 692 | 100: 114, 693 | 101: 115, 694 | 102: 116, 695 | 103: 117, 696 | 104: 118, 697 | 105: 119, 698 | 106: 120, 699 | 107: 121, 700 | 108: 122, 701 | 109: 123, 702 | 110: 124, 703 | 111: 125, 704 | 112: 126, 705 | 113: 127, 706 | }, 707 | ("kabat", "L"): { 708 | 1: 0, 709 | 2: 1, 710 | 3: 2, 711 | 4: 3, 712 | 5: 4, 713 | 6: 5, 714 | 7: 6, 715 | 8: 7, 716 | 9: 8, 717 | 10: 9, 718 | 11: 10, 719 | 12: 11, 720 | 13: 12, 721 | 14: 13, 722 | 15: 14, 723 | 16: 15, 724 | 17: 16, 725 | 18: 17, 726 | 19: 18, 727 | 20: 19, 728 | 21: 20, 729 | 22: 21, 730 | 23: 22, 731 | 24: 23, 732 | 25: 24, 733 | 26: 25, 734 | 27: 32, 735 | 28: 33, 736 | 29: 34, 737 | 30: 35, 738 | 31: 36, 739 | 32: 37, 740 | 33: 38, 741 | 34: 39, 742 | 35: 40, 743 | 36: 41, 744 | 37: 42, 745 | 38: 43, 746 | 39: 44, 747 | 40: 45, 748 | 41: 46, 749 | 42: 47, 750 | 43: 48, 751 | 44: 49, 752 | 45: 50, 753 | 46: 51, 754 | 47: 52, 755 | 48: 53, 756 | 49: 54, 757 | 50: 55, 758 | 51: 56, 759 | 52: 57, 760 | 53: 65, 761 | 54: 66, 762 | 55: 67, 763 | 56: 68, 764 | 57: 69, 765 | 58: 70, 766 | 59: 72, 767 | 60: 73, 768 | 61: 74, 769 | 62: 75, 770 | 63: 76, 771 | 64: 77, 772 | 65: 78, 773 | 66: 81, 774 | 67: 82, 775 | 68: 83, 776 | 69: 84, 777 | 70: 85, 778 | 71: 86, 779 | 72: 87, 780 | 73: 88, 781 | 74: 89, 782 | 75: 90, 783 | 76: 91, 784 | 77: 92, 785 | 78: 93, 786 | 79: 94, 787 | 80: 95, 788 | 81: 96, 789 | 82: 97, 790 | 83: 98, 791 | 84: 99, 792 | 85: 100, 793 | 86: 101, 794 | 87: 102, 795 | 88: 103, 796 | 89: 104, 797 | 90: 105, 798 | 91: 106, 799 | 92: 107, 800 | 93: 108, 801 | 94: 109, 802 | 95: 114, 803 | 96: 115, 804 | 97: 116, 805 | 98: 117, 806 | 99: 118, 807 | 100: 119, 808 | 101: 120, 809 | 102: 121, 810 | 103: 122, 811 | 104: 123, 812 | 105: 124, 813 | 106: 125, 814 | 107: 126, 815 | 108: 127, 816 | }, 817 | ("imgt", "L"): { 818 | 1: 0, 819 | 2: 1, 820 | 3: 2, 821 | 4: 3, 822 | 5: 4, 823 | 6: 5, 824 | 7: 6, 825 | 8: 7, 826 | 9: 8, 827 | 10: 9, 828 | 11: 10, 829 | 12: 11, 830 | 13: 12, 831 | 14: 13, 832 | 15: 14, 833 | 16: 15, 834 | 17: 16, 835 | 18: 17, 836 | 19: 18, 837 | 20: 19, 838 | 21: 20, 839 | 22: 21, 840 | 23: 22, 841 | 24: 23, 842 | 25: 24, 843 | 26: 25, 844 | 27: 26, 845 | 28: 27, 846 | 29: 28, 847 | 30: 29, 848 | 31: 30, 849 | 32: 31, 850 | 33: 32, 851 | 34: 33, 852 | 35: 34, 853 | 36: 35, 854 | 37: 36, 855 | 38: 37, 856 | 39: 38, 857 | 40: 39, 858 | 41: 40, 859 | 42: 41, 860 | 43: 42, 861 | 44: 43, 862 | 45: 44, 863 | 46: 45, 864 | 47: 46, 865 | 48: 47, 866 | 49: 48, 867 | 50: 49, 868 | 51: 50, 869 | 52: 51, 870 
| 53: 52, 871 | 54: 53, 872 | 55: 54, 873 | 56: 55, 874 | 57: 56, 875 | 58: 57, 876 | 59: 58, 877 | 60: 59, 878 | 61: 60, 879 | 62: 61, 880 | 63: 62, 881 | 64: 63, 882 | 65: 64, 883 | 66: 65, 884 | 67: 66, 885 | 68: 67, 886 | 69: 68, 887 | 70: 69, 888 | 71: 70, 889 | 72: 71, 890 | 73: 72, 891 | 74: 73, 892 | 75: 74, 893 | 76: 75, 894 | 77: 76, 895 | 78: 77, 896 | 79: 78, 897 | 80: 79, 898 | 81: 80, 899 | 82: 81, 900 | 83: 82, 901 | 84: 83, 902 | 85: 84, 903 | 86: 85, 904 | 87: 86, 905 | 88: 87, 906 | 89: 88, 907 | 90: 89, 908 | 91: 90, 909 | 92: 91, 910 | 93: 92, 911 | 94: 93, 912 | 95: 94, 913 | 96: 95, 914 | 97: 96, 915 | 98: 97, 916 | 99: 98, 917 | 100: 99, 918 | 101: 100, 919 | 102: 101, 920 | 103: 102, 921 | 104: 103, 922 | 105: 104, 923 | 106: 105, 924 | 107: 106, 925 | 108: 107, 926 | 109: 108, 927 | 110: 109, 928 | 111: 110, 929 | 112: 111, 930 | 113: 112, 931 | 114: 113, 932 | 115: 114, 933 | 116: 115, 934 | 117: 116, 935 | 118: 117, 936 | 119: 118, 937 | 120: 119, 938 | 121: 120, 939 | 122: 121, 940 | 123: 122, 941 | 124: 123, 942 | 125: 124, 943 | 126: 125, 944 | 127: 126, 945 | 128: 127, 946 | }, 947 | ("martin", "L"): { 948 | 1: 0, 949 | 2: 1, 950 | 3: 2, 951 | 4: 3, 952 | 5: 4, 953 | 6: 5, 954 | 7: 6, 955 | 8: 7, 956 | 9: 8, 957 | 10: 9, 958 | 11: 10, 959 | 12: 11, 960 | 13: 12, 961 | 14: 13, 962 | 15: 14, 963 | 16: 15, 964 | 17: 16, 965 | 18: 17, 966 | 19: 18, 967 | 20: 19, 968 | 21: 20, 969 | 22: 21, 970 | 23: 22, 971 | 24: 23, 972 | 25: 24, 973 | 26: 25, 974 | 27: 26, 975 | 28: 27, 976 | 29: 28, 977 | 30: 35, 978 | 31: 36, 979 | 32: 37, 980 | 33: 38, 981 | 34: 39, 982 | 35: 40, 983 | 36: 41, 984 | 37: 42, 985 | 38: 43, 986 | 39: 44, 987 | 40: 45, 988 | 41: 46, 989 | 42: 47, 990 | 43: 48, 991 | 44: 49, 992 | 45: 50, 993 | 46: 51, 994 | 47: 52, 995 | 48: 53, 996 | 49: 54, 997 | 50: 55, 998 | 51: 56, 999 | 52: 57, 1000 | 53: 65, 1001 | 54: 66, 1002 | 55: 67, 1003 | 56: 68, 1004 | 57: 69, 1005 | 58: 70, 1006 | 59: 72, 1007 | 60: 73, 1008 | 61: 74, 1009 | 62: 75, 1010 | 63: 76, 1011 | 64: 77, 1012 | 65: 78, 1013 | 66: 81, 1014 | 67: 82, 1015 | 68: 83, 1016 | 69: 84, 1017 | 70: 85, 1018 | 71: 86, 1019 | 72: 87, 1020 | 73: 88, 1021 | 74: 89, 1022 | 75: 90, 1023 | 76: 91, 1024 | 77: 92, 1025 | 78: 93, 1026 | 79: 94, 1027 | 80: 95, 1028 | 81: 96, 1029 | 82: 97, 1030 | 83: 98, 1031 | 84: 99, 1032 | 85: 100, 1033 | 86: 101, 1034 | 87: 102, 1035 | 88: 103, 1036 | 89: 104, 1037 | 90: 105, 1038 | 91: 106, 1039 | 92: 107, 1040 | 93: 108, 1041 | 94: 109, 1042 | 95: 114, 1043 | 96: 115, 1044 | 97: 116, 1045 | 98: 117, 1046 | 99: 118, 1047 | 100: 119, 1048 | 101: 120, 1049 | 102: 121, 1050 | 103: 122, 1051 | 104: 123, 1052 | 105: 124, 1053 | 106: 125, 1054 | 107: 126, 1055 | 108: 127, 1056 | }, 1057 | } 1058 | 1059 | # Wolfguy will be deprecated in ANARCI v1.0.0 1060 | wolfguy_indexdiv50_to_region = { 1061 | "H": ["fwh1", "cdrh1", "fwh2", "cdrh2", "fwh3", "cdrh3", "fwh4"], 1062 | "L": ["fwl1", "cdrl1", "fwl2", "cdrl2", "fwl3", "cdrl3", "fwl4"], 1063 | } 1064 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/anarci_utils.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import Optional, Union 3 | 4 | from ab_characterisation.utils.anarci_region_definition_utils import (_index_to_imgt_state, 5 | _regions) 6 | 7 | _reg_one2three = { 8 | "1": "fw%s1", 9 | "2": "cdr%s1", 10 | "3": "fw%s2", 11 | "4": "cdr%s2", 12 | "5": "fw%s3", 13 | "6": 
"cdr%s3", 14 | "7": "fw%s4", 15 | } 16 | 17 | 18 | @dataclass 19 | class Accept: # pylint: disable=R0902 20 | """ 21 | Class that taken ANARCI numbering and classifies each position according to antibody region 22 | """ 23 | 24 | _defined_regions: list[str] = field( 25 | init=False, 26 | repr=False, 27 | default_factory=lambda: [ 28 | "fwh1", 29 | "fwh2", 30 | "fwh3", 31 | "fwh4", 32 | "fwl1", 33 | "fwl2", 34 | "fwl3", 35 | "fwl4", 36 | "cdrh1", 37 | "cdrh2", 38 | "cdrh3", 39 | "cdrl1", 40 | "cdrl2", 41 | "cdrl3", 42 | ], 43 | ) 44 | numbering_scheme: str = field(default="imgt") 45 | definition: str = field(default="imgt") 46 | not_defined: bool = field(default=False) 47 | positions: dict[str, set[tuple[int, str]]] = field(init=False) 48 | exclude: dict[str, set[tuple[int, str]]] = field(init=False) 49 | regions: set[str] = field(init=False, default_factory=set) 50 | 51 | def __post_init__(self) -> None: 52 | 53 | self._macro_regions = { 54 | "hframework": {"fwh1", "fwh2", "fwh3", "fwh4"}, 55 | "hcdrs": {"cdrh1", "cdrh2", "cdrh3"}, 56 | "lframework": {"fwl1", "fwl2", "fwl3", "fwl4"}, 57 | "lcdrs": {"cdrl1", "cdrl2", "cdrl3"}, 58 | } 59 | self._macro_regions.update( 60 | { 61 | "framework": self._macro_regions["hframework"] 62 | | self._macro_regions["lframework"], 63 | "cdrs": self._macro_regions["hcdrs"] | self._macro_regions["lcdrs"], 64 | "vh": self._macro_regions["hcdrs"] | self._macro_regions["hframework"], 65 | "vl": self._macro_regions["lcdrs"] | self._macro_regions["lframework"], 66 | } 67 | ) 68 | 69 | self._macro_regions.update( 70 | {"fv": self._macro_regions["vh"] | self._macro_regions["vl"]} 71 | ) 72 | 73 | self.positions = {"H": set(), "L": set()} 74 | self.exclude = {"H": set(), "L": set()} 75 | 76 | def set_regions(self, regions: Union[list, str, None] = None) -> None: 77 | """ 78 | Set the regions to be used. Will clear anything added using add regions. 79 | """ 80 | if not regions: 81 | raise AssertionError( 82 | f"Need to specify a list of regions: {self._defined_regions}" 83 | ) 84 | 85 | if isinstance(regions, str): 86 | regions = [regions] 87 | 88 | if self.not_defined: 89 | self.regions = self._macro_regions["fv"] 90 | else: 91 | self.regions = set() 92 | 93 | self.add_regions(regions) 94 | 95 | def add_regions(self, regions: list) -> None: 96 | """ 97 | Add regions to the selection. 98 | """ 99 | for region in regions: 100 | region = region.lower() 101 | if region in self._defined_regions: 102 | if self.not_defined: 103 | self.regions = self.regions - set([region]) 104 | else: 105 | self.regions.add(region) 106 | elif region in self._macro_regions: 107 | if self.not_defined: 108 | self.regions = self.regions - self._macro_regions[region] 109 | else: 110 | self.regions = self.regions | self._macro_regions[region] 111 | else: 112 | raise AssertionError( 113 | f"Got unexpected region: {region}. 
Allowed: {self._defined_regions} " 114 | ) 115 | 116 | def add_positions(self, positions: list[tuple[int, str]], chain: str) -> None: 117 | for position in positions: 118 | self.positions[chain].add(position) 119 | 120 | def exclude_positions(self, positions: list[tuple[int, str]], chain: str) -> None: 121 | for position in positions: 122 | self.exclude[chain].add(position) 123 | 124 | def accept(self, position: tuple[int, str], chain: str) -> Optional[int]: 125 | if position in self.exclude[chain]: 126 | return None 127 | if ( 128 | get_region(position, chain, self.numbering_scheme, self.definition) 129 | in self.regions 130 | or position in self.positions[chain] 131 | ): 132 | return 1 133 | return None 134 | 135 | 136 | def get_region( # pylint: disable=R0911 137 | position: tuple[int, str], 138 | chain: str, 139 | numbering_scheme: str = "imgt", 140 | definition: str = "imgt", 141 | ) -> str: 142 | """ 143 | Get the region in which the position belongs given the chain, numbering scheme and definition. 144 | **Note** this function does not know about insertions on the sequence. Therefore, it will get the region annotation 145 | wrong when using non-equivalent scheme-definitions. 146 | To get around this please use the annotate_regions function 147 | which implements heuristics to get the definition correct 148 | in the scheme. 149 | """ 150 | 151 | if numbering_scheme == "wolfguy" or definition == "wolfguy": 152 | raise NotImplementedError( 153 | "Wolguy cdr/framework identification is not implemented" 154 | ) 155 | 156 | index, insertion = position 157 | chain = chain.upper() 158 | 159 | # Horrible exception cases revolving around the kabat scheme/definition and cdr h1 160 | # Kabat numbering scheme will be deprecated in ANARCI v1.0.0 161 | if definition == "kabat": 162 | if ( 163 | numbering_scheme == "kabat" and chain == "H" and 31 <= index < 36 164 | ): # Kabat scheme kabat definition. 165 | if index == 35: 166 | if insertion in " AB": # Position 31 to 35B 167 | return "cdrh1" 168 | 169 | return "fwh2" # 31C would be framework. 170 | 171 | return "cdrh1" 172 | if numbering_scheme == "kabat": # Kabat numbering, chothia or imgt definitions. 173 | if definition == "chothia" and chain == "H" and 33 <= index < 36: 174 | return "fwh2" 175 | if definition == "imgt" and chain == "H" and 34 <= index < 36: 176 | return "fwh2" 177 | 178 | try: 179 | return ( 180 | _reg_one2three[ 181 | _regions[definition][chain][ 182 | _index_to_imgt_state[(numbering_scheme, chain)][index] 183 | ] 184 | ] 185 | % chain.lower() 186 | ) 187 | except KeyError: 188 | return "?" 
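# --- Illustrative usage (editorial sketch; not part of the original anarci_utils.py) ---
# get_region maps an (index, insertion-code) position and a chain identifier to an
# antibody region label. The expected outputs below are read off the _regions and
# _index_to_imgt_state tables defined in anarci_region_definition_utils.py above.
#
#     from ab_characterisation.utils.anarci_utils import get_region
#
#     get_region((105, " "), "H")  # "cdrh3" with the default IMGT scheme and definition
#     get_region((20, " "), "L")   # "fwl1"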
189 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/chimerax_utils.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import tempfile 3 | from dataclasses import dataclass 4 | from pathlib import Path 5 | 6 | from ImmuneBuilder.refine import refine 7 | 8 | from ab_characterisation.utils.data_classes import RunConfig 9 | 10 | 11 | @dataclass 12 | class ChimeraInput: 13 | name: str 14 | template: str 15 | query_ab: str 16 | template_ab_chains: str 17 | map_resolution: float 18 | query_ab_chains: str 19 | template_ag_chains: str 20 | output_file: str 21 | 22 | 23 | @dataclass 24 | class ChimeraOutput: 25 | success: bool 26 | output_file: str 27 | 28 | 29 | def write_script(script_name: str, payload: ChimeraInput) -> None: 30 | """ 31 | 32 | Args: 33 | script_name: 34 | payload: 35 | 36 | Returns: 37 | 38 | """ 39 | with open(script_name, "w") as outf: 40 | outf.write("from chimerax.core.commands import run\n") 41 | outf.write(f"run(session, 'open {payload.template}')\n") 42 | outf.write( 43 | f"run(session, 'molmap /{','.join(list(payload.template_ab_chains))} {payload.map_resolution}')\n" 44 | ) 45 | outf.write(f"run(session, 'open {payload.query_ab}')\n") 46 | outf.write("run(session, 'fitmap #3 inMap #2 search 10')\n") 47 | outf.write( 48 | f"""run(session, "select #3/{','.join(list(payload.query_ab_chains))}#1/{','.join(list(payload.template_ag_chains))}")\n""" 49 | ) 50 | outf.write( 51 | f"""run(session, "save {payload.output_file} format pdb selectedOnly true")\n""" 52 | ) 53 | outf.write("""run(session, "exit")\n""") 54 | 55 | 56 | def run_chimerax(payload: ChimeraInput, config: RunConfig) -> ChimeraOutput: 57 | """ 58 | Use Chimerax to create complex pdb file of the query AB and the target antigen, using the template context to guide 59 | the complex generation. 
60 | Args: 61 | payload: 62 | 63 | Returns: 64 | 65 | """ 66 | with tempfile.NamedTemporaryFile(suffix=".py") as temp_f: 67 | script_name = temp_f.name 68 | write_script(payload=payload, script_name=script_name) 69 | 70 | cmd = ["ChimeraX", "--script", script_name, "--nogui"] 71 | with open( 72 | config.output_directory / "logs" / f"{payload.name}_chimera.log", "w" 73 | ) as outf: 74 | subprocess.run(cmd, check=True, stderr=outf, stdout=outf) 75 | 76 | if not Path(payload.output_file).exists(): 77 | output = ChimeraOutput(output_file=payload.output_file, success=False) 78 | return output 79 | 80 | with open(payload.output_file) as inf: 81 | lines = inf.readlines() 82 | with open(payload.output_file, "w") as outf: 83 | for line in lines: 84 | if line.startswith("ATOM"): 85 | outf.write(line) 86 | 87 | # refinement 88 | refined_output = payload.output_file.replace(".pdb", "_refined.pdb") 89 | success = refine(input_file=payload.output_file, output_file=refined_output) 90 | if not success: 91 | output = ChimeraOutput(output_file=refined_output, success=success) 92 | return output 93 | 94 | output = ChimeraOutput( 95 | output_file=payload.output_file, success=Path(payload.output_file).exists() 96 | ) 97 | return output 98 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/data_classes.py: -------------------------------------------------------------------------------- 1 | import typing as t 2 | from dataclasses import dataclass, field 3 | from pathlib import Path 4 | from typing import Optional 5 | 6 | import numpy as np 7 | import pandas as pd 8 | 9 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import \ 10 | SequenceLiability 11 | from ab_characterisation.utils.rosetta_utils import aggregate_rosetta_metrics 12 | 13 | 14 | @dataclass 15 | class BiologicsData: 16 | """ """ 17 | 18 | heavy_sequence: str 19 | light_sequence: str 20 | name: str 21 | target_complex_reference: str 22 | target_complex_antigen_chains: str = "A" 23 | target_complex_antibody_chains: str = "HL" 24 | antibody_structure: t.Optional[str] = None 25 | discarded_by: t.Optional[str] = None 26 | tap_flags: list = field(default_factory=lambda: []) 27 | sequence_liabilities: list[SequenceLiability] = field(default_factory=lambda: []) 28 | rosetta_output_ab_only: Optional[pd.DataFrame] = None 29 | chimerax_complex_structure: t.Optional[str] = None 30 | rosetta_output_complex: Optional[pd.DataFrame] = None 31 | rank: Optional[int] = None 32 | 33 | 34 | @dataclass 35 | class RunConfig: 36 | """ """ 37 | 38 | input_file: str 39 | output_directory: Path 40 | rosetta_base_directory: str = None 41 | chimera_map_resolution: float = 6.0 42 | dq_sequence_liabilities: list[str] = field( 43 | default_factory=lambda: ["Unpaired cysteine", "N-linked glycosylation"] 44 | ) 45 | top_n: int = 100 46 | rosetta_replicates: int = 1 47 | exclude_complex_analysis: bool = False 48 | 49 | def __post_init__(self): 50 | self.output_directory.mkdir(exist_ok=True) 51 | (self.output_directory / "complex_structures").mkdir(exist_ok=True) 52 | (self.output_directory / "antibody_models").mkdir(exist_ok=True) 53 | (self.output_directory / "logs").mkdir(exist_ok=True) 54 | (self.output_directory / "rosetta_output").mkdir(exist_ok=True) 55 | 56 | 57 | def save_output(biol_data_ls: list[BiologicsData], config: RunConfig) -> None: 58 | row_dicts = [] 59 | for biol_data in biol_data_ls: 60 | row_dict = {} 61 | for key, value in biol_data.__dict__.items(): 62 | if 
isinstance(value, str): 63 | row_dict[key] = value 64 | elif isinstance(value, int): 65 | row_dict[key] = value 66 | elif value is None: 67 | row_dict[key] = np.nan 68 | elif key == "tap_flags": 69 | for tap_metric in value: 70 | row_dict[f"TAP-{tap_metric.metric_name}"] = tap_metric.flag 71 | elif key == "sequence_liabilities": 72 | seq_liab_str = "" 73 | for liability in value: 74 | seq_liab_str += f"{liability.liability_type}-{liability.motif}-{liability.positions_string}|" 75 | row_dict[key] = seq_liab_str 76 | elif key == "rosetta_output_ab_only": 77 | value = aggregate_rosetta_metrics(value) 78 | value.columns = ["ab-" + col for col in value.columns] 79 | for col in value.columns: 80 | row_dict[col] = value[col].iloc[0] 81 | elif key == "rosetta_output_complex": 82 | value = aggregate_rosetta_metrics( 83 | value, metrics=("dG_separated", "total_score") 84 | ) 85 | value.columns = ["complex-" + col for col in value.columns] 86 | for col in value.columns: 87 | row_dict[col] = value[col].iloc[0] 88 | row_dicts.append(row_dict) 89 | pd.DataFrame(row_dicts).to_csv(config.output_directory / "output.csv") 90 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_ab_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROSETTA3= 4 | 5 | $ROSETTA3/main/source/bin/rosetta_scripts.static.linuxgccrelease \ 6 | -database $ROSETTA3/main/database \ 7 | -in:file:s \ 8 | -in:file:native \ 9 | -parser:protocol ./rosetta_metrics_ab_only.xml \ 10 | -beta \ 11 | -include_sugars \ 12 | -alternate_3_letter_codes pdb_sugar \ 13 | -load_PDB_components false \ 14 | -auto_detect_glycan_connections \ 15 | -write_glycan_pdb_codes \ 16 | -output_alternate_atomids \ 17 | -write_pdb_link_records -------------------------------------------------------------------------------- /src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_ab_only.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_complex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ROSETTA3= 4 | 5 | $ROSETTA3/main/source/bin/rosetta_scripts.static.linuxgccrelease \ 6 | -database $ROSETTA3/main/database \ 7 | -in:file:s \ 8 | -in:file:native \ 9 | -parser:protocol ./rosetta_metrics_complex.xml \ 10 | -beta \ 11 | -include_sugars \ 12 | -alternate_3_letter_codes pdb_sugar \ 13 | -load_PDB_components false \ 14 | -auto_detect_glycan_connections \ 15 | -write_glycan_pdb_codes \ 16 | -output_alternate_atomids \ 17 | -write_pdb_link_records -------------------------------------------------------------------------------- /src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_complex.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | -------------------------------------------------------------------------------- /src/ab_characterisation/utils/rosetta_utils.py: 
-------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | 4 | def aggregate_rosetta_metrics( 5 | metric_df: pd.DataFrame, metrics: list[str] = ["dG_separated"] 6 | ) -> pd.DataFrame: 7 | """ 8 | Args: 9 | metric_df: 10 | metrics: 11 | 12 | Returns: 13 | 14 | """ 15 | metric_df = metric_df.select_dtypes("number") 16 | if len(metrics) == 1: 17 | idx = list(metric_df.sort_values(by=metrics[0], ascending=True)[:3].index) 18 | 19 | else: 20 | idx = [] 21 | for metric in metrics: 22 | idx += list(metric_df.sort_values(by=metric, ascending=True)[:2].index) 23 | idx = set(idx) 24 | metric_df = metric_df.loc[list(idx)].mean().to_frame().T 25 | return metric_df 26 | -------------------------------------------------------------------------------- /tests/data/test_pipeline.csv: -------------------------------------------------------------------------------- 1 | sequence_name,heavy_sequence,light_sequence,reference_complex 2 | test1,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb 3 | test2,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb 4 | test3,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb 5 | test4,ELKLVETGGDLVKPGGSLTLSCEASGFTLRTYGMSWVRQTPQMRLEWVASISYGGLLYFSDSVKGRFTISRDIVRNILTLQMSRLRSEDTAIYYCARGTSFVRYFDVWGAGTTVTVSS,EVLLTQTPLSLPVSLGDQASISCRSSQTIVHTNGNTYFEWYLQKPGQSPHLLIYKVSNRLSGVPDRFSGSGSGTDFTLKISRVEAEDLGLYYCFQGSHSPWTFGGGTKLELK,tests/data/test_complex_reference.pdb -------------------------------------------------------------------------------- /tests/integration/chimera_test_script.py: -------------------------------------------------------------------------------- 1 | from chimerax.core.commands import run 2 | run(session, "exit") -------------------------------------------------------------------------------- /tests/integration/test_pipeline.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | from mpi4py import MPI 4 | 5 | from ab_characterisation.pipeline_orchestration import pipeline, RunConfig 6 | 7 | 8 | def test_pipeline(): 9 | comm = MPI.COMM_WORLD 10 | rank = comm.Get_rank() 11 | size = comm.Get_size() 12 | input_file = Path(__file__).parent.parent / "data" / "test_pipeline.csv" 13 | output_dir = Path(__file__).parent.parent / "data" / "ab_characterisation_output" 14 | rosetta_base_directory = os.environ.get('ROSETTA_BASE') 15 | config = RunConfig( 16 | chimera_map_resolution=6, 17 | input_file=str(input_file), 18 | output_directory=output_dir, 19 | rosetta_base_directory=rosetta_base_directory, 20 | ) 21 | pipeline(config, mpi_rank=rank, mpi_size=size) 22 | 23 | 24 | if __name__ == '__main__': 25 | test_pipeline() 26 | --------------------------------------------------------------------------------
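The score aggregation in `src/ab_characterisation/utils/rosetta_utils.py` above condenses per-replicate Rosetta scores by averaging the best-scoring rows (the three lowest for a single metric, or the two lowest per metric when several metrics are given). A minimal sketch of this behaviour on a toy score table; the numbers are invented purely for illustration:
```python
import pandas as pd

from ab_characterisation.utils.rosetta_utils import aggregate_rosetta_metrics

# Toy per-replicate scores (lower is better for Rosetta energies).
scores = pd.DataFrame(
    {
        "dG_separated": [-31.1, -30.2, -28.5, -25.0],
        "total_score": [-955.0, -950.0, -940.0, -920.0],
        "replicate": [0, 1, 2, 3],
    }
)

# Default single metric: the three lowest dG_separated rows are averaged into one row.
print(aggregate_rosetta_metrics(scores))

# Two metrics: the two best rows per metric are pooled (deduplicated) before averaging,
# as used for the complex scores in filter_steps.find_top_n and data_classes.save_output.
print(aggregate_rosetta_metrics(scores, metrics=["dG_separated", "total_score"]))
```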