├── .gitignore
├── LICENSE
├── README.md
├── environment.yml
├── setup.cfg
├── setup.py
├── src
│   └── ab_characterisation
│       ├── __init__.py
│       ├── cli.py
│       ├── developability_tools
│       │   ├── __init__.py
│       │   ├── sequence_liabilities
│       │   │   ├── __init__.py
│       │   │   ├── definitions.py
│       │   │   ├── main.py
│       │   │   ├── outputs.py
│       │   │   ├── scanner_classes.py
│       │   │   └── scanners.py
│       │   ├── sequence_properties
│       │   │   ├── calculations.py
│       │   │   ├── main.py
│       │   │   └── outputs.py
│       │   ├── tap
│       │   │   ├── __init__.py
│       │   │   ├── definitions.py
│       │   │   ├── main.py
│       │   │   ├── metrics
│       │   │   │   ├── __init__.py
│       │   │   │   ├── base_calculator.py
│       │   │   │   ├── hydrophobic_patches.py
│       │   │   │   ├── negative_patches.py
│       │   │   │   ├── positive_patches.py
│       │   │   │   ├── sfvcsp.py
│       │   │   │   └── total_cdr_length.py
│       │   │   ├── outputs.py
│       │   │   ├── psa_executables
│       │   │   │   ├── psa
│       │   │   │   └── psa_mac
│       │   │   └── structure_annotation.py
│       │   └── utils
│       │       ├── input_handling.py
│       │       └── outputs.py
│       ├── filter_steps.py
│       ├── pipeline_orchestration.py
│       ├── rosetta_steps.py
│       ├── sequence_steps.py
│       ├── structure_steps.py
│       └── utils
│           ├── __init__.py
│           ├── anarci_region_definition_utils.py
│           ├── anarci_utils.py
│           ├── chimerax_utils.py
│           ├── data_classes.py
│           ├── rosetta_templates
│           │   ├── rosetta_metrics_ab_only.sh
│           │   ├── rosetta_metrics_ab_only.xml
│           │   ├── rosetta_metrics_complex.sh
│           │   └── rosetta_metrics_complex.xml
│           └── rosetta_utils.py
└── tests
    ├── data
    │   ├── test_complex_reference.pdb
    │   └── test_pipeline.csv
    └── integration
        ├── chimera_test_script.py
        └── test_pipeline.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # These are some examples of commonly ignored file patterns.
2 | # You should customize this list as applicable to your project.
3 | # Learn more about .gitignore:
4 | # https://www.atlassian.com/git/tutorials/saving-changes/gitignore
5 |
6 | # Node artifact files
7 | node_modules/
8 | dist/
9 |
10 | # Compiled Java class files
11 | *.class
12 |
13 | # Compiled Python bytecode
14 | *.py[cod]
15 |
16 | # Log files
17 | *.log
18 |
19 | # Package files
20 | *.jar
21 |
22 | # Maven
23 | target/
24 | dist/
25 |
26 | # JetBrains IDE
27 | .idea/
28 |
29 | # Unit test reports
30 | TEST*.xml
31 |
32 | # Generated by MacOS
33 | .DS_Store
34 |
35 | # Generated by Windows
36 | Thumbs.db
37 |
38 | # Applications
39 | *.app
40 | *.exe
41 | *.war
42 |
43 | # Large media files
44 | *.mp4
45 | *.tiff
46 | *.avi
47 | *.flv
48 | *.mov
49 | *.wmv
50 |
51 |
52 | .cache/
53 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Exscientia
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Antibody characterisation pipeline
2 |
3 | ## Overview
4 | This repository contains code to run the antibody characterisation pipeline described in "Computational design of
5 | developable therapeutic antibodies: efficient traversal of binder landscapes and rescue of escape mutations"
6 | (see citation below).
7 |
8 | ## Installation
9 | ### Environment setup
10 | Use the `environment.yml` file to create a conda environment
11 | ```shell
12 | conda env create -f environment.yml -n ab-characterisation
13 | conda activate ab-characterisation
14 | ```
15 | ### Dependencies
16 | Ensure that you have working installs of the following:
17 |
18 | 1) Rosetta: https://www.rosettacommons.org/demos/latest/tutorials/install_build/install_build
19 |
20 | 2) ChimeraX: https://www.cgl.ucsf.edu/chimerax/download.html
21 |
22 | a) ensure that you can run the basic ChimeraX script in tests/integration:
23 | ```
24 | ChimeraX --script tests/integration/chimera_test_script.py --nogui
25 | ```
26 | b) set environment variable `DEBIAN_FRONTEND="noninteractive"`
27 |
28 | 3) ANARCI: https://github.com/oxpig/ANARCI. Note: on macOS, install the hmmer dependency via brew; otherwise, install it via conda.
29 |
30 | 4) Ensure you have the correct licences for all linked software.
31 |
32 | ## Testing your installation
33 | You can test the installation of the environment using `pytest`.
34 | For this, first set the Rosetta base directory as an environment variable, for example like this:
35 | ```shell
36 | export ROSETTA_BASE=/path/to/rosetta/rosetta.binary.linux.release-315
37 | ```
38 | Then run pytest
39 | ```shell
40 | pytest
41 | ```
42 | This will run an end-to-end example of the pipeline on a set of four antibody sequences (note that, depending on your
43 | setup, this may take around an hour).
44 |
45 | ## Running the pipeline
46 | With the conda environment active, the pipeline can be run as follows:
47 | ```shell
48 | ab-characterisation --input-file tests/data/test_pipeline.csv --rosetta-base-dir $ROSETTA_BASE
49 | ```
50 | (assuming `ROSETTA_BASE` has been set as described above).
51 |
52 | To run the pipeline across multiple processes with MPI, run instead:
53 | ```shell
54 | mpiexec -n N_PROCESSES ab-characterisation --input-file tests/data/test_pipeline.csv --rosetta-base-dir $ROSETTA_BASE
55 | ```
56 |
57 | ```
58 | Usage: ab-characterisation [OPTIONS]
59 |
60 | ╭─ Options ─────────────────────────────────────────────────────────────────────────────────────────────────────────────╮
61 | │ * --input-file TEXT Input .csv file, containing sequence_name, heavy_sequence, light_sequence and │
62 | │ reference_complex columns. [required] │
63 | │ --chimera-resolution FLOAT Resolution of the map used for alignment within ChimeraX. [default: 6.0] │
64 | │ --output-dir TEXT Directory to which output files are written. │
65 | │ [default: ./ab_characterisation_output] │
66 | │ --rosetta-replicates INTEGER How many replicates to run for Rosetta characterisation steps. [default: 1] │
67 | │ * --rosetta-base-dir TEXT Base directory for the Roestta software suite, │
68 | │ e.g. /path/to/rosetta/rosetta.binary.linux.release-315 [required] │
69 | │ --top-n INTEGER Top N candidate antibodies to provide from the provided .csv file of │
70 | │ antibodies [default: 10] │
71 | │ --help Show this message and exit. │
72 | ╰─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────╯
73 | ```
74 | ## Acknowledgements
75 | The antibody characterisation pipeline was developed by researchers and engineers at Exscientia:
76 |
77 | - Frederic Dreyer
78 |
79 | - Constantin Schneider
80 |
81 | - Aleksandr Kovaltsuk
82 |
83 | - Daniel Cutting
84 |
85 | - Matthew J. Byrne
86 |
87 | - Daniel A. Nissley
88 |
89 | - Newton Wahome
90 |
91 | - Henry Kenlay
92 |
93 | - Claire Marks
94 |
95 | - David Errington
96 |
97 | - Richard J. Gildea
98 |
99 | - David Damerell
100 |
101 | - Pedro Tizei
102 |
103 | - Wilawan Bunjobpol
104 |
105 | - Sachin Surade
106 |
107 | - Douglas E. V. Pires
108 |
109 | - Charlotte M. Deane
110 |
111 | ## Citation
112 | If you use this code in your research, please cite the following paper:
113 |
114 | ```
115 | @article{Computational_design_of_developable_therapeutic_antibodies,
116 | author = {Dreyer, Fr{\'e}d{\'e}ric A. and Schneider, Constantin and Kovaltsuk, Aleksandr and Cutting, Daniel and Byrne, Matthew J. and Nissley, Daniel A. and Wahome, Newton and Kenlay, Henry and Marks, Claire and Errington, David and Gildea, Richard J. and Damerell, David and Tizei, Pedro and Bunjobpol, Wilawan and Darby, John F. and Drulyte, Ieva and Hurdiss, Daniel L. and Surade, Sachin and Pires, Douglas E. V. and Deane, Charlotte M.},
117 | title = {Computational design of developable therapeutic antibodies: efficient traversal of binder landscapes and rescue of escape mutations},
118 | year = {2024},
119 | doi = {10.1101/2024.10.03.616038},
120 | eprint = {https://www.biorxiv.org/content/early/2024/10/04/2024.10.03.616038.full.pdf},
121 | journal = {bioRxiv}
122 | }
123 | ```
124 |
--------------------------------------------------------------------------------
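The `--input-file` .csv described in the README above must contain `sequence_name`, `heavy_sequence`, `light_sequence` and `reference_complex` columns (see `tests/data/test_pipeline.csv`). As a minimal illustrative sketch only, with placeholder sequence fragments and assuming `reference_complex` points at a reference complex .pdb file such as the one shipped in `tests/data`, an input file might look like:

```csv
sequence_name,heavy_sequence,light_sequence,reference_complex
candidate_1,EVQLVESGGGLVQPGG...,DIQMTQSPSSLSASVG...,tests/data/test_complex_reference.pdb
candidate_2,QVQLQESGPGLVKPSE...,EIVLTQSPGTLSLSPG...,tests/data/test_complex_reference.pdb
```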
/environment.yml:
--------------------------------------------------------------------------------
1 | channels:
2 | - conda-forge
3 | - bioconda
4 | - pytorch
5 | dependencies:
6 | - python=3.10
7 | - pip
8 | - typer
9 | - pandas
10 | - loguru
11 | - biopython
12 | - openmm
13 | - pdbfixer
14 | - pytorch=2.1.0
15 | - mpi4py
16 | - pytest
17 | - pip:
18 | - Immunebuilder
19 | - -e .
20 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | name = ab-characterisation
3 | description = AB-characterisation
4 | version = 1.0.0
5 | long_description = file: README.md
6 | long_description_content_type = text/markdown
7 | classifiers =
8 | Intended Audience :: Science/Research
9 | Natural Language :: English
10 | Operating System :: OS Independent
11 | Programming Language :: Python
12 | Programming Language :: Python :: 3 :: Only
13 | Programming Language :: Python :: 3.10
14 | Typing :: Typed
15 |
16 | [options]
17 | packages = find_namespace:
18 | package_dir =
19 | =src
20 | python_requires = >= 3.10
21 | include_package_data = True
22 | install_requires =
23 | ImmuneBuilder
24 |
25 | [options.packages.find]
26 | where = src
27 | [options.entry_points]
28 | console_scripts =
29 | ab-characterisation = ab_characterisation.cli:app
30 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | """Legacy file required for editable installs on pip<21.3.1"""
2 | from setuptools import setup
3 |
4 | setup()
5 |
--------------------------------------------------------------------------------
/src/ab_characterisation/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/__init__.py
--------------------------------------------------------------------------------
/src/ab_characterisation/cli.py:
--------------------------------------------------------------------------------
1 | from mpi4py import MPI
2 | from pathlib import Path
3 | import typer
4 |
5 | from ab_characterisation.pipeline_orchestration import RunConfig, pipeline
6 |
7 | app = typer.Typer(
8 | name="ab-characterisation-pipeline",
9 | add_completion=False,
10 | )
11 |
12 |
13 | @app.command()
14 | def run_pipeline(
15 | input_file: str = typer.Option(..., help='Input .csv file, containing sequence_name, heavy_sequence, light_sequence '
16 | 'and reference_complex columns.'),
17 | chimera_resolution: float = typer.Option(6.0, help='Resolution of the map used for alignment within ChimeraX.'),
18 | output_dir: str = typer.Option("./ab_characterisation_output", help='Directory to which output files are written.'),
19 | rosetta_replicates: int = typer.Option(1, help='How many replicates to run for Rosetta characterisation steps.'),
20 |     rosetta_base_dir: str = typer.Option(..., help='Base directory for the Rosetta software suite, e.g. '
21 | '/path/to/rosetta/rosetta.binary.linux.release-315'),
22 |     top_n: int = typer.Option(10, help='Top N candidate antibodies to select from the provided .csv file of antibodies.'),
23 | no_complex_analysis: bool = typer.Option(False, help='If provided, the pipeline does not perform antibody-antigen '
24 | 'complex generation and analysis.')
25 | ):
26 | output_dir = Path(output_dir)
27 | config = RunConfig(
28 | chimera_map_resolution=chimera_resolution,
29 | input_file=input_file,
30 | output_directory=output_dir,
31 | rosetta_base_directory=rosetta_base_dir,
32 | top_n=top_n,
33 | rosetta_replicates=rosetta_replicates,
34 | exclude_complex_analysis=no_complex_analysis,
35 | )
36 | comm = MPI.COMM_WORLD
37 | rank = comm.Get_rank()
38 | size = comm.Get_size()
39 | pipeline(config, mpi_rank=rank, mpi_size=size)
40 |
41 |
42 | if __name__ == "__main__":
43 | app()
44 |
45 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/__init__.py
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_liabilities/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/sequence_liabilities/__init__.py
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_liabilities/definitions.py:
--------------------------------------------------------------------------------
1 | custom_regions = {
2 | "verniers": {
3 | "H": [
4 | (2, " "),
5 | (28, " "),
6 | (29, " "),
7 | (54, " "),
8 | (55, " "),
9 | (78, " "),
10 | (88, " "),
11 | (105, " "),
12 | (106, " "),
13 | (118, " "),
14 | ],
15 | "L": [
16 | (4, " "),
17 | (27, " "),
18 | (28, " "),
19 | (29, " "),
20 | (30, " "),
21 | (31, " "),
22 | (32, " "),
23 | (33, " "),
24 | (34, " "),
25 | (35, " "),
26 | (36, " "),
27 | (41, " "),
28 | (42, " "),
29 | (52, " "),
30 | (53, " "),
31 | (55, " "),
32 | (84, " "),
33 | (94, " "),
34 | (118, " "),
35 | ],
36 | },
37 | }
38 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_liabilities/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from typing import Optional
3 |
4 | from loguru import logger
5 |
6 | from ab_characterisation.developability_tools.utils.input_handling import get_numbering
7 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import SequenceLiability
8 | from ab_characterisation.developability_tools.sequence_liabilities.scanners import (asparagine_deamidation_scanner,
9 | aspartic_acid_isomeration_scanner,
10 | cd11c_cd18_binding_scanner, fragmentation_scanner,
11 | integrin_binding_scanner, lysine_glycation_scanner,
12 | methionine_oxidation_scanner,
13 | n_linked_glycosylation_scanner,
14 | n_terminal_glutamate_scanner,
15 | tryptophan_oxidation_scanner, unpaired_cysteine_scanner)
16 |
17 | logger.remove()
18 | logger.add(sys.stderr, format="{message}")
19 |
20 |
21 | scanner_list = [
22 | unpaired_cysteine_scanner,
23 | n_linked_glycosylation_scanner,
24 | methionine_oxidation_scanner,
25 | tryptophan_oxidation_scanner,
26 | asparagine_deamidation_scanner,
27 | aspartic_acid_isomeration_scanner,
28 | lysine_glycation_scanner,
29 | integrin_binding_scanner,
30 | cd11c_cd18_binding_scanner,
31 | fragmentation_scanner,
32 | n_terminal_glutamate_scanner,
33 | ]
34 |
35 |
36 | def scan_single(
37 | heavy_sequence: Optional[str], light_sequence: Optional[str], quiet: bool = False
38 | ) -> list[SequenceLiability]:
39 | """
40 | Scans the sequence of an antibody for potential liabilities.
41 |
42 | Args:
43 | heavy_sequence: the amino acid sequence of the antibody heavy chain
44 | light_sequence: the amino acid sequence of the antibody light chain
45 |
46 | Returns:
47 | a list of identified sequence liabilities.
48 | """
49 |
50 | numbering_dict = {}
51 | if heavy_sequence:
52 | numbering_dict["H"] = get_numbering(heavy_sequence, "H")
53 | if light_sequence:
54 | numbering_dict["L"] = get_numbering(light_sequence, "L")
55 |
56 | liabilities = []
57 | for scanner in scanner_list:
58 | liabilities += scanner.scan(numbering_dict, quiet=quiet)
59 |
60 | return liabilities
61 |
--------------------------------------------------------------------------------
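A minimal usage sketch for `scan_single` as defined above (not part of the repository; the sequence strings are placeholders and a working ANARCI installation is assumed):

```python
# Illustrative sketch only: scanning one antibody for sequence liabilities via scan_single.
from ab_characterisation.developability_tools.sequence_liabilities.main import scan_single

heavy_sequence = "..."  # placeholder: full VH amino acid sequence
light_sequence = "..."  # placeholder: full VL amino acid sequence

liabilities = scan_single(heavy_sequence, light_sequence, quiet=True)
for liability in liabilities:
    print(f"{liability.liability_type}: {liability.motif} at {liability.positions_string}")
```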
/src/ab_characterisation/developability_tools/sequence_liabilities/outputs.py:
--------------------------------------------------------------------------------
1 | from loguru import logger
2 |
3 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import \
4 | SequenceLiability
5 | from ab_characterisation.developability_tools.utils.outputs import write_file
6 |
7 |
8 | def display_results(liabilities: list[SequenceLiability]) -> None:
9 | """
10 | Nicely prints the identified liabilities to the terminal.
11 |
12 | Args:
13 | liabilities: the list of identified sequence liabilities.
14 | """
15 | color = "red" if liabilities else "green"
16 | logger.opt(colors=True).info(
17 |         f"\n<{color}>{len(liabilities)} liabilities were found</{color}>"
18 | )
19 | for lia in liabilities:
20 | logger.opt(colors=True).info(
21 | f"{lia.liability_type} - residue motif {lia.motif}, position(s) {lia.positions_string}"
22 | )
23 | return
24 |
25 |
26 | def write_liabilities_to_csv(
27 | liabilities: list[SequenceLiability], filepath: str
28 | ) -> None:
29 | """
30 | Writes a list of identified sequence liabilities to a file in .csv format.
31 |
32 | Args:
33 | liabilities: the list of identified sequence liabilities
34 | filepath: the path to the output file. Can be an S3 path.
35 | """
36 | outstr = "Liability,Motif,Positions\n"
37 | for liability in liabilities:
38 | outstr += f"{liability.liability_type},{liability.motif},{liability.positions_string}\n"
39 |
40 | write_file(outstr, filepath)
41 |
42 | return
43 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_liabilities/scanner_classes.py:
--------------------------------------------------------------------------------
1 | import re
2 | from abc import ABC, abstractmethod
3 | from dataclasses import dataclass
4 | from typing import List, Optional
5 |
6 | from loguru import logger
7 |
8 | from ab_characterisation.developability_tools.sequence_liabilities.definitions import \
9 | custom_regions
10 | from ab_characterisation.utils.anarci_utils import Accept
11 |
12 |
13 | @dataclass
14 | class Position:
15 | chain: str
16 | number: int
17 | ins_code: str
18 |
19 | def to_string(self) -> str:
20 | if self.ins_code != " ":
21 | return f"{self.chain}{self.number}{self.ins_code}"
22 | return f"{self.chain}{self.number}"
23 |
24 |
25 | @dataclass
26 | class SequenceLiability:
27 | liability_type: str
28 | motif: str
29 | positions: list[Position]
30 |
31 | @property
32 | def positions_string(self) -> str:
33 | return "-".join([pos.to_string() for pos in self.positions])
34 |
35 |
36 | @dataclass
37 | class BaseScannerDataclassMixin:
38 | name: str
39 | description: str
40 |
41 |
42 | class BaseScanner(ABC, BaseScannerDataclassMixin):
43 | name: str
44 | description: str
45 |
46 | @abstractmethod
47 | def scan(
48 | self,
49 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]],
50 | quiet: bool = False,
51 | ) -> List[SequenceLiability]:
52 | """
53 | Scans an input sequence for liabilities.
54 |
55 | Args:
56 | numbering_dict: a dictionary of ANARCI numberings
57 | e.g. {"H": [((1, ' '), 'E'), ((2, ' '), 'L'), ((3, ' '), 'K'), ...],
58 | "L": [((1, ' '), 'D'), ((2, ' '), 'V'), ((3, ' '), 'L'), ...]}
59 |
60 | Returns:
61 | a list of identified liabilities.
62 | """
63 |
64 |
65 | @dataclass
66 | class RegexScanner(BaseScanner):
67 | regions: list[str]
68 | regex_search_string: str
69 | ignored_positions: Optional[list[tuple[int, str]]] = None
70 |
71 | def __post_init__(self) -> None:
72 | self.regex_pattern = re.compile(self.regex_search_string)
73 |
74 | def _get_acceptor(self, chain: str) -> Accept:
75 | acceptor = Accept(numbering_scheme="imgt", definition="imgt")
76 | for region in self.regions:
77 | if region in custom_regions:
78 | acceptor.add_positions(custom_regions[region][chain], chain)
79 | else:
80 | acceptor.add_regions([region])
81 | return acceptor
82 |
83 | def scan(
84 | self,
85 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]],
86 | quiet: bool = False,
87 | ) -> List[SequenceLiability]:
88 | identified = []
89 | for chain, numbering in numbering_dict.items():
90 | acceptor = self._get_acceptor(chain)
91 |
92 | sequence = "".join([res[1] for res in numbering if res[1] != "-"])
93 | numbers = [res[0] for res in numbering if res[1] != "-"]
94 |
95 | for match in self.regex_pattern.finditer(sequence):
96 | start, end = match.start(), match.end()
97 | identified_positions = numbers[start:end]
98 |
99 | # Check if any of the residues identified should be ignored; skip if so
100 | if self.ignored_positions:
101 | if set(identified_positions).intersection(self.ignored_positions):
102 | continue
103 |
104 | # Check if the first residue of the set identified belongs to a region of interest
105 | if acceptor.accept(identified_positions[0], chain):
106 | identified.append(
107 | SequenceLiability(
108 | liability_type=self.name,
109 | motif=match.group(),
110 | positions=[
111 | Position(chain=chain, number=pos[0], ins_code=pos[1])
112 | for pos in identified_positions
113 | ],
114 | )
115 | )
116 |
117 | if not quiet:
118 | color = "red" if identified else "green"
119 | logger.opt(colors=True).info(
120 |                 f"<{color}>{self.name}:</{color}> identified <{color}>{len(identified)}</{color}> liabilities"
121 | )
122 |
123 | return identified
124 |
125 |
126 | class NTerminalGlutamateScanner(BaseScanner):
127 | # This does not look for a consecutive pattern like the other liabilities
128 | # Checks for E residues at the start of each chain instead
129 | def scan(
130 | self,
131 | numbering_dict: dict[str, list[tuple[tuple[int, str], str]]],
132 | quiet: bool = False,
133 | ) -> List[SequenceLiability]:
134 | if "H" not in numbering_dict or "L" not in numbering_dict:
135 | if not quiet:
136 | logger.opt(colors=True).warning(
137 | f"{self.name}: both H and L chain sequences are required for this check; skipping"
138 | )
139 | return []
140 |
141 | heavy_dict: dict[tuple[int, str], str] = dict(numbering_dict["H"])
142 | light_dict: dict[tuple[int, str], str] = dict(numbering_dict["L"])
143 |
144 | identified = []
145 | if (
146 | heavy_dict.get((1, " "), None) == "E"
147 | and light_dict.get((1, " "), None) == "E"
148 | ):
149 | identified = [
150 | SequenceLiability(
151 | liability_type=self.name,
152 | motif="EE",
153 | positions=[
154 | Position(chain="H", number=1, ins_code=" "),
155 | Position(chain="L", number=1, ins_code=" "),
156 | ],
157 | )
158 | ]
159 |
160 | if not quiet:
161 | color = "red" if identified else "green"
162 | logger.opt(colors=True).info(
163 |                 f"<{color}>{self.name}:</{color}> identified <{color}>{len(identified)}</{color}> liabilities"
164 | )
165 |
166 | return identified
167 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_liabilities/scanners.py:
--------------------------------------------------------------------------------
1 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import (
2 | NTerminalGlutamateScanner, RegexScanner)
3 |
4 | unpaired_cysteine_scanner = RegexScanner(
5 | name="Unpaired cysteine",
6 | description="Checks for C residues in locations other than positions 23 and 104",
7 | regions=["fv"],
8 | regex_search_string="C",
9 | ignored_positions=[(23, " "), (104, " ")],
10 | )
11 |
12 | n_linked_glycosylation_scanner = RegexScanner(
13 | name="N-linked glycosylation",
14 | description="Checks for an N residue followed by any residue apart from P, followed by S or T",
15 | regions=["fv"],
16 | regex_search_string="N[^P][ST]",
17 | )
18 |
19 | methionine_oxidation_scanner = RegexScanner(
20 | name="Methionine oxidation",
21 | description="Checks for M residues in CDRs or Vernier zones",
22 | regions=["cdrs", "verniers"],
23 | regex_search_string="M",
24 | )
25 |
26 | tryptophan_oxidation_scanner = RegexScanner(
27 | name="Tryptophan oxidation",
28 | description="Checks for W residues in CDRs or Vernier zones",
29 | regions=["cdrs", "verniers"],
30 | regex_search_string="W",
31 | )
32 |
33 | asparagine_deamidation_scanner = RegexScanner(
34 | name="Asparagine deamidation",
35 | description="Checks for residue pairs NG, NS, or NT in CDRs or Vernier zones",
36 | regions=["cdrs", "verniers"],
37 | regex_search_string="N[GST]",
38 | )
39 |
40 | aspartic_acid_isomeration_scanner = RegexScanner(
41 |     name="Aspartic acid isomerisation",
42 | description="Checks for residue pairs DG, DS, DT, DD, or DH in CDRs or Vernier zones",
43 | regions=["cdrs", "verniers"],
44 | regex_search_string="D[GSTDH]",
45 | )
46 |
47 | lysine_glycation_scanner = RegexScanner(
48 |     name="Lysine glycation",
49 | description="Checks for residue pairs KE, KD, EK, or ED in CDRs or Vernier zones",
50 | regions=["cdrs", "verniers"],
51 | regex_search_string="KE|KD|EK|ED",
52 | )
53 |
54 | integrin_binding_scanner = RegexScanner(
55 | name="Integrin binding",
56 | description="Checks for residue triplets RGD, RYD, or LDV within the Fv",
57 | regions=["fv"],
58 | regex_search_string="RGD|RYD|LDV",
59 | )
60 |
61 | cd11c_cd18_binding_scanner = RegexScanner(
62 | name="CD11c/CD18 binding",
63 |     description="Checks for the residue triplet GPR within the Fv",
64 | regions=["fv"],
65 | regex_search_string="GPR",
66 | )
67 |
68 | fragmentation_scanner = RegexScanner(
69 | name="Fragmentation",
70 | description="Checks for residue pair DP in the CDRs or Vernier zones",
71 | regions=["cdrs", "verniers"],
72 | regex_search_string="DP",
73 | )
74 |
75 | n_terminal_glutamate_scanner = NTerminalGlutamateScanner(
76 | name="N-terminal glutamate",
77 | description="Checks for glutamate residues at the N-termini of both heavy and light chains",
78 | )
79 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_properties/calculations.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | import numpy as np
4 | from Bio.SeqUtils.ProtParam import ProteinAnalysis
5 |
6 |
7 | class PropertyCalculator:
8 | def __init__(self, sequence: str) -> None:
9 | self.sequence = sequence
10 | self.protein_analysis = ProteinAnalysis(self.sequence)
11 |
12 | def calculate_aromaticity(self) -> float:
13 | return self.protein_analysis.aromaticity() # type: ignore
14 |
15 | def calculate_charge_at_ph(self, ph: float) -> float:
16 | return self.protein_analysis.charge_at_pH(ph) # type: ignore
17 |
18 | def calculate_flexibility(self) -> dict:
19 | flexibility_scores = self.protein_analysis.flexibility()
20 | return {
21 | "residue_scores": flexibility_scores,
22 | "mean": np.mean(flexibility_scores),
23 | "stdev": np.std(flexibility_scores),
24 | "min": min(flexibility_scores),
25 | "max": max(flexibility_scores),
26 | }
27 |
28 | def calculate_gravy(self) -> float:
29 | return self.protein_analysis.gravy() # type: ignore
30 |
31 | def calculate_instability_index(self) -> float:
32 | return self.protein_analysis.instability_index() # type: ignore
33 |
34 | def calculate_isoelectric_point(self) -> float:
35 | return self.protein_analysis.isoelectric_point() # type: ignore
36 |
37 | def calculate_properties(self) -> dict:
38 | """
39 | Calculates several properties from the sequence of an antibody.
40 | Returns:
41 | a dictionary of calculated properties.
42 | """
43 | return {
44 | "sequence": self.sequence,
45 | "aromaticity": self.calculate_aromaticity(),
46 | "charge_pH_6": self.calculate_charge_at_ph(6),
47 | "charge_pH_7.4": self.calculate_charge_at_ph(7.4),
48 | "flexibility": self.calculate_flexibility(),
49 | "gravy": self.calculate_gravy(),
50 | "instability_index": self.calculate_instability_index(),
51 | "isoelectric_point": self.calculate_isoelectric_point(),
52 | }
53 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_properties/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from pathlib import Path
3 | from typing import Optional
4 |
5 | from ab_characterisation.developability_tools.sequence_properties.calculations import \
6 | PropertyCalculator
7 | from ab_characterisation.developability_tools.sequence_properties.outputs import \
8 | write_properties_to_json
9 | from ab_characterisation.developability_tools.utils.input_handling import parse_fasta
10 | from loguru import logger
11 |
12 | logger.remove()
13 | logger.add(sys.stderr, format="{message}")
14 |
15 |
16 | def calculate_properties(
17 | heavy_sequence: Optional[str], light_sequence: Optional[str]
18 | ) -> dict[str, dict]:
19 | results = {}
20 | for chain, sequence in {"heavy": heavy_sequence, "light": light_sequence}.items():
21 | if sequence:
22 | results[chain] = PropertyCalculator(sequence).calculate_properties()
23 |
24 | return results
25 |
26 |
27 | def property_calculator(
28 | heavy_sequence: Optional[str],
29 | light_sequence: Optional[str],
30 | outfile: Optional[str] = None,
31 | ) -> dict[str, dict]:
32 | """
33 | Main function to calculate sequence-based properties for a single antibody.
34 |     Writes the results to a file in .json format.
35 |
36 | Args:
37 | heavy_sequence: the amino acid sequence of the antibody heavy chain
38 | light_sequence: the amino acid sequence of the antibody light chain
39 |         outfile: the output path where results should be written. If a path is not given, results are only
40 |             returned, not written to a file.
41 | """
42 | property_dict = calculate_properties(heavy_sequence, light_sequence)
43 |
44 | if outfile:
45 | write_properties_to_json(property_dict, outfile)
46 |
47 | return property_dict
48 |
49 |
50 | def property_calculator_fasta(
51 | fasta_file: str,
52 | outdir: Optional[str],
53 | quiet: bool = False,
54 | ) -> dict[str, dict[str, dict]]:
55 | """
56 |     Function to calculate sequence-based properties for a set of antibody sequences in a fasta file.
57 |     Writes the results to a series of files (one per antibody) in .json format.
58 |
59 | Args:
60 | fasta_file: the amino acid sequences of the antibodies to be scanned in fasta format. Each antibody should be a
61 | separate fasta entry, with the heavy and light chains being separated by a forward slash. E.g.:
62 |
63 | >antibody1
64 | HEAVYSEQUENCE/LIGHTSEQUENCE
65 | >antibody2
66 | HEAVYSEQUENCE/LIGHTSEQUENCE
67 | >nanobody1
68 | HEAVYSEQUENCE/-
69 | ...
70 |
71 | outdir: Path to the directory where results should be written. Individual files will be named according to the
72 | IDs in the fasta file.
73 | """
74 | if outdir:
75 | dirpath = Path(outdir)
76 | dirpath.mkdir(parents=True, exist_ok=True)
77 | else:
78 | dirpath = Path(".")
79 |
80 | sequences = parse_fasta(fasta_file)
81 | results = {}
82 | for antibody_id, seqs in sequences.items():
83 | if not quiet:
84 | logger.info(f"Calculating properties for {antibody_id}")
85 |
86 | property_dict = calculate_properties(seqs["H"], seqs["L"])
87 | filepath = dirpath / f"{antibody_id}_properties.json"
88 | write_properties_to_json(property_dict, str(filepath))
89 | results[antibody_id] = property_dict
90 |
91 | return results
92 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/sequence_properties/outputs.py:
--------------------------------------------------------------------------------
1 | import json
2 |
3 | from ab_characterisation.developability_tools.utils.outputs import write_file
4 | from loguru import logger
5 |
6 |
7 | def write_properties_to_json(property_dict: dict, filepath: str) -> None:
8 | """
9 | Writes the calculated property dict to a file in .json format.
10 |
11 | Args:
12 | property_dict: the dictionary of calculated properties
13 | filepath: the path to the output file. Can be an S3 path.
14 | """
15 | outstr = json.dumps(property_dict)
16 | write_file(outstr, filepath)
17 |
18 | return
19 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/__init__.py
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/definitions.py:
--------------------------------------------------------------------------------
1 | colour_dict = {"RED": "red", "AMBER": "fg #ff6100", "GREEN": "fg #14b853"}
2 |
3 | imgt_cdr_definitions = {
4 | 1: range(27, 39),
5 | 2: range(56, 66),
6 | 3: range(105, 118),
7 | }
8 |
9 | # Two residues on either side of the IMGT CDRs
10 | anchor_residues = [25, 26, 39, 40, 54, 55, 66, 67, 103, 104, 118, 119]
11 |
12 |
13 | # Salt bridge donor/acceptor atom types
14 | donors = {"LYS": ["NZ"], "ARG": ["NH1", "NH2"]}
15 | acceptors = {"ASP": ["OD1", "OD2"], "GLU": ["OE1", "OE2"]}
16 |
17 |
18 | # Kyte and Doolittle hydrophobicity scale, normalised to values between 1 and 2
19 | normalised_hydrophobicities = {
20 | "ILE": 2.0,
21 | "VAL": 1.9666666666666666,
22 | "LEU": 1.9222222222222223,
23 | "PHE": 1.8111111111111111,
24 | "CYS": 1.7777777777777777,
25 | "MET": 1.7111111111111112,
26 | "ALA": 1.7,
27 | "GLY": 1.4555555555555555,
28 | "THR": 1.4222222222222223,
29 | "SER": 1.4111111111111112,
30 | "TRP": 1.4,
31 | "TYR": 1.3555555555555556,
32 | "PRO": 1.3222222222222222,
33 | "HIS": 1.1444444444444444,
34 | "GLU": 1.1111111111111112,
35 | "GLN": 1.1111111111111112,
36 | "ASP": 1.1111111111111112,
37 | "ASN": 1.1111111111111112,
38 | "LYS": 1.0666666666666667,
39 | "ARG": 1.0,
40 | }
41 |
42 | # Charges at pH 7.4
43 | residue_charges = {
44 | "ALA": 0.0,
45 | "ARG": 1.0,
46 | "ASN": 0.0,
47 | "ASP": -1.0,
48 | "CYS": 0.0,
49 | "GLN": 0.0,
50 | "GLU": -1.0,
51 | "GLY": 0.0,
52 | "HIS": 0.1,
53 | "ILE": 0.0,
54 | "LEU": 0.0,
55 | "LYS": 1.0,
56 | "MET": 0.0,
57 | "PHE": 0.0,
58 | "PRO": 0.0,
59 | "SER": 0.0,
60 | "THR": 0.0,
61 | "TRP": 0.0,
62 | "TYR": 0.0,
63 | "VAL": 0.0,
64 | }
65 |
--------------------------------------------------------------------------------
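The `normalised_hydrophobicities` values above are consistent with a linear rescaling of the standard Kyte-Doolittle scores (which run from -4.5 for Arg to +4.5 for Ile) onto the interval [1, 2]. A small sketch of that rescaling, under the assumption that the standard Kyte-Doolittle values were the starting point:

```python
# Sketch: reproducing the normalised hydrophobicity values from raw Kyte-Doolittle scores
# (the raw scores below are assumed, not taken from the repository).
kyte_doolittle = {"ILE": 4.5, "VAL": 4.2, "LEU": 3.8, "TRP": -0.9, "ARG": -4.5}  # subset

def normalise(kd_value: float, kd_min: float = -4.5, kd_max: float = 4.5) -> float:
    """Linearly map a Kyte-Doolittle score onto [1, 2]."""
    return 1.0 + (kd_value - kd_min) / (kd_max - kd_min)

assert abs(normalise(kyte_doolittle["VAL"]) - 1.9666666666666666) < 1e-9  # matches definitions.py
assert abs(normalise(kyte_doolittle["TRP"]) - 1.4) < 1e-9                 # matches definitions.py
```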
/src/ab_characterisation/developability_tools/tap/main.py:
--------------------------------------------------------------------------------
1 | import sys
2 | from typing import Optional
3 |
4 | from loguru import logger
5 |
6 | from ab_characterisation.developability_tools.tap.definitions import colour_dict
7 | from ab_characterisation.developability_tools.tap.metrics import (
8 | HydrophobicPatchScoreCalculator, NegativePatchScoreCalculator,
9 | PositivePatchScoreCalculator, SFvCSPCalculator, TotalCDRLengthCalculator)
10 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import MetricResult
11 | from ab_characterisation.developability_tools.tap.outputs import write_output_file
12 | from ab_characterisation.developability_tools.tap.structure_annotation import \
13 | StructureAnnotator
14 |
15 | logger.remove()
16 | logger.add(sys.stderr, format="{message}")
17 |
18 |
19 | def run_tap(
20 | modelfile: str,
21 | outfile: Optional[str],
22 | quiet: bool = False,
23 | ) -> list[MetricResult]:
24 | """
25 | Main function to calculate TAP metrics for a pre-generated ABodyBuilder2 model.
26 | Writes the results to a file in .csv format.
27 |
28 | Args:
29 | modelfile: the path to the input model .pdb file. Can be an S3 path - in this case the file will be temporarily
30 | downloaded before TAP is run.
31 | This should be a model created by ABodyBuilder2, and should be already IMGT numbered.
32 | outfile: the output path where results should be written.
33 | quiet: suppresses all log messages if set to True.
34 | """
35 |
36 | structure = StructureAnnotator().load_and_annotate_structure(modelfile)
37 |
38 | # Calculate the 5 metrics
39 | results = []
40 | for calculator in [
41 | HydrophobicPatchScoreCalculator,
42 | NegativePatchScoreCalculator,
43 | PositivePatchScoreCalculator,
44 | SFvCSPCalculator,
45 | TotalCDRLengthCalculator,
46 | ]:
47 | results.append(calculator(quiet=quiet).calculate(structure)) # type: ignore
48 |
49 | if outfile:
50 | write_output_file(results, outfile)
51 |
52 | return results
53 |
54 |
55 | def list_metrics() -> list[dict]:
56 | """
57 | Returns (and prints) a list of the metrics and their green/amber region definitions.
58 | """
59 | metrics = []
60 | for calculator in [
61 | HydrophobicPatchScoreCalculator,
62 | NegativePatchScoreCalculator,
63 | PositivePatchScoreCalculator,
64 | SFvCSPCalculator,
65 | TotalCDRLengthCalculator,
66 | ]:
67 | metric = calculator() # type: ignore
68 | green_str = "; ".join(
69 | [
70 | str(region[0]) + " to " + str(region[1])
71 | for region in metric.green_flag_regions
72 | ]
73 | )
74 | amber_str = "; ".join(
75 | [
76 | str(region[0]) + " to " + str(region[1])
77 | for region in metric.amber_flag_regions
78 | ]
79 | )
80 | logger.opt(colors=True).info(f"TAP METRIC {metric.name}:")
81 | logger.opt(colors=True).info(
82 |             f"<{colour_dict['GREEN']}>GREEN</{colour_dict['GREEN']}> region: {green_str}"
83 | )
84 | logger.opt(colors=True).info(
85 |             f"<{colour_dict['AMBER']}>AMBER</{colour_dict['AMBER']}> region: {amber_str}\n"
86 | )
87 |
88 | metrics.append(
89 | {
90 | "name": metric.name,
91 | "green_flag_regions": metric.green_flag_regions,
92 | "amber_flag_regions": metric.amber_flag_regions,
93 | }
94 | )
95 |
96 | return metrics
97 |
--------------------------------------------------------------------------------
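A minimal usage sketch for `run_tap` as defined above (not part of the repository; the file paths are placeholders, and the input must be an IMGT-numbered ABodyBuilder2 model):

```python
# Illustrative sketch only: computing the five TAP metrics for an existing model.
from ab_characterisation.developability_tools.tap.main import run_tap

results = run_tap(
    modelfile="antibody_model.pdb",  # placeholder: IMGT-numbered ABodyBuilder2 model
    outfile="antibody_tap.csv",      # optional; results are also returned as MetricResult objects
    quiet=True,
)
for result in results:
    print(f"{result.metric_name}: {result.calculated_value:.2f} ({result.flag})")
```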
/src/ab_characterisation/developability_tools/tap/metrics/__init__.py:
--------------------------------------------------------------------------------
1 | from ab_characterisation.developability_tools.tap.metrics.hydrophobic_patches import \
2 | HydrophobicPatchScoreCalculator
3 | from ab_characterisation.developability_tools.tap.metrics.negative_patches import \
4 | NegativePatchScoreCalculator
5 | from ab_characterisation.developability_tools.tap.metrics.positive_patches import \
6 | PositivePatchScoreCalculator
7 | from ab_characterisation.developability_tools.tap.metrics.sfvcsp import SFvCSPCalculator
8 | from ab_characterisation.developability_tools.tap.metrics.total_cdr_length import \
9 | TotalCDRLengthCalculator
10 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/metrics/base_calculator.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from dataclasses import dataclass
3 |
4 | from Bio import PDB
5 | from loguru import logger
6 |
7 | from ab_characterisation.developability_tools.tap.definitions import colour_dict
8 |
9 |
10 | @dataclass
11 | class MetricResult:
12 | metric_name: str
13 | calculated_value: float
14 | flag: str
15 |
16 |
17 | class BaseMetricCalculator(ABC):
18 | @abstractmethod
19 | def __init__(self, quiet: bool = False) -> None:
20 | self.quiet: bool = quiet
21 | self.name: str = ""
22 | self.green_flag_regions: list[tuple[float, float]] = []
23 | self.amber_flag_regions: list[tuple[float, float]] = []
24 |
25 | def get_flag(self, value: float) -> str:
26 | """
27 | Assigns either a green, amber, or red flag to a value depending on the defined regions
28 | (which were established by calculating the same metrics on structural models of known therapeutics).
29 |
30 | Args:
31 | value: the value calculated for the query antibody for this metric
32 |
33 | Returns:
34 | a string representing the flag colour.
35 | """
36 | for minval, maxval in self.amber_flag_regions:
37 | if minval <= value <= maxval:
38 | return "AMBER"
39 |
40 | for minval, maxval in self.green_flag_regions:
41 | if minval <= value <= maxval:
42 | return "GREEN"
43 |
44 | # If the calculated value lies outside the defined green and amber regions, it is assigned a red flag
45 | return "RED"
46 |
47 | def log_result(self, result: MetricResult) -> None:
48 | """Logs a message to the terminal summarising the result of the metric calculation."""
49 | if not self.quiet:
50 | colour = colour_dict[result.flag]
51 | logger.opt(colors=True).info(
52 |                 f"METRIC {result.metric_name} = <{colour}>{result.calculated_value:.2f} ({result.flag})</{colour}>"
53 | )
54 |
55 | @abstractmethod
56 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
57 | pass
58 |
--------------------------------------------------------------------------------
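Note that `get_flag` above checks the amber regions before the green ones, so a value lying exactly on a shared boundary (for example 1.67 for the Negative Patch Score, where the green region ends and the amber region begins) resolves to AMBER. A standalone sketch of the same lookup, using the Negative Patch Score regions defined in `negative_patches.py`:

```python
# Standalone sketch of the flag lookup order used in BaseMetricCalculator.get_flag.
def get_flag(value, green_flag_regions, amber_flag_regions):
    for minval, maxval in amber_flag_regions:  # amber is checked first...
        if minval <= value <= maxval:
            return "AMBER"
    for minval, maxval in green_flag_regions:  # ...so shared boundaries resolve to AMBER
        if minval <= value <= maxval:
            return "GREEN"
    return "RED"  # outside all defined regions

green, amber = [(0, 1.67)], [(1.67, 3.50)]  # Negative Patch Score regions
assert get_flag(1.0, green, amber) == "GREEN"
assert get_flag(1.67, green, amber) == "AMBER"
assert get_flag(5.0, green, amber) == "RED"
```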
/src/ab_characterisation/developability_tools/tap/metrics/hydrophobic_patches.py:
--------------------------------------------------------------------------------
1 | from Bio import PDB
2 |
3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import (
4 | BaseMetricCalculator, MetricResult)
5 |
6 |
7 | class HydrophobicPatchScoreCalculator(BaseMetricCalculator):
8 | def __init__(self, quiet: bool = False) -> None:
9 | self.quiet = quiet
10 | self.name = "Hydrophobic Patch Score"
11 | self.green_flag_regions = [(137.61, 200.71)]
12 | self.amber_flag_regions = [(106.44, 137.61), (200.71, 225.85)]
13 |
14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
15 | """
16 | Calculates the 'patches of surface hydrophobicity' score (PSH) and assigns a flag colour.
17 | Considers residues that are in the CDR vicinity only.
18 | The input structure must have been annotated using the
19 | ab_characterisation.developability_tools.tap.structure_annotation module.
20 | """
21 | cdr_vicinity = [
22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity
23 | ]
24 |
25 | score = 0
26 | for res1 in cdr_vicinity:
27 | for res2 in cdr_vicinity:
28 | if res1 == res2:
29 | continue
30 |
31 | distance = res1.neighbours.get(res2.get_full_id(), None)
32 | if not distance:
33 | continue
34 |
35 | score += (res1.hydrophobicity * res2.hydrophobicity) / distance**2
36 |
37 | flag = self.get_flag(score)
38 |
39 | result = MetricResult(
40 | metric_name=self.name,
41 | calculated_value=score,
42 | flag=flag,
43 | )
44 |
45 | self.log_result(result)
46 | return result
47 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/metrics/negative_patches.py:
--------------------------------------------------------------------------------
1 | from Bio import PDB
2 |
3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import (
4 | BaseMetricCalculator, MetricResult)
5 |
6 |
7 | class NegativePatchScoreCalculator(BaseMetricCalculator):
8 | def __init__(self, quiet: bool = False) -> None:
9 | self.quiet = quiet
10 | self.name = "Negative Patch Score"
11 | self.green_flag_regions = [(0, 1.67)]
12 | self.amber_flag_regions = [(1.67, 3.50)]
13 |
14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
15 | """
16 | Calculates the 'patches of negative charge' score (PNC) and assigns a flag colour.
17 | Considers residues that are in the CDR vicinity only.
18 | The input structure must have been annotated using the
19 | ab_characterisation.developability_tools.tap.structure_annotation module.
20 | """
21 | cdr_vicinity = [
22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity
23 | ]
24 |
25 | score = 0
26 | for res1 in cdr_vicinity:
27 | for res2 in cdr_vicinity:
28 | if res1 == res2:
29 | continue
30 |
31 | if res1.charge >= 0 or res2.charge >= 0:
32 | continue
33 |
34 | distance = res1.neighbours.get(res2.get_full_id(), None)
35 | if not distance:
36 | continue
37 |
38 | score += (abs(res1.charge) * abs(res2.charge)) / distance**2
39 |
40 | flag = self.get_flag(score)
41 |
42 | result = MetricResult(
43 | metric_name=self.name,
44 | calculated_value=score,
45 | flag=flag,
46 | )
47 |
48 | self.log_result(result)
49 | return result
50 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/metrics/positive_patches.py:
--------------------------------------------------------------------------------
1 | from Bio import PDB
2 |
3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import (
4 | BaseMetricCalculator, MetricResult)
5 |
6 |
7 | class PositivePatchScoreCalculator(BaseMetricCalculator):
8 | def __init__(self, quiet: bool = False) -> None:
9 | self.quiet = quiet
10 | self.name = "Positive Patch Score"
11 | self.green_flag_regions = [(0, 1.19)]
12 | self.amber_flag_regions = [(1.19, 3.58)]
13 |
14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
15 | """
16 | Calculates the 'patches of positive charge' score (PPC) and assigns a flag colour.
17 | Considers residues that are in the CDR vicinity only.
18 | The input structure must have been annotated using the
19 | ab_characterisation.developability_tools.tap.structure_annotation module.
20 | """
21 | cdr_vicinity = [
22 | res for res in annotated_structure[0].get_residues() if res.in_cdr_vicinity
23 | ]
24 |
25 | score = 0
26 | for res1 in cdr_vicinity:
27 | for res2 in cdr_vicinity:
28 | if res1 == res2:
29 | continue
30 |
31 | if res1.charge <= 0 or res2.charge <= 0:
32 | continue
33 |
34 | distance = res1.neighbours.get(res2.get_full_id(), None)
35 | if not distance:
36 | continue
37 |
38 | score += (abs(res1.charge) * abs(res2.charge)) / distance**2
39 |
40 | flag = self.get_flag(score)
41 |
42 | result = MetricResult(
43 | metric_name=self.name,
44 | calculated_value=score,
45 | flag=flag,
46 | )
47 |
48 | self.log_result(result)
49 | return result
50 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/metrics/sfvcsp.py:
--------------------------------------------------------------------------------
1 | from Bio import PDB
2 |
3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import (
4 | BaseMetricCalculator, MetricResult)
5 |
6 |
7 | class SFvCSPCalculator(BaseMetricCalculator):
8 | def __init__(self, quiet: bool = False) -> None:
9 | self.quiet = quiet
10 | self.name = "SFvCSP"
11 | self.green_flag_regions = [(-4.20, 100000)]
12 | self.amber_flag_regions = [(-20.50, -4.20)]
13 |
14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
15 | """
16 | Calculates the 'structural Fv charge symmetry parameter' (SFvCSP) and assigns a flag colour.
17 | Considers surface residues only.
18 | The input structure must have been annotated using the
19 | ab_characterisation.developability_tools.tap.structure_annotation module.
20 | """
21 | h_charge = sum(
22 | res.charge
23 | for res in annotated_structure[0]["H"].get_residues()
24 | if res.is_surface
25 | )
26 | l_charge = sum(
27 | res.charge
28 | for res in annotated_structure[0]["L"].get_residues()
29 | if res.is_surface
30 | )
31 | sfvcsp = h_charge * l_charge
32 | flag = self.get_flag(sfvcsp)
33 |
34 | result = MetricResult(
35 | metric_name=self.name,
36 | calculated_value=sfvcsp,
37 | flag=flag,
38 | )
39 |
40 | self.log_result(result)
41 | return result
42 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/metrics/total_cdr_length.py:
--------------------------------------------------------------------------------
1 | from Bio import PDB
2 |
3 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import (
4 | BaseMetricCalculator, MetricResult)
5 |
6 |
7 | class TotalCDRLengthCalculator(BaseMetricCalculator):
8 | def __init__(self, quiet: bool = False) -> None:
9 | self.quiet = quiet
10 | self.name = "Total IMGT CDR Length"
11 | self.green_flag_regions = [(43, 55)]
12 | self.amber_flag_regions = [(37, 43), (55, 63)]
13 |
14 | def calculate(self, annotated_structure: PDB.Structure.Structure) -> MetricResult:
15 | """
16 | Calculates the total number of CDR residues and assigns a flag colour.
17 | Uses the IMGT CDR definition.
18 | The input structure must have been annotated using the
19 | ab_characterisation.developability_tools.tap.structure_annotation module.
20 | """
21 | cdr_residues = [
22 | res for res in annotated_structure[0].get_residues() if res.is_cdr
23 | ]
24 | total_cdr_length = len(cdr_residues)
25 | flag = self.get_flag(total_cdr_length)
26 |
27 | result = MetricResult(
28 | metric_name=self.name,
29 | calculated_value=total_cdr_length,
30 | flag=flag,
31 | )
32 |
33 | self.log_result(result)
34 | return result
35 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/outputs.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 |
4 | from ab_characterisation.developability_tools.tap.metrics.base_calculator import MetricResult
5 | from ab_characterisation.developability_tools.utils.outputs import write_file
6 |
7 |
8 | def write_output_file(results: list[MetricResult], outfile: str) -> None:
9 | """
10 | Writes the TAP results to an output file in csv format.
11 |
12 | Args:
13 | results: the list of metric results
14 | outfile: the path to where the results should be written.
15 | """
16 | outstr = "Metric,Value,Flag\n"
17 | for res in results:
18 | outstr += f"{res.metric_name},{res.calculated_value:.2f},{res.flag}\n"
19 |
20 | write_file(outstr, outfile)
21 |
22 | return
23 |
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/psa_executables/psa:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/psa_executables/psa
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/psa_executables/psa_mac:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/developability_tools/tap/psa_executables/psa_mac
--------------------------------------------------------------------------------
/src/ab_characterisation/developability_tools/tap/structure_annotation.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 | from dataclasses import dataclass, field
4 | from pathlib import Path
5 | from typing import Optional
6 |
7 | from Bio import PDB
8 | from Bio.PDB.NeighborSearch import NeighborSearch
9 |
10 | from ab_characterisation.developability_tools.tap.definitions import (
11 | acceptors, anchor_residues, donors, imgt_cdr_definitions,
12 | normalised_hydrophobicities, residue_charges)
13 |
14 |
15 | class PSAError(Exception):
16 | """Raised when something has gone awry when running psa to calculate surface areas."""
17 |
18 |
19 | @dataclass
20 | class AnnotatedResidue(PDB.Residue.Residue): # type: ignore
21 | salt_bridge_partner: Optional[tuple] = None
22 | neighbours: dict = field(default_factory=dict)
23 | cdr_number: int = field(init=False)
24 | relative_surface_area: float = field(init=False)
25 | hydrophobicity: float = field(init=False)
26 | charge: float = field(init=False)
27 | in_cdr_vicinity: bool = field(init=False)
28 |
29 | def __eq__(self, other): # type: ignore
30 | """Direct copy from Biopython Entity"""
31 | if isinstance(other, type(self)):
32 | if self.parent is None:
33 | return self.id == other.id
34 | return self.full_id[1:] == other.full_id[1:]
35 | return NotImplemented
36 |
37 | def __hash__(self) -> int:
38 | """Direct copy from Biopython Entity"""
39 | return hash(self.full_id)
40 |
41 | @property
42 | def is_cdr(self) -> bool:
43 | """Whether the residue is part of a CDR (IMGT definition) or not."""
44 | return self.cdr_number > 0
45 |
46 | @property
47 | def is_anchor(self) -> bool:
48 | """
49 |         Whether the residue is an anchor residue to a CDR (IMGT definition) or not.
50 | Anchor residues are defined as the two residues on each side of the CDR.
51 | """
52 | return self.id[1] in anchor_residues
53 |
54 | @property
55 | def is_surface(self) -> bool:
56 | """
57 | Uses the relative surface area as calculated by psa to determine whether the residue is on the surface or not.
58 | Surface residues have a relative sidechain surface area of 7.5 or above.
59 | """
60 | return self.relative_surface_area >= 7.5
61 |
62 | @property
63 | def res_number(self) -> str:
64 | """Returns a formatted string containing the residue number and insertion code, if present."""
65 | return f"{self.id[1]}{self.id[2]}".strip()
66 |
67 | @property
68 | def is_donor(self) -> bool:
69 | """Whether the residue is a salt bridge donor residue type."""
70 | return self.resname in donors
71 |
72 | @property
73 | def is_acceptor(self) -> bool:
74 | """Whether the residue is a salt bridge acceptor residue type."""
75 | return self.resname in acceptors
76 |
77 |
78 | @dataclass
79 | class StructureAnnotator:
80 | """
81 |     Class containing methods for structural annotation of properties required by TAP:
82 | - relative surface area
83 | - minimum distances between neighbouring residues
84 | - CDR vicinity (surface residues within 4 A of CDRs/anchors)
85 | - salt bridges (donor/acceptor atoms within 3.2 A)
86 | - hydrophobicity
87 | - charge
88 | """
89 |
90 | neighbour_cutoff: float = 7.5
91 | salt_bridge_cutoff: float = 3.2
92 | vicinity_cutoff: float = 4.0
93 | psa_path: Path = field(init=False)
94 | cdr_lookup_dict: dict[tuple[str, int], int] = field(init=False)
95 |
96 | def __post_init__(self) -> None:
97 | lookup_dict = {}
98 | for chain in "HL":
99 | for cdr, residue_range in imgt_cdr_definitions.items():
100 | for res in residue_range:
101 | lookup_dict[(chain, res)] = cdr
102 | self.cdr_lookup_dict = lookup_dict
103 |
104 | psa_version = "psa_mac" if sys.platform == "darwin" else "psa"
105 | self.psa_path = (
106 | Path(__file__).resolve().parent / f"psa_executables/{psa_version}"
107 | )
108 |
109 | @staticmethod
110 | def _convert_residues(structure: PDB.Structure.Structure) -> None:
111 | """Converts normal Biopython residues to our annotated version with extra properties/methods."""
112 | for res in structure[0].get_residues():
113 | res.__class__ = AnnotatedResidue
114 | res.neighbours = {}
115 | return
116 |
117 | def _run_psa(self, structure_path: str) -> list[str]:
118 | """Runs the psa executable on the .pdb file to get surface accessibility information."""
119 | if self.psa_path.exists() is False:
120 | raise PSAError("psa executable was not found.")
121 |
122 | result, error = subprocess.Popen(
123 | [str(self.psa_path), "-t", structure_path],
124 | stdout=subprocess.PIPE,
125 | stderr=subprocess.PIPE,
126 | ).communicate()
127 | if not result:
128 | raise PSAError(error.decode())
129 |
130 | psa_output = result.decode().split("\n")
131 | return psa_output
132 |
133 | def _annotate_sasa(
134 | self, structure: PDB.Structure.Structure, structure_path: str
135 | ) -> None:
136 | """Runs psa and extracts surface accessibility information from its output."""
137 | # Run psa and get the relevant lines from the output
138 | psa_output = self._run_psa(structure_path)
139 | residue_lines = [line for line in psa_output if line.startswith("ACCESS")]
140 |
141 | # Check that the number of residues in the psa output is the same as the number of residues in our structure
142 | all_residues = list(structure[0].get_residues())
143 | if len(all_residues) != len(residue_lines):
144 | raise PSAError("PSA output contained the wrong number of residues.")
145 |
146 | # Iterate through residues and annotate the structure
147 | for res, psa_line in zip(all_residues, residue_lines):
148 | # Check we are on the correct residue with the correct type
149 | psa_residue_number = psa_line[6:12].strip()
150 | if res.res_number != psa_residue_number:
151 | raise PSAError(
152 | f"Residue number mismatch: {res.res_number} != {psa_residue_number}"
153 | )
154 | psa_residue_type = psa_line[14:17]
155 | if res.resname != psa_residue_type:
156 | raise PSAError(
157 | f"Expected type {res.resname} for residue {res.parent}{res.res_number}; got {psa_residue_type}"
158 | )
159 |
160 | res.relative_surface_area = float(psa_line[61:67])
161 | return
162 |
163 | @staticmethod
164 | def _get_minimum_distance(res1: AnnotatedResidue, res2: AnnotatedResidue) -> float:
165 | """Calculates the minimum distance between the heavy atoms of two residues."""
166 | min_dist = 100.0
167 | for atom1 in res1.get_atoms():
168 | for atom2 in res2.get_atoms():
169 | dist = atom1 - atom2
170 | if dist < min_dist:
171 | min_dist = dist
172 | return min_dist
173 |
174 | def _get_neighbours(self, structure: PDB.Structure.Structure) -> None:
175 | """For each residue in the structure, gets a list of neighbouring residues and their minimum distance"""
176 | # Quickly get list of residue pairs that are less than 7.5A apart
177 | all_heavy_atoms = [
178 | atom for atom in structure[0].get_atoms() if atom.element != "H"
179 | ]
180 | residue_pairs = NeighborSearch(atom_list=all_heavy_atoms).search_all(
181 | self.neighbour_cutoff, level="R"
182 | )
183 |
184 | for res1, res2 in residue_pairs:
185 | if res1 == res2:
186 | continue
187 | min_dist = self._get_minimum_distance(res1, res2)
188 | res1.neighbours[res2.get_full_id()] = min_dist
189 | res2.neighbours[res1.get_full_id()] = min_dist
190 | return
191 |
192 | def _cdr_lookup(self, chain_id: str, residue_number: int) -> int:
193 | """
194 | Returns the number of the CDR a residue is part of from its residue number.
195 | Returns zero if the residue is not part of a CDR.
196 | """
197 | return self.cdr_lookup_dict.get((chain_id, residue_number), 0)
198 |
199 | def _annotate_cdrs(self, structure: PDB.Structure.Structure) -> None:
200 | """Annotates residues with their CDR number (0 if not in a CDR)"""
201 | for res in structure[0].get_residues():
202 | chain_id = res.parent.id
203 | res_number = res.id[1]
204 | res.cdr_number = self._cdr_lookup(chain_id, res_number)
205 |
206 | def _annotate_cdr_vicinity(self, structure: PDB.Structure.Structure) -> None:
207 | """Finds and annotates which residues are on the surface and less than 4A away from the CDRs/anchors."""
208 | surface_cdrs_and_anchors = []
209 | for res in structure[0].get_residues():
210 | if res.is_surface and (res.is_cdr or res.is_anchor):
211 | res.in_cdr_vicinity = True
212 | surface_cdrs_and_anchors.append(res)
213 | else:
214 | res.in_cdr_vicinity = False
215 |
216 | for res in surface_cdrs_and_anchors:
217 | res.in_cdr_vicinity = True
218 | for neighbour_id, distance in res.neighbours.items():
219 | res2 = structure[0][neighbour_id[2]][neighbour_id[3]]
220 | if distance < self.vicinity_cutoff and res2.is_surface:
221 | res2.in_cdr_vicinity = True
222 | return
223 |
224 | def _annotate_salt_bridges(self, structure: PDB.Structure.Structure) -> None:
225 | """Identifies which residues form salt bridges based on distance."""
226 | donor_atoms = [
227 | atom
228 | for atom in structure[0].get_atoms()
229 | if atom.id in donors.get(atom.parent.resname, [])
230 | ]
231 | acceptor_atoms = [
232 | atom
233 | for atom in structure[0].get_atoms()
234 | if atom.id in acceptors.get(atom.parent.resname, [])
235 | ]
236 | residue_pairs = NeighborSearch(
237 | atom_list=donor_atoms + acceptor_atoms
238 | ).search_all(self.salt_bridge_cutoff, level="R")
239 |
240 | for res1, res2 in residue_pairs:
241 | # Ignore if already part of a salt bridge
242 | if res1.salt_bridge_partner or res2.salt_bridge_partner:
243 | continue
244 | if res1.is_surface and res2.is_surface:
245 | if (res1.is_donor and res2.is_acceptor) or (
246 | res2.is_donor and res1.is_acceptor
247 | ):
248 | res1.salt_bridge_partner = res2.get_full_id()
249 | res2.salt_bridge_partner = res1.get_full_id()
250 | return
251 |
252 | def _annotate_hydrophobicity(self, structure: PDB.Structure.Structure) -> None:
253 | """
254 | Annotates residues with their normalised (between 1 and 2) hydrophobicity values.
255 | If the residue forms part of a salt bridge, it is assigned the hydrophobicity of glycine.
256 | """
257 | for res in structure[0].get_residues():
258 | if res.salt_bridge_partner:
259 | res.hydrophobicity = normalised_hydrophobicities["GLY"]
260 | else:
261 | res.hydrophobicity = normalised_hydrophobicities[res.resname]
262 | return
263 |
264 | def _annotate_charge(self, structure: PDB.Structure.Structure) -> None:
265 | """
266 | Annotates residues with their charges.
267 | If the residue forms part of a salt bridge, it is assigned a charge of zero.
268 | """
269 | for res in structure[0].get_residues():
270 | if res.salt_bridge_partner:
271 | res.charge = 0
272 | else:
273 | res.charge = residue_charges[res.resname]
274 | return
275 |
276 | def load_and_annotate_structure(
277 | self, structure_path: str
278 | ) -> PDB.Structure.Structure:
279 | """
280 | Loads an antibody structure from the provided file, and annotates the residues for later use in TAP metric
281 | calculations.
282 | Assumes the structure is already IMGT-numbered!!
283 |
284 | Args:
285 | structure_path: the path to the structure that is to be annotated.
286 |
287 | Returns:
288 | the annotated structure (Biopython Structure entity, with residues converted to an AnnotatedResidue type).
289 | """
290 | structure = PDB.PDBParser(QUIET=True).get_structure(
291 | "input_structure", structure_path
292 | )
293 | self._convert_residues(structure)
294 | self._annotate_sasa(structure, structure_path)
295 | self._get_neighbours(structure)
296 | self._annotate_cdrs(structure)
297 | self._annotate_cdr_vicinity(structure)
298 | self._annotate_salt_bridges(structure)
299 | self._annotate_hydrophobicity(structure)
300 | self._annotate_charge(structure)
301 | return structure
302 |
--------------------------------------------------------------------------------
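A minimal usage sketch for the annotator above (not part of the repository), assuming the package is installed and the bundled psa executable is available; the model filename is hypothetical and the model must already be IMGT-numbered:

from ab_characterisation.developability_tools.tap.structure_annotation import StructureAnnotator

# Load and annotate an IMGT-numbered antibody model; psa is run internally for surface areas.
annotator = StructureAnnotator()
structure = annotator.load_and_annotate_structure("imgt_numbered_model.pdb")

# Count the surface residues flagged as lying in the CDR vicinity.
vicinity = [res for res in structure[0].get_residues() if res.in_cdr_vicinity]
print(f"{len(vicinity)} surface residues in the CDR vicinity")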
/src/ab_characterisation/developability_tools/utils/input_handling.py:
--------------------------------------------------------------------------------
1 | from pathlib import Path
2 | from typing import Optional
3 |
4 | from anarci.anarci import number
5 | from Bio import SeqIO
6 |
7 |
8 | class InputError(Exception):
9 | pass
10 |
11 |
12 | def get_numbering(
13 | sequence: str, expected_type: str
14 | ) -> list[tuple[tuple[int, str], str]]:
15 | """
16 | Uses ANARCI to number an input sequence.
17 |
18 | Args:
19 | sequence: the amino acid sequence of the antibody chain
20 | expected_type: H or L (if the ANARCI annotation does not match this an error will be raised)
21 |
22 | Returns:
23 | the ANARCI residue numbering, e.g. [((1, ' '), 'E'), ((2, ' '), 'L'), ... ]
24 |
25 | """
26 | anarci_result: tuple[list[tuple[tuple[int, str], str]], str] = number(sequence)
27 | numbering, chain_type = anarci_result
28 | if numbering:
29 | if chain_type == expected_type:
30 | return numbering
31 | raise InputError(
32 | f"Incorrect chain type: expected {expected_type}, got {chain_type}"
33 | )
34 | raise InputError(f"ANARCI failed to number {expected_type} sequence")
35 |
36 |
37 | def parse_fasta(fasta_file: str) -> dict[str, dict[str, Optional[str]]]:
38 | if not Path(fasta_file).exists():
39 | raise InputError(f"Fasta file {fasta_file} does not exist.")
40 |
41 | sequences: dict[str, dict[str, Optional[str]]] = {}
42 | with open(fasta_file) as handle:
43 | for record in SeqIO.parse(handle, "fasta"):
44 | if '/' not in str(record.seq):
45 |                 raise AssertionError(
46 |                     f"Antibody fasta sequences need to be formatted as HEAVY/LIGHT, "
47 |                     f"an entry in {fasta_file} does not "
48 |                     f"contain /"
49 |                 )
50 | heavy, light = str(record.seq).split("/")
51 | sequences[record.id] = {
52 | "H": heavy if heavy not in ["-", ""] else None,
53 | "L": light if light not in ["-", ""] else None,
54 | }
55 |
56 | return sequences
57 |
--------------------------------------------------------------------------------
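A hedged sketch of how the two helpers above might be combined (the FASTA path is hypothetical; records must follow the HEAVY/LIGHT convention enforced by parse_fasta):

from ab_characterisation.developability_tools.utils.input_handling import get_numbering, parse_fasta

sequences = parse_fasta("candidates.fasta")
for name, chains in sequences.items():
    if chains["H"] is not None:
        # ANARCI numbering of the heavy chain, e.g. [((1, ' '), 'E'), ((2, ' '), 'V'), ...]
        numbering = get_numbering(chains["H"], expected_type="H")
        print(name, "first numbered heavy-chain residues:", numbering[:3])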
/src/ab_characterisation/developability_tools/utils/outputs.py:
--------------------------------------------------------------------------------
1 | import tempfile
2 | from pathlib import Path
3 |
4 |
5 | def write_file(contents: str, filepath: str) -> None:
6 | """Writes an output file to the given location with the given contents."""
7 | outpath = Path(filepath)
8 | outpath.parent.mkdir(parents=True, exist_ok=True)
9 | with outpath.open("w") as openf:
10 | openf.write(contents)
11 |
--------------------------------------------------------------------------------
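A one-line illustration of write_file (hypothetical path and contents); missing parent directories are created automatically:

from ab_characterisation.developability_tools.utils.outputs import write_file

write_file("metric,value\nexample,1.0\n", "results/example/report.csv")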
/src/ab_characterisation/filter_steps.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 |
3 | from loguru import logger
4 | import numpy as np
5 | import pandas as pd
6 | from numpy import typing as npt
7 | from scipy.stats import multivariate_normal
8 |
9 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig
10 | from ab_characterisation.utils.rosetta_utils import aggregate_rosetta_metrics
11 |
12 |
13 | def sequence_liability_filter(biol_data: BiologicsData, config: RunConfig) -> bool:
14 |     """
15 |     Filter criterion: discard candidates carrying a disqualifying sequence liability.
16 |     Args:
17 |         biol_data: candidate data with sequence_liabilities already populated.
18 |         config: run configuration; dq_sequence_liabilities lists the disqualifying liability types.
19 | 
20 |     Returns:
21 |         True if the candidate should be discarded, False otherwise.
22 |     """
23 | for liability in biol_data.sequence_liabilities:
24 | if liability.liability_type in config.dq_sequence_liabilities:
25 | return True
26 | return False
27 |
28 |
29 | def tap_filter(biol_data: BiologicsData, config: RunConfig) -> bool:
30 |     """
31 |     Filter criterion: discard candidates with at least one RED TAP flag.
32 |     Args:
33 |         biol_data: candidate data with tap_flags already populated.
34 |         config: run configuration (unused here, kept for a uniform filter signature).
35 | 
36 |     Returns:
37 |         True if the candidate should be discarded, False otherwise.
38 |     """
39 | for tap_metric in biol_data.tap_flags:
40 | if tap_metric.flag == "RED":
41 | return True
42 | return False
43 |
44 |
45 | def rosetta_antibody_filter(biol_data: BiologicsData, config: RunConfig) -> bool:
46 |     """
47 |     Filter criterion based on the antibody-only Rosetta run: discard candidates whose aggregated dG_separated is 5 or higher.
48 |     Args:
49 |         biol_data: candidate data with rosetta_output_ab_only already populated.
50 |         config: run configuration (unused here, kept for a uniform filter signature).
51 | 
52 |     Returns:
53 |         True if the candidate should be discarded, False otherwise.
54 |     """
55 | rosetta_antibody_data = aggregate_rosetta_metrics(biol_data.rosetta_output_ab_only)
56 | if rosetta_antibody_data.dG_separated.iloc[0] < 5:
57 | return False
58 | return True
59 |
60 |
61 | def find_top_n(
62 | biol_data_ls: list[BiologicsData], config: RunConfig
63 | ) -> list[BiologicsData]:
64 | pd_row_ls = []
65 | out_ls = []
66 | for biol_data in biol_data_ls:
67 | if biol_data.discarded_by is None:
68 | rosetta_complex_scores = aggregate_rosetta_metrics(
69 | biol_data.rosetta_output_complex, metrics=["dG_separated", "total_score"]
70 | ).iloc[0]
71 | pd_row_ls.append(rosetta_complex_scores)
72 | metric_df = pd.concat(pd_row_ls)
73 | top_indices = find_top_candidates(
74 | metric_df.total_score,
75 | metric_df.dG_separated,
76 | config.top_n,
77 | scale_factor=1,
78 | fit_without_outliers=True,
79 | )
80 | valid_candidate_idx = -1
81 | for biol_data in biol_data_ls:
82 | if biol_data.discarded_by is None:
83 | valid_candidate_idx += 1
84 | if valid_candidate_idx in top_indices:
85 | biol_data.rank = list(top_indices).index(valid_candidate_idx)
86 | else:
87 | biol_data.discarded_by = "Not in top N"
88 | out_ls.append(biol_data)
89 | return out_ls
90 |
91 |
92 | def find_top_candidates(
93 | total_score: npt.ArrayLike,
94 | dG_separated: npt.ArrayLike,
95 | n: int,
96 | scale_factor: float = 1,
97 | total_score_max: Optional[float] = None,
98 | dG_separated_max: Optional[float] = None,
99 | fit_without_outliers: bool = True,
100 | ) -> np.ndarray:
101 | """
102 | Fit multivariate gaussian (centered on median rather than mean) and then select best points
103 | according to lowest probability of being drawn subject to bounds.
104 | Args:
105 | total_score: Total score for candidates to select
106 |         dG_separated: dG_separated of candidates to select
107 |         n: N candidates to select
108 |         scale_factor: Scale total score of data points by this factor AFTER fitting the multivariate gaussian
109 | total_score_max: Maximum total score of selected candidates, if not specified use median
110 | dG_separated_max: Maximum dG_separated of selected candidates, if not specified use medians
111 | fit_without_outliers: Ignore points that are 1.5 IQR above/below the upper/lower quartile
112 | when fitting the gaussian.
113 |
114 | Returns:
115 |         Indices (into the input arrays) of the selected top-n candidates.
116 | """
117 | data = np.stack([np.array(total_score), np.array(dG_separated)], axis=1)
118 |
119 | if fit_without_outliers:
120 | # Define outliers (don't fit gaussian on these)
121 | ts_q1 = np.quantile(total_score, 0.25)
122 | ts_q3 = np.quantile(total_score, 0.75)
123 | ts_IQR = ts_q3 - ts_q1
124 | ts_lower_bound = ts_q1 - ts_IQR * 1.5
125 | ts_upper_bound = ts_q3 + ts_IQR * 1.5
126 |
127 | dGs_q1 = np.quantile(dG_separated, 0.25)
128 | dGs_q3 = np.quantile(dG_separated, 0.75)
129 | dGs_IQR = dGs_q3 - dGs_q1
130 | dGs_lower_bound = dGs_q1 - dGs_IQR * 1.5
131 | dGs_upper_bound = dGs_q3 + dGs_IQR * 1.5
132 |
133 | outlier_mask = (
134 | (ts_lower_bound < data[:, 0])
135 | & (data[:, 0] < ts_upper_bound)
136 | & (dGs_lower_bound < data[:, 1])
137 | & (data[:, 1] < dGs_upper_bound)
138 | )
139 |
140 | median = np.median(data[outlier_mask], axis=0)
141 | cov = np.cov(data[outlier_mask], rowvar=0)
142 | else:
143 | median = np.median(data, axis=0)
144 | cov = np.cov(data, rowvar=0)
145 | multivar_f = multivariate_normal(mean=median, cov=cov, allow_singular=True)
146 | xmax = total_score_max if total_score_max is not None else median[0]
147 | ymax = dG_separated_max if dG_separated_max is not None else median[1]
148 | centroid = np.array([xmax, ymax])
149 | mask = np.all((data < centroid), axis=1)
150 | data[:, 0] = scale_factor * (data[:, 0] - median[0]) + median[0]
151 | top_idx = np.argsort(multivar_f.pdf(data[mask]))[:n]
152 |     indices = np.where(mask)[0]
153 | return indices[top_idx]
154 |
--------------------------------------------------------------------------------
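An illustrative sketch of find_top_candidates on synthetic scores (random numbers, not pipeline output): only points below both the median total_score and the median dG_separated are eligible, and the n points with the lowest density under the fitted Gaussian are returned.

import numpy as np

from ab_characterisation.filter_steps import find_top_candidates

rng = np.random.default_rng(0)
total_score = rng.normal(loc=-300.0, scale=10.0, size=50)
dG_separated = rng.normal(loc=-25.0, scale=5.0, size=50)

# Indices of the five most "extreme" candidates in the lower-left of the score space.
top = find_top_candidates(total_score, dG_separated, n=5)
print("Selected candidate indices:", top)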
/src/ab_characterisation/pipeline_orchestration.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import typing as t
3 |
4 | import pandas as pd
5 | from loguru import logger
6 | from mpi4py import MPI
7 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig, save_output
8 |
9 | from ab_characterisation.filter_steps import (
10 | find_top_n, rosetta_antibody_filter, sequence_liability_filter, tap_filter
11 | )
12 | from ab_characterisation.rosetta_steps import rosetta_antibody_step, rosetta_complex_step
13 | from ab_characterisation.sequence_steps import sequence_liability_check
14 | from ab_characterisation.structure_steps import run_abb2, run_chimerax_superposition, run_tap
15 |
16 |
17 | def get_objects(config: RunConfig) -> list[BiologicsData]:
18 |     """
19 |     Read the input CSV and create one BiologicsData object per row.
20 |     Args:
21 |         config: run configuration; input_file must provide heavy_sequence, light_sequence, sequence_name and reference_complex columns.
22 | 
23 |     Returns:
24 |         list of BiologicsData objects, one per input row.
25 |     """
26 | data_objects = []
27 | df = pd.read_csv(config.input_file)
28 | for idx, row in df.iterrows():
29 | data_objects.append(
30 | BiologicsData(
31 | heavy_sequence=row.heavy_sequence,
32 | light_sequence=row.light_sequence,
33 | name=row.sequence_name,
34 | target_complex_reference=row.reference_complex,
35 | )
36 | )
37 | return data_objects
38 |
39 |
40 | def filtering_step(
41 | input_data: list[BiologicsData],
42 | step_name: str,
43 | criterion_function: t.Callable,
44 | config: RunConfig,
45 | ) -> list[BiologicsData]:
46 | """
47 | General framework for a step that performs filtering of the input data, labelling datapoints as discarded if they
48 | fail to pass a filter criterion.
49 | Args:
50 | input_data:
51 | step_name:
52 | criterion_function: Function mapping BiologicsData -> bool
53 | config:
54 |
55 | Returns:
56 | list of BiologicsData objects
57 | """
58 | output_data: list[BiologicsData] = []
59 | filter_count = 0
60 |
61 | for biol_data in input_data:
62 | if biol_data.discarded_by is None:
63 | filtered = criterion_function(biol_data, config)
64 | if filtered:
65 | biol_data.discarded_by = step_name
66 | filter_count += 1
67 | output_data.append(biol_data)
68 |
69 | logger.info(f"{filter_count} datapoints discarded during step {step_name}.")
70 | return output_data
71 |
72 |
73 | def computation_step(
74 | input_data: list[BiologicsData], computation_function: t.Callable, config: RunConfig
75 | ) -> list[BiologicsData]:
76 | """
77 | General framework for a step that performs computation on the input data, manipulating one or more of the dataclass
78 | fields.
79 |
80 | Args:
81 | input_data:
82 | computation_function: Function mapping BiologicsData -> BiologicsData, modifying the dataclass fields with the
83 | results of the computation
84 | config:
85 |
86 | Returns:
87 | list of BiologicsData objects
88 | """
89 |
90 | comm = MPI.COMM_WORLD
91 | rank = comm.Get_rank()
92 | size = comm.Get_size()
93 |
94 | # Calculate the chunk size for each process
95 | chunk_size = len(input_data) // size
96 | remainder = len(input_data) % size
97 |
98 | # Calculate the range for the current process
99 | local_start = rank * chunk_size + min(rank, remainder)
100 | local_end = local_start + chunk_size + (1 if rank < remainder else 0)
101 |
102 | # Perform the local computation
103 | local_results: list[BiologicsData] = []
104 | for biol_data in input_data[local_start:local_end]:
105 | if biol_data.discarded_by is None:
106 | biol_data = computation_function(biol_data, config)
107 | local_results.append(biol_data)
108 |
109 | # Gather the local results at the root process
110 | all_results = comm.gather(local_results, root=0)
111 |
112 | # Combine the results into a single list
113 | output_data: list[BiologicsData] = []
114 | if rank == 0:
115 | for result_list in all_results:
116 | output_data.extend(result_list)
117 | output_data = comm.bcast(output_data, root=0)
118 | return output_data
119 |
120 |
121 | def pipeline(config: RunConfig, mpi_rank: int, mpi_size: int) -> None:
122 |     """
123 |     Run the full characterisation pipeline: liability scanning, structure prediction, TAP,
124 |     Rosetta analyses and final candidate selection.
125 |     Args:
126 |         config: run configuration.
127 |         mpi_rank: MPI rank of the current process (only rank 0 logs at INFO level and writes the final output).
128 |         mpi_size: total number of MPI processes.
129 |     """
130 | logger.remove()
131 | if mpi_rank == 0:
132 | logger.add(
133 | sys.stdout,
134 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
135 | level="INFO",
136 | )
137 | else:
138 | logger.add(
139 | sys.stdout,
140 | format="{time:YYYY-MM-DD HH:mm:ss} | {level: <8} | {message}",
141 | level="WARNING",
142 | )
143 | biologics_objects = get_objects(config)
144 |
145 | logger.info("Identifying sequence liabilities")
146 | biologics_objects = computation_step(
147 | biologics_objects, sequence_liability_check, config
148 | )
149 | logger.info("Filtering by sequence liabilities")
150 | biologics_objects = filtering_step(
151 | biologics_objects,
152 | step_name="liabilities",
153 | criterion_function=sequence_liability_filter,
154 | config=config,
155 | )
156 |
157 | logger.info("Running ABB2")
158 | biologics_objects = computation_step(biologics_objects, run_abb2, config)
159 | logger.info("Running TAP")
160 | biologics_objects = computation_step(biologics_objects, run_tap, config)
161 | logger.info("Filtering TAP")
162 | biologics_objects = filtering_step(biologics_objects, "tap", tap_filter, config)
163 | logger.info("Running antibody-only Rosetta analysis")
164 | biologics_objects = computation_step(
165 | biologics_objects, rosetta_antibody_step, config
166 | )
167 | logger.info("Running filtering based on antibody-only Rosetta analysis")
168 | biologics_objects = filtering_step(
169 | biologics_objects, "rosetta_antibody", rosetta_antibody_filter, config
170 | )
171 | if not config.exclude_complex_analysis:
172 | logger.info("Running ChimeraX complex generation")
173 | biologics_objects = computation_step(
174 | biologics_objects, run_chimerax_superposition, config
175 | )
176 | logger.info("Running Rosetta complex analysis")
177 | biologics_objects = computation_step(
178 | biologics_objects, rosetta_complex_step, config
179 | )
180 |
181 | if mpi_rank == 0:
182 | logger.info("Identifying top N candidates")
183 | biologics_objects = find_top_n(biologics_objects, config)
184 | save_output(biol_data_ls=biologics_objects, config=config)
185 |
--------------------------------------------------------------------------------
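A hedged sketch of driving the pipeline directly (paths are hypothetical); the input CSV must provide the columns read in get_objects: heavy_sequence, light_sequence, sequence_name and reference_complex. A run would typically be launched under MPI, e.g. "mpirun -n 4 python run_pipeline.py".

from pathlib import Path

from mpi4py import MPI

from ab_characterisation.pipeline_orchestration import pipeline
from ab_characterisation.utils.data_classes import RunConfig

config = RunConfig(
    input_file="candidates.csv",
    output_directory=Path("pipeline_output"),
    rosetta_base_directory="/opt/rosetta",  # hypothetical Rosetta installation
)

comm = MPI.COMM_WORLD
pipeline(config, mpi_rank=comm.Get_rank(), mpi_size=comm.Get_size())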
/src/ab_characterisation/rosetta_steps.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import subprocess
3 | import tempfile
4 | from pathlib import Path
5 |
6 | import pandas as pd
7 |
8 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig
9 |
10 |
11 | def generic_rosetta_step(
12 | biol_data: BiologicsData,
13 | variables: dict[str, str],
14 | template: str,
15 | config: RunConfig,
16 | step_name: str,
17 | replicates: int = 1,
18 | ) -> pd.DataFrame:
19 |     """
20 |     Runs a templated Rosetta protocol in a temporary directory and collects the score file.
21 |     Args:
22 |         biol_data: candidate being processed (used to name the log files).
23 |         variables: placeholder -> value substitutions applied to the bash template.
24 |         template: base name of the bash/xml template pair in utils/rosetta_templates.
25 |         config: run configuration (used for the output/log directory).
26 |         step_name: label used in the log file names.
27 |         replicates: number of independent Rosetta runs to perform.
28 |     Returns:
29 |         DataFrame concatenating the parsed score.sc output of all replicates.
30 |     """
31 | outputs = []
32 | for replicate in range(replicates):
33 | with tempfile.TemporaryDirectory() as temp_dir:
34 | bash_template_path = (
35 | Path(__file__).parent / "utils" / "rosetta_templates" / f"{template}.sh"
36 | )
37 | xml_template_path = (
38 | Path(__file__).parent
39 | / "utils"
40 | / "rosetta_templates"
41 | / f"{template}.xml"
42 | )
43 |
44 | with open(bash_template_path) as inf_sh, open(
45 | Path(temp_dir) / f"{template}.sh", "w"
46 | ) as outf_sh:
47 | for line in inf_sh:
48 | for key, value in variables.items():
49 | line = line.replace(key, value)
50 | outf_sh.write(line)
51 |
52 | shutil.copy(xml_template_path, Path(temp_dir) / f"{template}.xml")
53 |
54 | with open(
55 | config.output_directory
56 | / "logs"
57 | / f"{biol_data.name}_rosetta_{step_name}_{replicate}.log",
58 | "w",
59 | ) as outf:
60 | subprocess.run(
61 | ["bash", f"{template}.sh"], cwd=temp_dir, stdout=outf, stderr=outf
62 | )
63 | output = pd.read_csv(
64 | Path(temp_dir) / "score.sc", delim_whitespace=True, skiprows=1
65 | )
66 | output["replicate"] = replicate
67 | outputs.append(output)
68 | return pd.concat(outputs)
69 |
70 |
71 | def rosetta_antibody_step(biol_data: BiologicsData, config: RunConfig) -> BiologicsData:
72 |     """
73 |     Runs the antibody-only Rosetta metrics protocol and stores/saves the resulting scores.
74 |     Args:
75 |         biol_data: candidate with antibody_structure already populated.
76 |         config: run configuration (Rosetta location, replicates, output directory).
77 | 
78 |     Returns:
79 |         the input BiologicsData with rosetta_output_ab_only populated.
80 |     """
81 | variables = {
82 | "": str(biol_data.antibody_structure),
83 | "": config.rosetta_base_directory,
84 | }
85 | result_df = generic_rosetta_step(
86 | biol_data,
87 | variables,
88 | "rosetta_metrics_ab_only",
89 | config,
90 | step_name="ab_only",
91 | replicates=config.rosetta_replicates,
92 | )
93 | biol_data.rosetta_output_ab_only = result_df
94 | result_df.to_csv(
95 | config.output_directory
96 | / "rosetta_output"
97 | / f"{biol_data.name}_rosetta_ab_only.csv"
98 | )
99 | return biol_data
100 |
101 |
102 | def rosetta_complex_step(biol_data: BiologicsData, config: RunConfig) -> BiologicsData:
103 |     """
104 |     Runs the Rosetta metrics protocol on the antibody-antigen complex and stores/saves the scores.
105 |     Args:
106 |         biol_data: candidate with chimerax_complex_structure already populated.
107 |         config: run configuration (Rosetta location, replicates, output directory).
108 | 
109 |     Returns:
110 |         the input BiologicsData with rosetta_output_complex populated.
111 |     """
112 | variables = {
113 | "": biol_data.chimerax_complex_structure,
114 | "": config.rosetta_base_directory,
115 | }
116 | result_df = generic_rosetta_step(
117 | biol_data,
118 | variables,
119 | "rosetta_metrics_complex",
120 | config,
121 | step_name="complex",
122 | replicates=config.rosetta_replicates,
123 | )
124 | biol_data.rosetta_output_complex = result_df
125 | result_df.to_csv(
126 | config.output_directory
127 | / "rosetta_output"
128 | / f"{biol_data.name}_rosetta_complex.csv"
129 | )
130 | return biol_data
131 |
--------------------------------------------------------------------------------
/src/ab_characterisation/sequence_steps.py:
--------------------------------------------------------------------------------
1 | from ab_characterisation.developability_tools.sequence_liabilities.main import scan_single
2 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig
3 |
4 |
5 | def sequence_liability_check(
6 | input_data: BiologicsData, config: RunConfig
7 | ) -> BiologicsData:
8 |     """
9 |     Scans the heavy and light chain sequences for sequence liabilities.
10 |     Args:
11 |         input_data: candidate to scan.
12 |         config: run configuration (unused here, kept for a uniform step signature).
13 |     Returns:
14 |         the input BiologicsData with sequence_liabilities populated.
15 |     """
16 | liabilities = scan_single(
17 | input_data.heavy_sequence, input_data.light_sequence, quiet=True
18 | )
19 | input_data.sequence_liabilities = liabilities
20 | return input_data
21 |
--------------------------------------------------------------------------------
/src/ab_characterisation/structure_steps.py:
--------------------------------------------------------------------------------
1 | from ImmuneBuilder import ABodyBuilder2
2 |
3 | from ab_characterisation.developability_tools.tap.main import run_tap as tap
4 | from ab_characterisation.utils.chimerax_utils import ChimeraInput, ChimeraOutput, run_chimerax
5 | from ab_characterisation.utils.data_classes import BiologicsData, RunConfig
6 |
7 |
8 | def run_abb2(biol_data: BiologicsData, config: RunConfig) -> BiologicsData:
9 | """
10 |     Runs ABodyBuilder2 on the heavy and light chain sequences, saves the model and records its path.
11 |     Args:
12 |         biol_data: candidate providing the heavy and light chain sequences.
13 |         config: run configuration (output directory).
14 | 
15 |     Returns:
16 |         the input BiologicsData with antibody_structure set to the saved model path.
17 | """
18 | predictor = ABodyBuilder2()
19 |
20 | sequences = {"H": biol_data.heavy_sequence, "L": biol_data.light_sequence}
21 |
22 | antibody = predictor.predict(sequences)
23 | antibody.save(
24 | str(config.output_directory / "antibody_models" / f"{biol_data.name}_model.pdb")
25 | )
26 | biol_data.antibody_structure = (
27 | config.output_directory / "antibody_models" / f"{biol_data.name}_model.pdb"
28 | ).resolve()
29 | return biol_data
30 |
31 |
32 | def run_tap(biol_data: BiologicsData, config: RunConfig) -> BiologicsData:
33 |     """
34 |     Runs TAP on the predicted antibody structure and stores the per-metric flags.
35 |     Args:
36 |         biol_data: candidate with antibody_structure already populated.
37 |         config: run configuration (unused here, kept for a uniform step signature).
38 | 
39 |     Returns:
40 |         the input BiologicsData with tap_flags populated.
41 |     """
42 | results = tap(biol_data.antibody_structure, outfile=None, quiet=True)
43 | biol_data.tap_flags = results
44 | return biol_data
45 |
46 |
47 | def run_chimerax_superposition(
48 | biol_data: BiologicsData, config: RunConfig
49 | ) -> BiologicsData:
50 |     """
51 |     Builds an antibody-antigen complex by fitting the predicted antibody into a map of the reference complex with ChimeraX.
52 |     Args:
53 |         biol_data: candidate with antibody_structure and target complex information populated.
54 |         config: run configuration (map resolution, output directory).
55 | 
56 |     Returns:
57 |         the input BiologicsData with chimerax_complex_structure set, or marked as discarded on failure.
58 |     """
59 | chimera_input = ChimeraInput(
60 | name=biol_data.name,
61 | template=biol_data.target_complex_reference,
62 | query_ab=biol_data.antibody_structure,
63 | template_ab_chains=biol_data.target_complex_antibody_chains,
64 | map_resolution=config.chimera_map_resolution,
65 | query_ab_chains="HL",
66 | template_ag_chains=biol_data.target_complex_antigen_chains,
67 | output_file=str(
68 | (
69 | config.output_directory
70 | / "complex_structures"
71 | / f"{biol_data.name}_complex.pdb"
72 | ).resolve()
73 | ),
74 | )
75 |
76 | chimera_output = run_chimerax(chimera_input, config)
77 | if chimera_output.success:
78 | biol_data.chimerax_complex_structure = chimera_output.output_file
79 | else:
80 | biol_data.discarded_by = "ChimeraX failure"
81 |
82 | return biol_data
83 |
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Exscientia/ab-characterisation/46ccd6452ec22e31c6c4a97327740b7a88ab0b83/src/ab_characterisation/utils/__init__.py
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/anarci_region_definition_utils.py:
--------------------------------------------------------------------------------
1 | from collections import defaultdict
2 | from dataclasses import dataclass, field # pylint: disable=C0302
3 | from typing import Dict, List, Tuple
4 |
5 |
6 | @dataclass
7 | class ExtractedRegions:
8 | """
9 |     Class to record a numbered antibody sequence with region annotations
10 | """
11 |
12 | region_numbering: List[Tuple[Tuple[int, str], str, str]] = field(
13 | init=False, repr=False, default_factory=list
14 | )
15 | region_sequences: Dict[str, str] = field(
16 | init=False, repr=True, default_factory=lambda: defaultdict(str)
17 | )
18 |
19 | def add_residue(
20 | self, current_regions: str, amino_acid: str, residue: Tuple[int, str]
21 | ) -> None:
22 | """
23 | Function to record a numbered residue with region annotation
24 | Args:
25 | current_regions: name of the region e.g. cdrh1
26 | amino_acid: one letter amino acid letter
27 | residue: antibody numbered position e.g. (110, " ")
28 |
29 | Returns:
30 |
31 | """
32 | if self.region_numbering and self.region_numbering[-1][0][0] > residue[0]:
33 | raise AssertionError(
34 | f"Incorrect numbering. Previous residue cannot come"
35 | f" after the added one: {self.region_numbering[-1][0]} and {residue}"
36 | )
37 | self.region_numbering.append((residue, amino_acid, current_regions))
38 | self.region_sequences[current_regions] += (
39 | amino_acid if amino_acid != "-" else ""
40 | )
41 |
42 |
43 | def define_imgt_regions() -> Dict[str, str]:
44 | """
45 | Antibody region definition according to imgt scheme
46 | """
47 | heavy_region = (
48 | "1" * 26 + "2" * 12 + "3" * 17 + "4" * 10 + "5" * 39 + "6" * 13 + "7" * 11
49 | )
50 | light_region = heavy_region
51 | assert len(heavy_region) == 128
52 |
53 | return {"H": heavy_region, "L": light_region}
54 |
55 |
56 | def define_chothia_regions() -> Dict[str, str]:
57 | light_region = (
58 | "1" * 23 + "2" * 17 + "3" * 15 + "4" * 14 + "5" * 35 + "6" * 13 + "7" * 11
59 | )
60 |
61 | heavy_region = (
62 | "1" * 26 + "2" * 11 + "3" * 19 + "4" * 8 + "5" * 42 + "6" * 11 + "7" * 11
63 | )
64 | for reg in [light_region, heavy_region]:
65 | assert len(reg) == 128
66 |
67 | return {"H": heavy_region, "L": light_region}
68 |
69 |
70 | def define_kabat_regions() -> Dict[str, str]:
71 | light_region = (
72 | "1" * 23 + "2" * 17 + "3" * 15 + "4" * 14 + "5" * 35 + "6" * 13 + "7" * 11
73 | )
74 | heavy_region = (
75 | "1" * 35 + "2" * 5 + "3" * 14 + "4" * 20 + "5" * 32 + "6" * 11 + "7" * 11
76 | )
77 |
78 | for reg in [light_region, heavy_region]:
79 | assert len(reg) == 128
80 |
81 | return {"H": heavy_region, "L": light_region}
82 |
83 |
84 | def define_contact_regions() -> Dict[str, str]:
85 | light_region = (
86 | "1" * 35 + "2" * 7 + "3" * 9 + "4" * 17 + "5" * 36 + "6" * 12 + "7" * 12
87 | )
88 | heavy_region = (
89 | "1" * 30 + "2" * 10 + "3" * 11 + "4" * 15 + "5" * 38 + "6" * 12 + "7" * 12
90 | )
91 | for reg in [light_region, heavy_region]:
92 | assert len(reg) == 128
93 |
94 | return {"H": heavy_region, "L": light_region}
95 |
96 |
97 | def define_north_regions() -> Dict[str, str]:
98 | light_region = (
99 | "1" * 23 + "2" * 17 + "3" * 14 + "4" * 15 + "5" * 35 + "6" * 13 + "7" * 11
100 | )
101 | heavy_region = (
102 | "1" * 23 + "2" * 17 + "3" * 14 + "4" * 12 + "5" * 38 + "6" * 13 + "7" * 11
103 | )
104 | for reg in [light_region, heavy_region]:
105 | assert len(reg) == 128
106 |
107 | return {"H": heavy_region, "L": light_region}
108 |
109 |
110 | _regions = {}
111 | for scheme, define_function in [
112 | ("imgt", define_imgt_regions),
113 | ("chothia", define_chothia_regions),
114 | ("kabat", define_kabat_regions),
115 | ("north", define_north_regions),
116 | ("contact", define_contact_regions),
117 | ]:
118 | _regions[scheme] = define_function()
119 |
120 | # For internal use only. These are not direct conversions and are handled heuristically.
121 | _index_to_imgt_state = {
122 | ("chothia", "H"): {
123 | 1: 0,
124 | 2: 1,
125 | 3: 2,
126 | 4: 3,
127 | 5: 4,
128 | 6: 6,
129 | 7: 7,
130 | 8: 8,
131 | 9: 9,
132 | 10: 10,
133 | 11: 11,
134 | 12: 12,
135 | 13: 13,
136 | 14: 14,
137 | 15: 15,
138 | 16: 16,
139 | 17: 17,
140 | 18: 18,
141 | 19: 19,
142 | 20: 20,
143 | 21: 21,
144 | 22: 22,
145 | 23: 23,
146 | 24: 24,
147 | 25: 25,
148 | 26: 26,
149 | 27: 27,
150 | 28: 28,
151 | 29: 29,
152 | 30: 30,
153 | 31: 35,
154 | 32: 36,
155 | 33: 37,
156 | 34: 38,
157 | 35: 39,
158 | 36: 40,
159 | 37: 41,
160 | 38: 42,
161 | 39: 43,
162 | 40: 44,
163 | 41: 45,
164 | 42: 46,
165 | 43: 47,
166 | 44: 48,
167 | 45: 49,
168 | 46: 50,
169 | 47: 51,
170 | 48: 52,
171 | 49: 53,
172 | 50: 54,
173 | 51: 55,
174 | 52: 59,
175 | 53: 60,
176 | 54: 61,
177 | 55: 62,
178 | 56: 63,
179 | 57: 64,
180 | 58: 65,
181 | 59: 66,
182 | 60: 67,
183 | 61: 68,
184 | 62: 69,
185 | 63: 70,
186 | 64: 72,
187 | 65: 73,
188 | 66: 74,
189 | 67: 75,
190 | 68: 76,
191 | 69: 77,
192 | 70: 78,
193 | 71: 79,
194 | 72: 80,
195 | 73: 81,
196 | 74: 82,
197 | 75: 83,
198 | 76: 84,
199 | 77: 85,
200 | 78: 86,
201 | 79: 87,
202 | 80: 88,
203 | 81: 89,
204 | 82: 93,
205 | 83: 94,
206 | 84: 95,
207 | 85: 96,
208 | 86: 97,
209 | 87: 98,
210 | 88: 99,
211 | 89: 100,
212 | 90: 101,
213 | 91: 102,
214 | 92: 103,
215 | 93: 104,
216 | 94: 105,
217 | 95: 106,
218 | 96: 107,
219 | 97: 108,
220 | 98: 109,
221 | 99: 110,
222 | 100: 114,
223 | 101: 115,
224 | 102: 116,
225 | 103: 117,
226 | 104: 118,
227 | 105: 119,
228 | 106: 120,
229 | 107: 121,
230 | 108: 122,
231 | 109: 123,
232 | 110: 124,
233 | 111: 125,
234 | 112: 126,
235 | 113: 127,
236 | },
237 | ("kabat", "H"): {
238 | 1: 0,
239 | 2: 1,
240 | 3: 2,
241 | 4: 3,
242 | 5: 4,
243 | 6: 6,
244 | 7: 7,
245 | 8: 8,
246 | 9: 9,
247 | 10: 10,
248 | 11: 11,
249 | 12: 12,
250 | 13: 13,
251 | 14: 14,
252 | 15: 15,
253 | 16: 16,
254 | 17: 17,
255 | 18: 18,
256 | 19: 19,
257 | 20: 20,
258 | 21: 21,
259 | 22: 22,
260 | 23: 23,
261 | 24: 24,
262 | 25: 25,
263 | 26: 26,
264 | 27: 27,
265 | 28: 28,
266 | 29: 29,
267 | 30: 30,
268 | 31: 31,
269 | 32: 32,
270 | 33: 33,
271 | 34: 34,
272 | 35: 35,
273 | 36: 40,
274 | 37: 41,
275 | 38: 42,
276 | 39: 43,
277 | 40: 44,
278 | 41: 45,
279 | 42: 46,
280 | 43: 47,
281 | 44: 48,
282 | 45: 49,
283 | 46: 50,
284 | 47: 51,
285 | 48: 52,
286 | 49: 53,
287 | 50: 54,
288 | 51: 55,
289 | 52: 59,
290 | 53: 60,
291 | 54: 61,
292 | 55: 62,
293 | 56: 63,
294 | 57: 64,
295 | 58: 65,
296 | 59: 66,
297 | 60: 67,
298 | 61: 68,
299 | 62: 69,
300 | 63: 70,
301 | 64: 72,
302 | 65: 73,
303 | 66: 74,
304 | 67: 75,
305 | 68: 76,
306 | 69: 77,
307 | 70: 78,
308 | 71: 79,
309 | 72: 80,
310 | 73: 81,
311 | 74: 82,
312 | 75: 83,
313 | 76: 84,
314 | 77: 85,
315 | 78: 86,
316 | 79: 87,
317 | 80: 88,
318 | 81: 89,
319 | 82: 93,
320 | 83: 94,
321 | 84: 95,
322 | 85: 96,
323 | 86: 97,
324 | 87: 98,
325 | 88: 99,
326 | 89: 100,
327 | 90: 101,
328 | 91: 102,
329 | 92: 103,
330 | 93: 104,
331 | 94: 105,
332 | 95: 106,
333 | 96: 107,
334 | 97: 108,
335 | 98: 109,
336 | 99: 110,
337 | 100: 114,
338 | 101: 115,
339 | 102: 116,
340 | 103: 117,
341 | 104: 118,
342 | 105: 119,
343 | 106: 120,
344 | 107: 121,
345 | 108: 122,
346 | 109: 123,
347 | 110: 124,
348 | 111: 125,
349 | 112: 126,
350 | 113: 127,
351 | },
352 | ("imgt", "H"): {
353 | 1: 0,
354 | 2: 1,
355 | 3: 2,
356 | 4: 3,
357 | 5: 4,
358 | 6: 5,
359 | 7: 6,
360 | 8: 7,
361 | 9: 8,
362 | 10: 9,
363 | 11: 10,
364 | 12: 11,
365 | 13: 12,
366 | 14: 13,
367 | 15: 14,
368 | 16: 15,
369 | 17: 16,
370 | 18: 17,
371 | 19: 18,
372 | 20: 19,
373 | 21: 20,
374 | 22: 21,
375 | 23: 22,
376 | 24: 23,
377 | 25: 24,
378 | 26: 25,
379 | 27: 26,
380 | 28: 27,
381 | 29: 28,
382 | 30: 29,
383 | 31: 30,
384 | 32: 31,
385 | 33: 32,
386 | 34: 33,
387 | 35: 34,
388 | 36: 35,
389 | 37: 36,
390 | 38: 37,
391 | 39: 38,
392 | 40: 39,
393 | 41: 40,
394 | 42: 41,
395 | 43: 42,
396 | 44: 43,
397 | 45: 44,
398 | 46: 45,
399 | 47: 46,
400 | 48: 47,
401 | 49: 48,
402 | 50: 49,
403 | 51: 50,
404 | 52: 51,
405 | 53: 52,
406 | 54: 53,
407 | 55: 54,
408 | 56: 55,
409 | 57: 56,
410 | 58: 57,
411 | 59: 58,
412 | 60: 59,
413 | 61: 60,
414 | 62: 61,
415 | 63: 62,
416 | 64: 63,
417 | 65: 64,
418 | 66: 65,
419 | 67: 66,
420 | 68: 67,
421 | 69: 68,
422 | 70: 69,
423 | 71: 70,
424 | 72: 71,
425 | 73: 72,
426 | 74: 73,
427 | 75: 74,
428 | 76: 75,
429 | 77: 76,
430 | 78: 77,
431 | 79: 78,
432 | 80: 79,
433 | 81: 80,
434 | 82: 81,
435 | 83: 82,
436 | 84: 83,
437 | 85: 84,
438 | 86: 85,
439 | 87: 86,
440 | 88: 87,
441 | 89: 88,
442 | 90: 89,
443 | 91: 90,
444 | 92: 91,
445 | 93: 92,
446 | 94: 93,
447 | 95: 94,
448 | 96: 95,
449 | 97: 96,
450 | 98: 97,
451 | 99: 98,
452 | 100: 99,
453 | 101: 100,
454 | 102: 101,
455 | 103: 102,
456 | 104: 103,
457 | 105: 104,
458 | 106: 105,
459 | 107: 106,
460 | 108: 107,
461 | 109: 108,
462 | 110: 109,
463 | 111: 110,
464 | 112: 111,
465 | 113: 112,
466 | 114: 113,
467 | 115: 114,
468 | 116: 115,
469 | 117: 116,
470 | 118: 117,
471 | 119: 118,
472 | 120: 119,
473 | 121: 120,
474 | 122: 121,
475 | 123: 122,
476 | 124: 123,
477 | 125: 124,
478 | 126: 125,
479 | 127: 126,
480 | 128: 127,
481 | },
482 | ("chothia", "L"): {
483 | 1: 0,
484 | 2: 1,
485 | 3: 2,
486 | 4: 3,
487 | 5: 4,
488 | 6: 5,
489 | 7: 6,
490 | 8: 7,
491 | 9: 8,
492 | 10: 9,
493 | 11: 10,
494 | 12: 11,
495 | 13: 12,
496 | 14: 13,
497 | 15: 14,
498 | 16: 15,
499 | 17: 16,
500 | 18: 17,
501 | 19: 18,
502 | 20: 19,
503 | 21: 20,
504 | 22: 21,
505 | 23: 22,
506 | 24: 23,
507 | 25: 24,
508 | 26: 25,
509 | 27: 26,
510 | 28: 27,
511 | 29: 28,
512 | 30: 35,
513 | 31: 36,
514 | 32: 37,
515 | 33: 38,
516 | 34: 39,
517 | 35: 40,
518 | 36: 41,
519 | 37: 42,
520 | 38: 43,
521 | 39: 44,
522 | 40: 45,
523 | 41: 46,
524 | 42: 47,
525 | 43: 48,
526 | 44: 49,
527 | 45: 50,
528 | 46: 51,
529 | 47: 52,
530 | 48: 53,
531 | 49: 54,
532 | 50: 55,
533 | 51: 56,
534 | 52: 57,
535 | 53: 65,
536 | 54: 66,
537 | 55: 67,
538 | 56: 68,
539 | 57: 69,
540 | 58: 70,
541 | 59: 72,
542 | 60: 73,
543 | 61: 74,
544 | 62: 75,
545 | 63: 76,
546 | 64: 77,
547 | 65: 78,
548 | 66: 81,
549 | 67: 82,
550 | 68: 83,
551 | 69: 84,
552 | 70: 85,
553 | 71: 86,
554 | 72: 87,
555 | 73: 88,
556 | 74: 89,
557 | 75: 90,
558 | 76: 91,
559 | 77: 92,
560 | 78: 93,
561 | 79: 94,
562 | 80: 95,
563 | 81: 96,
564 | 82: 97,
565 | 83: 98,
566 | 84: 99,
567 | 85: 100,
568 | 86: 101,
569 | 87: 102,
570 | 88: 103,
571 | 89: 104,
572 | 90: 105,
573 | 91: 106,
574 | 92: 107,
575 | 93: 108,
576 | 94: 109,
577 | 95: 114,
578 | 96: 115,
579 | 97: 116,
580 | 98: 117,
581 | 99: 118,
582 | 100: 119,
583 | 101: 120,
584 | 102: 121,
585 | 103: 122,
586 | 104: 123,
587 | 105: 124,
588 | 106: 125,
589 | 107: 126,
590 | 108: 127,
591 | },
592 | ("martin", "H"): {
593 | 1: 0,
594 | 2: 1,
595 | 3: 2,
596 | 4: 3,
597 | 5: 4,
598 | 6: 5,
599 | 7: 6,
600 | 8: 8,
601 | 9: 9,
602 | 10: 10,
603 | 11: 11,
604 | 12: 12,
605 | 13: 13,
606 | 14: 14,
607 | 15: 15,
608 | 16: 16,
609 | 17: 17,
610 | 18: 18,
611 | 19: 19,
612 | 20: 20,
613 | 21: 21,
614 | 22: 22,
615 | 23: 23,
616 | 24: 24,
617 | 25: 25,
618 | 26: 26,
619 | 27: 27,
620 | 28: 28,
621 | 29: 29,
622 | 30: 30,
623 | 31: 35,
624 | 32: 36,
625 | 33: 37,
626 | 34: 38,
627 | 35: 39,
628 | 36: 40,
629 | 37: 41,
630 | 38: 42,
631 | 39: 43,
632 | 40: 44,
633 | 41: 45,
634 | 42: 46,
635 | 43: 47,
636 | 44: 48,
637 | 45: 49,
638 | 46: 50,
639 | 47: 51,
640 | 48: 52,
641 | 49: 53,
642 | 50: 54,
643 | 51: 55,
644 | 52: 59,
645 | 53: 60,
646 | 54: 61,
647 | 55: 62,
648 | 56: 63,
649 | 57: 64,
650 | 58: 65,
651 | 59: 66,
652 | 60: 67,
653 | 61: 68,
654 | 62: 69,
655 | 63: 70,
656 | 64: 72,
657 | 65: 73,
658 | 66: 74,
659 | 67: 75,
660 | 68: 76,
661 | 69: 77,
662 | 70: 78,
663 | 71: 79,
664 | 72: 83,
665 | 73: 84,
666 | 74: 85,
667 | 75: 86,
668 | 76: 87,
669 | 77: 88,
670 | 78: 89,
671 | 79: 90,
672 | 80: 91,
673 | 81: 92,
674 | 82: 93,
675 | 83: 94,
676 | 84: 95,
677 | 85: 96,
678 | 86: 97,
679 | 87: 98,
680 | 88: 99,
681 | 89: 100,
682 | 90: 101,
683 | 91: 102,
684 | 92: 103,
685 | 93: 104,
686 | 94: 105,
687 | 95: 106,
688 | 96: 107,
689 | 97: 108,
690 | 98: 109,
691 | 99: 110,
692 | 100: 114,
693 | 101: 115,
694 | 102: 116,
695 | 103: 117,
696 | 104: 118,
697 | 105: 119,
698 | 106: 120,
699 | 107: 121,
700 | 108: 122,
701 | 109: 123,
702 | 110: 124,
703 | 111: 125,
704 | 112: 126,
705 | 113: 127,
706 | },
707 | ("kabat", "L"): {
708 | 1: 0,
709 | 2: 1,
710 | 3: 2,
711 | 4: 3,
712 | 5: 4,
713 | 6: 5,
714 | 7: 6,
715 | 8: 7,
716 | 9: 8,
717 | 10: 9,
718 | 11: 10,
719 | 12: 11,
720 | 13: 12,
721 | 14: 13,
722 | 15: 14,
723 | 16: 15,
724 | 17: 16,
725 | 18: 17,
726 | 19: 18,
727 | 20: 19,
728 | 21: 20,
729 | 22: 21,
730 | 23: 22,
731 | 24: 23,
732 | 25: 24,
733 | 26: 25,
734 | 27: 32,
735 | 28: 33,
736 | 29: 34,
737 | 30: 35,
738 | 31: 36,
739 | 32: 37,
740 | 33: 38,
741 | 34: 39,
742 | 35: 40,
743 | 36: 41,
744 | 37: 42,
745 | 38: 43,
746 | 39: 44,
747 | 40: 45,
748 | 41: 46,
749 | 42: 47,
750 | 43: 48,
751 | 44: 49,
752 | 45: 50,
753 | 46: 51,
754 | 47: 52,
755 | 48: 53,
756 | 49: 54,
757 | 50: 55,
758 | 51: 56,
759 | 52: 57,
760 | 53: 65,
761 | 54: 66,
762 | 55: 67,
763 | 56: 68,
764 | 57: 69,
765 | 58: 70,
766 | 59: 72,
767 | 60: 73,
768 | 61: 74,
769 | 62: 75,
770 | 63: 76,
771 | 64: 77,
772 | 65: 78,
773 | 66: 81,
774 | 67: 82,
775 | 68: 83,
776 | 69: 84,
777 | 70: 85,
778 | 71: 86,
779 | 72: 87,
780 | 73: 88,
781 | 74: 89,
782 | 75: 90,
783 | 76: 91,
784 | 77: 92,
785 | 78: 93,
786 | 79: 94,
787 | 80: 95,
788 | 81: 96,
789 | 82: 97,
790 | 83: 98,
791 | 84: 99,
792 | 85: 100,
793 | 86: 101,
794 | 87: 102,
795 | 88: 103,
796 | 89: 104,
797 | 90: 105,
798 | 91: 106,
799 | 92: 107,
800 | 93: 108,
801 | 94: 109,
802 | 95: 114,
803 | 96: 115,
804 | 97: 116,
805 | 98: 117,
806 | 99: 118,
807 | 100: 119,
808 | 101: 120,
809 | 102: 121,
810 | 103: 122,
811 | 104: 123,
812 | 105: 124,
813 | 106: 125,
814 | 107: 126,
815 | 108: 127,
816 | },
817 | ("imgt", "L"): {
818 | 1: 0,
819 | 2: 1,
820 | 3: 2,
821 | 4: 3,
822 | 5: 4,
823 | 6: 5,
824 | 7: 6,
825 | 8: 7,
826 | 9: 8,
827 | 10: 9,
828 | 11: 10,
829 | 12: 11,
830 | 13: 12,
831 | 14: 13,
832 | 15: 14,
833 | 16: 15,
834 | 17: 16,
835 | 18: 17,
836 | 19: 18,
837 | 20: 19,
838 | 21: 20,
839 | 22: 21,
840 | 23: 22,
841 | 24: 23,
842 | 25: 24,
843 | 26: 25,
844 | 27: 26,
845 | 28: 27,
846 | 29: 28,
847 | 30: 29,
848 | 31: 30,
849 | 32: 31,
850 | 33: 32,
851 | 34: 33,
852 | 35: 34,
853 | 36: 35,
854 | 37: 36,
855 | 38: 37,
856 | 39: 38,
857 | 40: 39,
858 | 41: 40,
859 | 42: 41,
860 | 43: 42,
861 | 44: 43,
862 | 45: 44,
863 | 46: 45,
864 | 47: 46,
865 | 48: 47,
866 | 49: 48,
867 | 50: 49,
868 | 51: 50,
869 | 52: 51,
870 | 53: 52,
871 | 54: 53,
872 | 55: 54,
873 | 56: 55,
874 | 57: 56,
875 | 58: 57,
876 | 59: 58,
877 | 60: 59,
878 | 61: 60,
879 | 62: 61,
880 | 63: 62,
881 | 64: 63,
882 | 65: 64,
883 | 66: 65,
884 | 67: 66,
885 | 68: 67,
886 | 69: 68,
887 | 70: 69,
888 | 71: 70,
889 | 72: 71,
890 | 73: 72,
891 | 74: 73,
892 | 75: 74,
893 | 76: 75,
894 | 77: 76,
895 | 78: 77,
896 | 79: 78,
897 | 80: 79,
898 | 81: 80,
899 | 82: 81,
900 | 83: 82,
901 | 84: 83,
902 | 85: 84,
903 | 86: 85,
904 | 87: 86,
905 | 88: 87,
906 | 89: 88,
907 | 90: 89,
908 | 91: 90,
909 | 92: 91,
910 | 93: 92,
911 | 94: 93,
912 | 95: 94,
913 | 96: 95,
914 | 97: 96,
915 | 98: 97,
916 | 99: 98,
917 | 100: 99,
918 | 101: 100,
919 | 102: 101,
920 | 103: 102,
921 | 104: 103,
922 | 105: 104,
923 | 106: 105,
924 | 107: 106,
925 | 108: 107,
926 | 109: 108,
927 | 110: 109,
928 | 111: 110,
929 | 112: 111,
930 | 113: 112,
931 | 114: 113,
932 | 115: 114,
933 | 116: 115,
934 | 117: 116,
935 | 118: 117,
936 | 119: 118,
937 | 120: 119,
938 | 121: 120,
939 | 122: 121,
940 | 123: 122,
941 | 124: 123,
942 | 125: 124,
943 | 126: 125,
944 | 127: 126,
945 | 128: 127,
946 | },
947 | ("martin", "L"): {
948 | 1: 0,
949 | 2: 1,
950 | 3: 2,
951 | 4: 3,
952 | 5: 4,
953 | 6: 5,
954 | 7: 6,
955 | 8: 7,
956 | 9: 8,
957 | 10: 9,
958 | 11: 10,
959 | 12: 11,
960 | 13: 12,
961 | 14: 13,
962 | 15: 14,
963 | 16: 15,
964 | 17: 16,
965 | 18: 17,
966 | 19: 18,
967 | 20: 19,
968 | 21: 20,
969 | 22: 21,
970 | 23: 22,
971 | 24: 23,
972 | 25: 24,
973 | 26: 25,
974 | 27: 26,
975 | 28: 27,
976 | 29: 28,
977 | 30: 35,
978 | 31: 36,
979 | 32: 37,
980 | 33: 38,
981 | 34: 39,
982 | 35: 40,
983 | 36: 41,
984 | 37: 42,
985 | 38: 43,
986 | 39: 44,
987 | 40: 45,
988 | 41: 46,
989 | 42: 47,
990 | 43: 48,
991 | 44: 49,
992 | 45: 50,
993 | 46: 51,
994 | 47: 52,
995 | 48: 53,
996 | 49: 54,
997 | 50: 55,
998 | 51: 56,
999 | 52: 57,
1000 | 53: 65,
1001 | 54: 66,
1002 | 55: 67,
1003 | 56: 68,
1004 | 57: 69,
1005 | 58: 70,
1006 | 59: 72,
1007 | 60: 73,
1008 | 61: 74,
1009 | 62: 75,
1010 | 63: 76,
1011 | 64: 77,
1012 | 65: 78,
1013 | 66: 81,
1014 | 67: 82,
1015 | 68: 83,
1016 | 69: 84,
1017 | 70: 85,
1018 | 71: 86,
1019 | 72: 87,
1020 | 73: 88,
1021 | 74: 89,
1022 | 75: 90,
1023 | 76: 91,
1024 | 77: 92,
1025 | 78: 93,
1026 | 79: 94,
1027 | 80: 95,
1028 | 81: 96,
1029 | 82: 97,
1030 | 83: 98,
1031 | 84: 99,
1032 | 85: 100,
1033 | 86: 101,
1034 | 87: 102,
1035 | 88: 103,
1036 | 89: 104,
1037 | 90: 105,
1038 | 91: 106,
1039 | 92: 107,
1040 | 93: 108,
1041 | 94: 109,
1042 | 95: 114,
1043 | 96: 115,
1044 | 97: 116,
1045 | 98: 117,
1046 | 99: 118,
1047 | 100: 119,
1048 | 101: 120,
1049 | 102: 121,
1050 | 103: 122,
1051 | 104: 123,
1052 | 105: 124,
1053 | 106: 125,
1054 | 107: 126,
1055 | 108: 127,
1056 | },
1057 | }
1058 |
1059 | # Wolfguy will be deprecated in ANARCI v1.0.0
1060 | wolfguy_indexdiv50_to_region = {
1061 | "H": ["fwh1", "cdrh1", "fwh2", "cdrh2", "fwh3", "cdrh3", "fwh4"],
1062 | "L": ["fwl1", "cdrl1", "fwl2", "cdrl2", "fwl3", "cdrl3", "fwl4"],
1063 | }
1064 |
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/anarci_utils.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass, field
2 | from typing import Optional, Union
3 |
4 | from ab_characterisation.utils.anarci_region_definition_utils import (_index_to_imgt_state,
5 | _regions)
6 |
7 | _reg_one2three = {
8 | "1": "fw%s1",
9 | "2": "cdr%s1",
10 | "3": "fw%s2",
11 | "4": "cdr%s2",
12 | "5": "fw%s3",
13 | "6": "cdr%s3",
14 | "7": "fw%s4",
15 | }
16 |
17 |
18 | @dataclass
19 | class Accept: # pylint: disable=R0902
20 | """
21 |     Class that takes ANARCI numbering and classifies each position according to antibody region
22 | """
23 |
24 | _defined_regions: list[str] = field(
25 | init=False,
26 | repr=False,
27 | default_factory=lambda: [
28 | "fwh1",
29 | "fwh2",
30 | "fwh3",
31 | "fwh4",
32 | "fwl1",
33 | "fwl2",
34 | "fwl3",
35 | "fwl4",
36 | "cdrh1",
37 | "cdrh2",
38 | "cdrh3",
39 | "cdrl1",
40 | "cdrl2",
41 | "cdrl3",
42 | ],
43 | )
44 | numbering_scheme: str = field(default="imgt")
45 | definition: str = field(default="imgt")
46 | not_defined: bool = field(default=False)
47 | positions: dict[str, set[tuple[int, str]]] = field(init=False)
48 | exclude: dict[str, set[tuple[int, str]]] = field(init=False)
49 | regions: set[str] = field(init=False, default_factory=set)
50 |
51 | def __post_init__(self) -> None:
52 |
53 | self._macro_regions = {
54 | "hframework": {"fwh1", "fwh2", "fwh3", "fwh4"},
55 | "hcdrs": {"cdrh1", "cdrh2", "cdrh3"},
56 | "lframework": {"fwl1", "fwl2", "fwl3", "fwl4"},
57 | "lcdrs": {"cdrl1", "cdrl2", "cdrl3"},
58 | }
59 | self._macro_regions.update(
60 | {
61 | "framework": self._macro_regions["hframework"]
62 | | self._macro_regions["lframework"],
63 | "cdrs": self._macro_regions["hcdrs"] | self._macro_regions["lcdrs"],
64 | "vh": self._macro_regions["hcdrs"] | self._macro_regions["hframework"],
65 | "vl": self._macro_regions["lcdrs"] | self._macro_regions["lframework"],
66 | }
67 | )
68 |
69 | self._macro_regions.update(
70 | {"fv": self._macro_regions["vh"] | self._macro_regions["vl"]}
71 | )
72 |
73 | self.positions = {"H": set(), "L": set()}
74 | self.exclude = {"H": set(), "L": set()}
75 |
76 | def set_regions(self, regions: Union[list, str, None] = None) -> None:
77 | """
78 |         Set the regions to be used. Will clear anything added using add_regions.
79 | """
80 | if not regions:
81 | raise AssertionError(
82 | f"Need to specify a list of regions: {self._defined_regions}"
83 | )
84 |
85 | if isinstance(regions, str):
86 | regions = [regions]
87 |
88 | if self.not_defined:
89 | self.regions = self._macro_regions["fv"]
90 | else:
91 | self.regions = set()
92 |
93 | self.add_regions(regions)
94 |
95 | def add_regions(self, regions: list) -> None:
96 | """
97 | Add regions to the selection.
98 | """
99 | for region in regions:
100 | region = region.lower()
101 | if region in self._defined_regions:
102 | if self.not_defined:
103 | self.regions = self.regions - set([region])
104 | else:
105 | self.regions.add(region)
106 | elif region in self._macro_regions:
107 | if self.not_defined:
108 | self.regions = self.regions - self._macro_regions[region]
109 | else:
110 | self.regions = self.regions | self._macro_regions[region]
111 | else:
112 | raise AssertionError(
113 | f"Got unexpected region: {region}. Allowed: {self._defined_regions} "
114 | )
115 |
116 | def add_positions(self, positions: list[tuple[int, str]], chain: str) -> None:
117 | for position in positions:
118 | self.positions[chain].add(position)
119 |
120 | def exclude_positions(self, positions: list[tuple[int, str]], chain: str) -> None:
121 | for position in positions:
122 | self.exclude[chain].add(position)
123 |
124 | def accept(self, position: tuple[int, str], chain: str) -> Optional[int]:
125 | if position in self.exclude[chain]:
126 | return None
127 | if (
128 | get_region(position, chain, self.numbering_scheme, self.definition)
129 | in self.regions
130 | or position in self.positions[chain]
131 | ):
132 | return 1
133 | return None
134 |
135 |
136 | def get_region( # pylint: disable=R0911
137 | position: tuple[int, str],
138 | chain: str,
139 | numbering_scheme: str = "imgt",
140 | definition: str = "imgt",
141 | ) -> str:
142 | """
143 | Get the region in which the position belongs given the chain, numbering scheme and definition.
144 | **Note** this function does not know about insertions on the sequence. Therefore, it will get the region annotation
145 | wrong when using non-equivalent scheme-definitions.
146 | To get around this please use the annotate_regions function
147 | which implements heuristics to get the definition correct
148 | in the scheme.
149 | """
150 |
151 | if numbering_scheme == "wolfguy" or definition == "wolfguy":
152 | raise NotImplementedError(
153 |             "Wolfguy cdr/framework identification is not implemented"
154 | )
155 |
156 | index, insertion = position
157 | chain = chain.upper()
158 |
159 | # Horrible exception cases revolving around the kabat scheme/definition and cdr h1
160 | # Kabat numbering scheme will be deprecated in ANARCI v1.0.0
161 | if definition == "kabat":
162 | if (
163 | numbering_scheme == "kabat" and chain == "H" and 31 <= index < 36
164 | ): # Kabat scheme kabat definition.
165 | if index == 35:
166 | if insertion in " AB": # Position 31 to 35B
167 | return "cdrh1"
168 |
169 | return "fwh2" # 31C would be framework.
170 |
171 | return "cdrh1"
172 | if numbering_scheme == "kabat": # Kabat numbering, chothia or imgt definitions.
173 | if definition == "chothia" and chain == "H" and 33 <= index < 36:
174 | return "fwh2"
175 | if definition == "imgt" and chain == "H" and 34 <= index < 36:
176 | return "fwh2"
177 |
178 | try:
179 | return (
180 | _reg_one2three[
181 | _regions[definition][chain][
182 | _index_to_imgt_state[(numbering_scheme, chain)][index]
183 | ]
184 | ]
185 | % chain.lower()
186 | )
187 | except KeyError:
188 | return "?"
189 |
--------------------------------------------------------------------------------
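A short illustration of the region lookup above; the expected values follow from the IMGT region strings in anarci_region_definition_utils.

from ab_characterisation.utils.anarci_utils import Accept, get_region

# IMGT position 107 on the heavy chain lies in CDR-H3; position 45 on the light chain is framework 2.
print(get_region((107, " "), "H"))   # "cdrh3"
print(get_region((45, " "), "L"))    # "fwl2"

# Accept tests positions against a region selection.
acceptor = Accept()
acceptor.set_regions(["cdrs"])
print(acceptor.accept((107, " "), "H"))  # 1 (position is in a CDR)
print(acceptor.accept((10, " "), "H"))   # None (framework position)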
/src/ab_characterisation/utils/chimerax_utils.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import tempfile
3 | from dataclasses import dataclass
4 | from pathlib import Path
5 |
6 | from ImmuneBuilder.refine import refine
7 |
8 | from ab_characterisation.utils.data_classes import RunConfig
9 |
10 |
11 | @dataclass
12 | class ChimeraInput:
13 | name: str
14 | template: str
15 | query_ab: str
16 | template_ab_chains: str
17 | map_resolution: float
18 | query_ab_chains: str
19 | template_ag_chains: str
20 | output_file: str
21 |
22 |
23 | @dataclass
24 | class ChimeraOutput:
25 | success: bool
26 | output_file: str
27 |
28 |
29 | def write_script(script_name: str, payload: ChimeraInput) -> None:
30 |     """
31 |     Writes a ChimeraX Python script that creates a density map of the template antibody chains,
32 |     fits the query antibody into it, selects the query antibody together with the template antigen
33 |     chains, and saves the selection as a PDB file.
34 | 
35 |     Args:
36 |         script_name: path the ChimeraX script is written to.
37 |         payload: input structures, chain identifiers, map resolution and output location.
38 |     """
39 | with open(script_name, "w") as outf:
40 | outf.write("from chimerax.core.commands import run\n")
41 | outf.write(f"run(session, 'open {payload.template}')\n")
42 | outf.write(
43 | f"run(session, 'molmap /{','.join(list(payload.template_ab_chains))} {payload.map_resolution}')\n"
44 | )
45 | outf.write(f"run(session, 'open {payload.query_ab}')\n")
46 | outf.write("run(session, 'fitmap #3 inMap #2 search 10')\n")
47 | outf.write(
48 | f"""run(session, "select #3/{','.join(list(payload.query_ab_chains))}#1/{','.join(list(payload.template_ag_chains))}")\n"""
49 | )
50 | outf.write(
51 | f"""run(session, "save {payload.output_file} format pdb selectedOnly true")\n"""
52 | )
53 | outf.write("""run(session, "exit")\n""")
54 |
55 |
56 | def run_chimerax(payload: ChimeraInput, config: RunConfig) -> ChimeraOutput:
57 |     """
58 |     Uses ChimeraX to create a complex PDB file of the query antibody and the target antigen, using the
59 |     template complex to guide the superposition, then refines the result with ImmuneBuilder's refine.
60 |     Args:
61 |         payload: input structures, chain identifiers and output location.
62 |         config: run configuration (used for the log directory).
63 |     Returns:
64 |         ChimeraOutput with the path to the generated complex and a success flag.
65 |     """
66 | with tempfile.NamedTemporaryFile(suffix=".py") as temp_f:
67 | script_name = temp_f.name
68 | write_script(payload=payload, script_name=script_name)
69 |
70 | cmd = ["ChimeraX", "--script", script_name, "--nogui"]
71 | with open(
72 | config.output_directory / "logs" / f"{payload.name}_chimera.log", "w"
73 | ) as outf:
74 | subprocess.run(cmd, check=True, stderr=outf, stdout=outf)
75 |
76 | if not Path(payload.output_file).exists():
77 | output = ChimeraOutput(output_file=payload.output_file, success=False)
78 | return output
79 |
80 | with open(payload.output_file) as inf:
81 | lines = inf.readlines()
82 | with open(payload.output_file, "w") as outf:
83 | for line in lines:
84 | if line.startswith("ATOM"):
85 | outf.write(line)
86 |
87 | # refinement
88 | refined_output = payload.output_file.replace(".pdb", "_refined.pdb")
89 | success = refine(input_file=payload.output_file, output_file=refined_output)
90 | if not success:
91 | output = ChimeraOutput(output_file=refined_output, success=success)
92 | return output
93 |
94 | output = ChimeraOutput(
95 | output_file=payload.output_file, success=Path(payload.output_file).exists()
96 | )
97 | return output
98 |
--------------------------------------------------------------------------------
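A hedged sketch of generating the ChimeraX script without launching ChimeraX, useful for inspecting the commands that will be run; all file names and chain identifiers below are hypothetical.

from ab_characterisation.utils.chimerax_utils import ChimeraInput, write_script

payload = ChimeraInput(
    name="candidate_1",
    template="reference_complex.pdb",  # template antibody-antigen complex
    query_ab="candidate_1_model.pdb",  # predicted antibody to superpose
    template_ab_chains="HL",
    map_resolution=6.0,
    query_ab_chains="HL",
    template_ag_chains="A",
    output_file="candidate_1_complex.pdb",
)

write_script("fit_candidate_1.py", payload)
print(open("fit_candidate_1.py").read())  # inspect the generated ChimeraX commands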
/src/ab_characterisation/utils/data_classes.py:
--------------------------------------------------------------------------------
1 | import typing as t
2 | from dataclasses import dataclass, field
3 | from pathlib import Path
4 | from typing import Optional
5 |
6 | import numpy as np
7 | import pandas as pd
8 |
9 | from ab_characterisation.developability_tools.sequence_liabilities.scanner_classes import \
10 | SequenceLiability
11 | from ab_characterisation.utils.rosetta_utils import aggregate_rosetta_metrics
12 |
13 |
14 | @dataclass
15 | class BiologicsData:
16 |     """Per-candidate record passed between pipeline steps; fields are filled in as the pipeline progresses."""
17 |
18 | heavy_sequence: str
19 | light_sequence: str
20 | name: str
21 | target_complex_reference: str
22 | target_complex_antigen_chains: str = "A"
23 | target_complex_antibody_chains: str = "HL"
24 | antibody_structure: t.Optional[str] = None
25 | discarded_by: t.Optional[str] = None
26 | tap_flags: list = field(default_factory=lambda: [])
27 | sequence_liabilities: list[SequenceLiability] = field(default_factory=lambda: [])
28 | rosetta_output_ab_only: Optional[pd.DataFrame] = None
29 | chimerax_complex_structure: t.Optional[str] = None
30 | rosetta_output_complex: Optional[pd.DataFrame] = None
31 | rank: Optional[int] = None
32 |
33 |
34 | @dataclass
35 | class RunConfig:
36 |     """Configuration for a pipeline run; output subdirectories are created on initialisation."""
37 |
38 | input_file: str
39 | output_directory: Path
40 |     rosetta_base_directory: Optional[str] = None
41 | chimera_map_resolution: float = 6.0
42 | dq_sequence_liabilities: list[str] = field(
43 | default_factory=lambda: ["Unpaired cysteine", "N-linked glycosylation"]
44 | )
45 | top_n: int = 100
46 | rosetta_replicates: int = 1
47 | exclude_complex_analysis: bool = False
48 |
49 | def __post_init__(self):
50 | self.output_directory.mkdir(exist_ok=True)
51 | (self.output_directory / "complex_structures").mkdir(exist_ok=True)
52 | (self.output_directory / "antibody_models").mkdir(exist_ok=True)
53 | (self.output_directory / "logs").mkdir(exist_ok=True)
54 | (self.output_directory / "rosetta_output").mkdir(exist_ok=True)
55 |
56 |
57 | def save_output(biol_data_ls: list[BiologicsData], config: RunConfig) -> None:
58 | row_dicts = []
59 | for biol_data in biol_data_ls:
60 | row_dict = {}
61 | for key, value in biol_data.__dict__.items():
62 | if isinstance(value, str):
63 | row_dict[key] = value
64 | elif isinstance(value, int):
65 | row_dict[key] = value
66 | elif value is None:
67 | row_dict[key] = np.nan
68 | elif key == "tap_flags":
69 | for tap_metric in value:
70 | row_dict[f"TAP-{tap_metric.metric_name}"] = tap_metric.flag
71 | elif key == "sequence_liabilities":
72 |                 row_dict[key] = "|".join(
73 |                     f"{liability.liability_type}-{liability.motif}-{liability.positions_string}"
74 |                     for liability in value
75 |                 )
76 | elif key == "rosetta_output_ab_only":
77 | value = aggregate_rosetta_metrics(value)
78 | value.columns = ["ab-" + col for col in value.columns]
79 | for col in value.columns:
80 | row_dict[col] = value[col].iloc[0]
81 | elif key == "rosetta_output_complex":
82 | value = aggregate_rosetta_metrics(
83 | value, metrics=("dG_separated", "total_score")
84 | )
85 | value.columns = ["complex-" + col for col in value.columns]
86 | for col in value.columns:
87 | row_dict[col] = value[col].iloc[0]
88 | row_dicts.append(row_dict)
89 | pd.DataFrame(row_dicts).to_csv(config.output_directory / "output.csv")
90 |
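91 | # Illustrative usage (the record list and file names below are placeholders, not defined in this module):
92 | #     config = RunConfig(input_file="candidates.csv", output_directory=Path("run_output"))
93 | #     save_output(processed_records, config)  # writes run_output/output.csv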
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_ab_only.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ROSETTA3=
4 |
5 | $ROSETTA3/main/source/bin/rosetta_scripts.static.linuxgccrelease \
6 | -database $ROSETTA3/main/database \
7 | -in:file:s \
8 | -in:file:native \
9 | -parser:protocol ./rosetta_metrics_ab_only.xml \
10 | -beta \
11 | -include_sugars \
12 | -alternate_3_letter_codes pdb_sugar \
13 | -load_PDB_components false \
14 | -auto_detect_glycan_connections \
15 | -write_glycan_pdb_codes \
16 | -output_alternate_atomids \
17 | -write_pdb_link_records
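18 |
19 | # NOTE: the ROSETTA3 path and the -in:file:s / -in:file:native values are left blank in this
20 | # template; they are presumably filled in by the pipeline before the script is run.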
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_ab_only.xml:
--------------------------------------------------------------------------------
1 | <!-- RosettaScripts protocol for the antibody-only Rosetta metrics run (XML markup not preserved in this listing) -->
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_complex.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | ROSETTA3=
4 |
5 | $ROSETTA3/main/source/bin/rosetta_scripts.static.linuxgccrelease \
6 | -database $ROSETTA3/main/database \
7 | -in:file:s \
8 | -in:file:native \
9 | -parser:protocol ./rosetta_metrics_complex.xml \
10 | -beta \
11 | -include_sugars \
12 | -alternate_3_letter_codes pdb_sugar \
13 | -load_PDB_components false \
14 | -auto_detect_glycan_connections \
15 | -write_glycan_pdb_codes \
16 | -output_alternate_atomids \
17 | -write_pdb_link_records
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/rosetta_templates/rosetta_metrics_complex.xml:
--------------------------------------------------------------------------------
1 | <!-- RosettaScripts protocol for the antibody-antigen complex Rosetta metrics run (XML markup not preserved in this listing) -->
--------------------------------------------------------------------------------
/src/ab_characterisation/utils/rosetta_utils.py:
--------------------------------------------------------------------------------
1 | import pandas as pd
2 |
3 |
4 | def aggregate_rosetta_metrics(
5 |     metric_df: pd.DataFrame, metrics: tuple[str, ...] = ("dG_separated",)
6 | ) -> pd.DataFrame:
7 | """
8 | Args:
9 |         metric_df: Rosetta score table with one row per replicate/decoy.
10 |         metrics: score terms used to select the best-scoring rows (lower is better).
11 |
12 |     Returns:
13 |         A single-row DataFrame with each numeric metric averaged over the selected rows.
14 | """
15 | metric_df = metric_df.select_dtypes("number")
16 | if len(metrics) == 1:
17 | idx = list(metric_df.sort_values(by=metrics[0], ascending=True)[:3].index)
18 |
19 | else:
20 | idx = []
21 | for metric in metrics:
22 | idx += list(metric_df.sort_values(by=metric, ascending=True)[:2].index)
23 | idx = set(idx)
24 | metric_df = metric_df.loc[list(idx)].mean().to_frame().T
25 | return metric_df
26 |
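27 | # Illustrative usage (assumes `score_df` is a per-decoy numeric score table):
28 | #     aggregate_rosetta_metrics(score_df)                                            # mean of the 3 best decoys by dG_separated
29 | #     aggregate_rosetta_metrics(score_df, metrics=("dG_separated", "total_score"))   # union of the 2 best per metric, then mean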
--------------------------------------------------------------------------------
/tests/data/test_pipeline.csv:
--------------------------------------------------------------------------------
1 | sequence_name,heavy_sequence,light_sequence,reference_complex
2 | test1,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb
3 | test2,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb
4 | test3,EVQLVQSGAEVKKPGESLKISCKGSGYSFTSYWIGWVRQMPGKGLEWMGIIYPGDSDTRYSPSFQGQVTISADKSISTAYLQWSSLKASDTAMYYCARLGGRYYYDSSGYYYFDYWGQGTLVTVSS,NFMLTQPHSVSESPGKTVTISCTRSSGSIASNYVQWYQQRPGSSPTTVIYEDNQRPSGVPDRFSGSIDSSSNSASLTISGLKTEDEADYYCQSYDSSSWVFGGGTKLTVL,tests/data/test_complex_reference.pdb
5 | test4,ELKLVETGGDLVKPGGSLTLSCEASGFTLRTYGMSWVRQTPQMRLEWVASISYGGLLYFSDSVKGRFTISRDIVRNILTLQMSRLRSEDTAIYYCARGTSFVRYFDVWGAGTTVTVSS,EVLLTQTPLSLPVSLGDQASISCRSSQTIVHTNGNTYFEWYLQKPGQSPHLLIYKVSNRLSGVPDRFSGSGSGTDFTLKISRVEAEDLGLYYCFQGSHSPWTFGGGTKLELK,tests/data/test_complex_reference.pdb
--------------------------------------------------------------------------------
/tests/integration/chimera_test_script.py:
--------------------------------------------------------------------------------
1 | from chimerax.core.commands import run
2 | run(session, "exit")  # 'session' is provided by ChimeraX when it executes this script
--------------------------------------------------------------------------------
/tests/integration/test_pipeline.py:
--------------------------------------------------------------------------------
1 | import os
2 | from pathlib import Path
3 | from mpi4py import MPI
4 |
5 | from ab_characterisation.pipeline_orchestration import pipeline, RunConfig
6 |
7 |
8 | def test_pipeline():
9 | comm = MPI.COMM_WORLD
10 | rank = comm.Get_rank()
11 | size = comm.Get_size()
12 | input_file = Path(__file__).parent.parent / "data" / "test_pipeline.csv"
13 | output_dir = Path(__file__).parent.parent / "data" / "ab_characterisation_output"
14 | rosetta_base_directory = os.environ.get('ROSETTA_BASE')
15 | config = RunConfig(
16 | chimera_map_resolution=6,
17 | input_file=str(input_file),
18 | output_directory=output_dir,
19 | rosetta_base_directory=rosetta_base_directory,
20 | )
21 | pipeline(config, mpi_rank=rank, mpi_size=size)
22 |
23 |
24 | if __name__ == '__main__':
25 | test_pipeline()
26 |
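27 | # Illustrative invocation (assumes the package and its external dependencies (ChimeraX, Rosetta, MPI)
28 | # are installed and ROSETTA_BASE is set):
29 | #     mpirun -n 4 python tests/integration/test_pipeline.py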
--------------------------------------------------------------------------------