├── .github └── workflows │ └── python-publish.yml ├── .gitignore ├── LICENSE ├── README.md ├── antiberty_pytorch ├── __init__.py ├── antiberty_pytorch.py ├── data.py └── train.py ├── data ├── download.smk └── manifest_230324.csv ├── img ├── antiberty_num_params.png ├── banner.png └── training.png ├── note └── oas_data_example.ipynb ├── setup.py └── tokenizer └── ProteinTokenizer ├── added_tokens.json ├── special_tokens_map.json ├── tokenizer_config.json └── vocab.txt /.github/workflows/python-publish.yml: -------------------------------------------------------------------------------- 1 | # This workflow will upload a Python Package using Twine when a release is created 2 | # For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries 3 | 4 | # This workflow uses actions that are not certified by GitHub. 5 | # They are provided by a third-party and are governed by 6 | # separate terms of service, privacy policy, and support 7 | # documentation. 8 | 9 | name: Upload Python Package 10 | 11 | on: 12 | release: 13 | types: [published] 14 | 15 | jobs: 16 | deploy: 17 | 18 | runs-on: ubuntu-latest 19 | 20 | steps: 21 | - uses: actions/checkout@v2 22 | - name: Set up Python 23 | uses: actions/setup-python@v2 24 | with: 25 | python-version: '3.x' 26 | - name: Install dependencies 27 | run: | 28 | python -m pip install --upgrade pip 29 | pip install build 30 | - name: Build package 31 | run: python -m build 32 | - name: Publish package 33 | uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 34 | with: 35 | user: __token__ 36 | password: ${{ secrets.PYPI_API_TOKEN }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | data/sequences 2 | data/*.sh 3 | 4 | **/checkpoints 5 | lightning_logs/ 6 | note/.ipynb_checkpoints/ 7 | *.egg-info/ 8 | **/__pycache__/ 9 | wandb/ 10 | **/.snakemake 11 | tuning/results 12 | **/*.pt 13 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 이도훈 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # antiberty-pytorch 2 | [Lightning](https://github.com/Lightning-AI/lightning) 3 | 4 | 5 | ![banner](img/banner.png) 6 | 7 | ## Installation 8 | ```bash 9 | $ pip install antiberty-pytorch 10 | ``` 11 | 12 | ## Reproduction status 13 | 14 | ### Number of parameters 15 | 16 | ![antiberty_num_params](img/antiberty_num_params.png) 17 | 18 | This implementation of AntiBERTy has 25,759,769 parameters in total, which matches the approximately 26M parameters reported in the paper (see above). 19 | 20 | ### Training with 1% of the entire OAS data 21 | 22 | I reproduced AntiBERTy training with a tiny subset (~1%) of the entire OAS data (`batch_size=16`, `mask_prob=0.15`) and observed a reasonable decrease in training loss, although validation loss was not tracked in this run. 23 | The training log can be found [here](https://api.wandb.ai/links/dohlee/qqzxgo1v). 24 | 25 | ![training](img/training.png) 26 | 27 | ## Observed Antibody Space (OAS) dataset preparation pipeline 28 | 29 | I wrote a `snakemake` pipeline in the `data` directory to automate the dataset preparation process. Given a metadata manifest from [OAS](https://opig.stats.ox.ac.uk/webapps/oas/oas), it downloads the corresponding sequence files and extracts plain lists of sequences. The pipeline can be run as follows: 30 | 31 | ```bash 32 | $ cd data 33 | $ snakemake -s download.smk -j1 34 | ``` 35 | 36 | *NOTE: Only 3% of the entire OAS sequences (83M sequences, 31GB) have been downloaded so far due to storage and computational cost.* 37 | 38 | ## Citation 39 | ```bibtex 40 | @article{ruffolo2021deciphering, 41 | title = {Deciphering antibody affinity maturation with language models and weakly supervised learning}, 42 | author = {Ruffolo, Jeffrey A and Gray, Jeffrey J and Sulam, Jeremias}, 43 | journal = {arXiv}, 44 | year = {2021} 45 | } 46 | ``` 47 | -------------------------------------------------------------------------------- /antiberty_pytorch/__init__.py: -------------------------------------------------------------------------------- 1 | from antiberty_pytorch.antiberty_pytorch import AntiBERTy 2 | from antiberty_pytorch.data import OASDataset -------------------------------------------------------------------------------- /antiberty_pytorch/antiberty_pytorch.py: -------------------------------------------------------------------------------- 1 | import torch.optim as optim 2 | 3 | import pytorch_lightning as pl 4 | import transformers 5 | 6 | 7 | class AntiBERTy(pl.LightningModule): 8 | def __init__(self): 9 | super().__init__() 10 | config = transformers.BertConfig( 11 | vocab_size=25, 12 | hidden_size=512, 13 | num_hidden_layers=8, 14 | num_attention_heads=8, 15 | intermediate_size=2048, 16 | max_position_embeddings=512, 17 | ) 18 | self.bert = transformers.BertForMaskedLM(config) 19 | 20 | def forward(self, input_ids, labels=None): 21 | return self.bert(input_ids, labels=labels) 22 | 23 | def training_step(self, batch, batch_idx): 24 | input_ids, labels = batch["input_ids"], batch["labels"] 25 | out = self(input_ids=input_ids, labels=labels) 26 | 27 | self.log_dict({"loss": out.loss}, prog_bar=True, on_step=True, on_epoch=True) 28 | return out.loss 29 | 30 | def validation_step(self, batch, batch_idx): 31 | input_ids, labels = batch["input_ids"], batch["labels"] 32 | out = self(input_ids=input_ids, labels=labels) 33 | 34 | self.log_dict({"val/loss": out.loss}, prog_bar=True, on_step=True, on_epoch=True) 35 | return out.loss 36 | 37 | def configure_optimizers(self): 38 | return optim.AdamW(self.parameters(), lr=1e-5) 39 | 40 | 41 | if 
__name__ == "__main__": 42 | from transformers import DataCollatorForLanguageModeling 43 | from transformers import BertTokenizer 44 | from torch.utils.data import DataLoader 45 | 46 | from .data import OASDataset 47 | 48 | tokenizer = BertTokenizer.from_pretrained("tokenizer/ProteinTokenizer") 49 | collator = DataCollatorForLanguageModeling( 50 | tokenizer=tokenizer, 51 | mlm=True, 52 | mlm_probability=0.5, 53 | ) 54 | data = ["ACGACGACGACGAGC", "CGGCGAGCGAAG", "CGACGACGACAGCGACGACGAGCAGCAG"] 55 | 56 | ds = OASDataset(data, tokenizer, max_len=512) 57 | loader = DataLoader(ds, batch_size=2, collate_fn=collator) 58 | 59 | model = AntiBERTy() 60 | 61 | for batch in loader: 62 | print(batch["input_ids"]) 63 | print(batch["labels"]) 64 | 65 | out = model(batch["input_ids"], labels=batch["labels"]) 66 | print(out.loss) 67 | 68 | print("# Parameters:", sum(p.numel() for p in model.parameters())) 69 | -------------------------------------------------------------------------------- /antiberty_pytorch/data.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | from torch.utils.data import Dataset 5 | 6 | class OASDataset(Dataset): 7 | def __init__(self, data, tokenizer, max_len): 8 | self.data = data 9 | self.tokenizer = tokenizer 10 | self.max_len = max_len 11 | 12 | def __len__(self): 13 | return len(self.data) 14 | 15 | def __getitem__(self, index): 16 | text = self.data[index] 17 | encoding = self.tokenizer(text, truncation=True, max_length=self.max_len) 18 | return encoding['input_ids'] 19 | 20 | if __name__ == '__main__': 21 | from transformers import DataCollatorForLanguageModeling 22 | from transformers import BertTokenizer 23 | from torch.utils.data import DataLoader 24 | 25 | tokenizer = BertTokenizer.from_pretrained('tokenizer/ProteinTokenizer') 26 | collator = DataCollatorForLanguageModeling( 27 | tokenizer=tokenizer, 28 | mlm=True, 29 | mlm_probability=0.5, 30 | ) 31 | data = ['ACGACGACGACGAGC', 'CGGCGAGCGAAG', 'CGACGACGACAGCGACGACGAGCAGCAG'] 32 | 33 | ds = OASDataset(data, tokenizer, max_len=512) 34 | loader = DataLoader(ds, batch_size=2, collate_fn=collator) 35 | 36 | for batch in loader: 37 | print(batch['input_ids']) 38 | print(batch['labels']) 39 | 40 | print(batch['input_ids'].shape) 41 | print(batch['labels'].shape) 42 | break -------------------------------------------------------------------------------- /antiberty_pytorch/train.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import argparse 3 | import os 4 | import pytorch_lightning as pl 5 | 6 | from torch.utils.data import DataLoader 7 | from antiberty_pytorch import AntiBERTy, OASDataset 8 | from transformers import ( 9 | DataCollatorForLanguageModeling, 10 | BertTokenizer, 11 | ) 12 | 13 | from pytorch_lightning.loggers import WandbLogger 14 | 15 | 16 | def parse_argument(): 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("-i", "--input", required=True) 19 | parser.add_argument("-o", "--output", required=True) 20 | parser.add_argument("-a", "--accelerator", default="gpu") 21 | parser.add_argument("-b", "--batch-size", type=int, default=16) 22 | parser.add_argument("-p", "--mask-prob", type=float, default=0.15) 23 | return parser.parse_args() 24 | 25 | 26 | def main(): 27 | torch.set_float32_matmul_precision("high") # Trade-off precision for speed. 
28 | 29 | args = parse_argument() 30 | wandb_logger = WandbLogger(project="antiberty-pytorch", entity="dohlee") 31 | 32 | tokenizer = BertTokenizer.from_pretrained("tokenizer/ProteinTokenizer") 33 | collator = DataCollatorForLanguageModeling( 34 | tokenizer=tokenizer, 35 | mlm=True, 36 | mlm_probability=args.mask_prob, 37 | ) 38 | 39 | seqs = [] 40 | for fp in os.listdir(args.input): 41 | with open(os.path.join(args.input, fp)) as f: 42 | seqs += f.read().splitlines() 43 | 44 | train_seqs, val_seqs = seqs[: int(len(seqs) * 0.99)], seqs[int(len(seqs) * 0.99) :] 45 | train_ds = OASDataset(train_seqs, tokenizer, max_len=512) 46 | val_ds = OASDataset(val_seqs, tokenizer, max_len=512) 47 | 48 | train_loader = DataLoader( 49 | train_ds, 50 | batch_size=args.batch_size, 51 | collate_fn=collator, 52 | num_workers=4, 53 | shuffle=True, 54 | ) 55 | val_loader = DataLoader( 56 | val_ds, 57 | batch_size=args.batch_size, 58 | collate_fn=collator, 59 | num_workers=4, 60 | shuffle=False, 61 | ) 62 | 63 | model = AntiBERTy() 64 | trainer = pl.Trainer( 65 | logger=wandb_logger, 66 | accelerator=args.accelerator, 67 | devices=1, 68 | max_epochs=-1, 69 | ) 70 | trainer.fit(model, train_loader, val_loader) 71 | 72 | 73 | if __name__ == "__main__": 74 | main() 75 | -------------------------------------------------------------------------------- /data/download.smk: -------------------------------------------------------------------------------- 1 | import pandas as pd 2 | 3 | manifest = pd.read_csv('manifest_230324.csv') 4 | 5 | # Randomly sample 3% of the dataset. 6 | manifest = manifest.sample(frac=0.03, random_state=42) 7 | 8 | f2type = {r.filename:r.seq_type for r in manifest.to_records()} 9 | f2study = {r.filename:r.study for r in manifest.to_records()} 10 | 11 | filenames = manifest.filename.values 12 | ALL = expand('sequences/{filename}.list', filename=filenames) 13 | 14 | rule all: 15 | input: ALL 16 | 17 | rule download: 18 | output: 19 | 'sequences/{filename}.list' 20 | params: 21 | type = lambda wc: f2type[wc.filename], 22 | study = lambda wc: f2study[wc.filename], 23 | shell: 24 | 'wget -qO- ' 25 | 'http://opig.stats.ox.ac.uk/webapps/ngsdb/{params.type}/{params.study}/csv/{wildcards.filename}.csv.gz | ' 26 | 'gunzip -c | ' 27 | 'tail -n+3 | ' 28 | 'cut -d, -f1 > {output}' 29 | -------------------------------------------------------------------------------- /img/antiberty_num_params.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dohlee/antiberty-pytorch/a6bc5f84e97454068aed13b2ac4be40144cc9e2a/img/antiberty_num_params.png -------------------------------------------------------------------------------- /img/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dohlee/antiberty-pytorch/a6bc5f84e97454068aed13b2ac4be40144cc9e2a/img/banner.png -------------------------------------------------------------------------------- /img/training.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dohlee/antiberty-pytorch/a6bc5f84e97454068aed13b2ac4be40144cc9e2a/img/training.png -------------------------------------------------------------------------------- /note/oas_data_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | 
"import pandas as pd" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 4, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "meta = pd.read_csv('/data/project/dohoon/antiberty-pytorch/ERR2843421_Heavy_IGHA.csv', skiprows=1)" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 9, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "data": { 28 | "text/plain": [ 29 | "25363" 30 | ] 31 | }, 32 | "execution_count": 9, 33 | "metadata": {}, 34 | "output_type": "execute_result" 35 | } 36 | ], 37 | "source": [ 38 | "len(meta)" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 14, 44 | "metadata": {}, 45 | "outputs": [], 46 | "source": [ 47 | "with open('../ERR2843421.list', 'w') as outFile:\n", 48 | " print('\\n'.join(meta.sequence.values), file=outFile)" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 12, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "data": { 58 | "text/html": [ 59 | "
\n", 77 | " | sequence | \n", 78 | "locus | \n", 79 | "stop_codon | \n", 80 | "vj_in_frame | \n", 81 | "v_frameshift | \n", 82 | "productive | \n", 83 | "rev_comp | \n", 84 | "complete_vdj | \n", 85 | "v_call | \n", 86 | "d_call | \n", 87 | "... | \n", 88 | "cdr3_start | \n", 89 | "cdr3_end | \n", 90 | "np1 | \n", 91 | "np1_length | \n", 92 | "np2 | \n", 93 | "np2_length | \n", 94 | "c_region | \n", 95 | "Redundancy | \n", 96 | "ANARCI_numbering | \n", 97 | "ANARCI_status | \n", 98 | "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", 103 | "AGCTCTGGGAGAGGAGCCCCAGCCCTGAAATTCCCAAGTGTTTCCA... | \n", 104 | "H | \n", 105 | "F | \n", 106 | "T | \n", 107 | "F | \n", 108 | "T | \n", 109 | "T | \n", 110 | "T | \n", 111 | "IGHV3-43*02 | \n", 112 | "IGHD3-16*01 | \n", 113 | "... | \n", 114 | "423.0 | \n", 115 | "452.0 | \n", 116 | "T | \n", 117 | "1.0 | \n", 118 | "TTAA | \n", 119 | "4.0 | \n", 120 | "CATCCCCGACCAGCCCCAAGGTCTTCCCG | \n", 121 | "4 | \n", 122 | "{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ... | \n", 123 | "|Deletions: 10, 73|||| | \n", 124 | "
1 | \n", 127 | "AGCTCTGGGAGAGGAGCCCCAGCCCTGAGATTCCCAGGTGTTTCCA... | \n", 128 | "H | \n", 129 | "F | \n", 130 | "T | \n", 131 | "F | \n", 132 | "T | \n", 133 | "T | \n", 134 | "F | \n", 135 | "IGHV3-9*01 | \n", 136 | "IGHD5/OR15-5a*01 | \n", 137 | "... | \n", 138 | "426.0 | \n", 139 | "470.0 | \n", 140 | "CCAGAGGGA | \n", 141 | "9.0 | \n", 142 | "CTGGG | \n", 143 | "5.0 | \n", 144 | "TGCATCCCCGACCAGCCCCAAGGTCTTCCCG | \n", 145 | "1 | \n", 146 | "{'fwh1': {'1 ': 'E', '2 ': 'V', '3 ': 'Q', '4 ... | \n", 147 | "Unusual residue: X|Deletions: 10, 73|||| | \n", 148 | "
2 | \n", 151 | "GGCTTTCTGAGAGTCATGGATCTCATGTGCAAGAAAATGAAGCACC... | \n", 152 | "H | \n", 153 | "F | \n", 154 | "T | \n", 155 | "F | \n", 156 | "T | \n", 157 | "T | \n", 158 | "T | \n", 159 | "IGHV4-39*01 | \n", 160 | "NaN | \n", 161 | "... | \n", 162 | "382.0 | \n", 163 | "399.0 | \n", 164 | "GGCCCCG | \n", 165 | "7.0 | \n", 166 | "NaN | \n", 167 | "NaN | \n", 168 | "CATCCCCGACCAGCCCCAAGGTCTTCCCG | \n", 169 | "2 | \n", 170 | "{'fwh1': {'1 ': 'Q', '2 ': 'L', '3 ': 'Q', '4 ... | \n", 171 | "|Deletions: 10, 55, 73|||| | \n", 172 | "
3 rows × 97 columns
\n", 176 | "\n", 280 | " | seq_type | \n", 281 | "study | \n", 282 | "filename | \n", 283 | "
---|---|---|---|
0 | \n", 288 | "unpaired | \n", 289 | "Eliyahu_2018 | \n", 290 | "ERR2843400_Heavy_IGHE | \n", 291 | "
1 | \n", 294 | "unpaired | \n", 295 | "Eliyahu_2018 | \n", 296 | "ERR2843418_Heavy_IGHA | \n", 297 | "
2 | \n", 300 | "unpaired | \n", 301 | "Eliyahu_2018 | \n", 302 | "ERR2843418_Heavy_Bulk | \n", 303 | "
\n", 378 | " | seq_type | \n", 379 | "study | \n", 380 | "filename | \n", 381 | "
---|---|---|---|
12081 | \n", 386 | "unpaired | \n", 387 | "Galson_2015a | \n", 388 | "SRR3099401_Heavy_IGHM | \n", 389 | "
291 | \n", 392 | "unpaired | \n", 393 | "Schultheiss_2020 | \n", 394 | "ERR4337035_Heavy_Bulk | \n", 395 | "
10814 | \n", 398 | "unpaired | \n", 399 | "Briney_2019 | \n", 400 | "SRR8283768_Heavy_IGHD | \n", 401 | "
3647 | \n", 404 | "unpaired | \n", 405 | "Soto_2019 | \n", 406 | "SRR8365361_1_Heavy_IGHM | \n", 407 | "
1372 | \n", 410 | "unpaired | \n", 411 | "Galson_2015 | \n", 412 | "SRR3990897_Heavy_Bulk | \n", 413 | "
... | \n", 416 | "... | \n", 417 | "... | \n", 418 | "... | \n", 419 | "
4936 | \n", 422 | "unpaired | \n", 423 | "Kim_2020 | \n", 424 | "SRR12326744_1_Heavy_IGHA | \n", 425 | "
6817 | \n", 428 | "unpaired | \n", 429 | "Ellebedy_2016 | \n", 430 | "SRR3620118_Heavy_IGHM | \n", 431 | "
15245 | \n", 434 | "unpaired | \n", 435 | "Waltari_2018 | \n", 436 | "SRR5811779_1_Heavy_IGHE | \n", 437 | "
8768 | \n", 440 | "unpaired | \n", 441 | "Chen_2020 | \n", 442 | "SRR11937625_1_Light_Bulk | \n", 443 | "
1624 | \n", 446 | "unpaired | \n", 447 | "Kuri-Cervantes_2020 | \n", 448 | "SRR12081538_Heavy_Bulk | \n", 449 | "
1579 rows × 3 columns
\n", 453 | "