├── riot_na ├── api │ ├── __init__.py │ ├── utils.py │ └── api_mp.py ├── airr │ └── __init__.py ├── common │ ├── __init__.py │ ├── assert_never.py │ ├── serialization_utils.py │ ├── gene_match_utils.py │ ├── debug_utils.py │ ├── io.py │ ├── airr_csv_writer.py │ ├── multi_species_prefiltering.py │ └── multi_species_segment_prefiltering.py ├── data │ ├── __init__.py │ ├── constants.py │ ├── scheme_mapping_facade.py │ ├── scheme_regions.py │ └── scheme_definitions.py ├── schemes │ ├── __init__.py │ ├── collapse_alignment.py │ ├── region_offsets.py │ └── smooth_alignment.py ├── databases │ ├── gene_db │ │ ├── aa_genes │ │ │ ├── c_genes │ │ │ │ ├── human │ │ │ │ │ ├── igk.fasta │ │ │ │ │ ├── igl.fasta │ │ │ │ │ └── igh.fasta │ │ │ │ └── alpaca │ │ │ │ │ └── igh.fasta │ │ │ └── j_genes │ │ │ │ ├── mouse │ │ │ │ ├── igh.fasta │ │ │ │ ├── igl.fasta │ │ │ │ └── igk.fasta │ │ │ │ ├── human │ │ │ │ ├── igk.fasta │ │ │ │ ├── igh.fasta │ │ │ │ └── igl.fasta │ │ │ │ └── alpaca │ │ │ │ └── igh.fasta │ │ ├── aa_genes_deduplicated │ │ │ ├── c_genes │ │ │ │ ├── human │ │ │ │ │ ├── igk.fasta │ │ │ │ │ ├── igl.fasta │ │ │ │ │ └── igh.fasta │ │ │ │ └── alpaca │ │ │ │ │ └── igh.fasta │ │ │ └── j_genes │ │ │ │ ├── mouse │ │ │ │ ├── igl.fasta │ │ │ │ ├── igh.fasta │ │ │ │ └── igk.fasta │ │ │ │ ├── human │ │ │ │ ├── igk.fasta │ │ │ │ ├── igh.fasta │ │ │ │ └── igl.fasta │ │ │ │ └── alpaca │ │ │ │ └── igh.fasta │ │ ├── aa_genes_first_allele │ │ │ └── j_genes │ │ │ │ ├── human │ │ │ │ ├── igk.fasta │ │ │ │ ├── igh.fasta │ │ │ │ └── igl.fasta │ │ │ │ └── mouse │ │ │ │ ├── igl.fasta │ │ │ │ ├── igh.fasta │ │ │ │ └── igk.fasta │ │ ├── c_genes │ │ │ ├── human │ │ │ │ ├── igk.fasta │ │ │ │ └── igl.fasta │ │ │ └── alpaca │ │ │ │ └── igh.fasta │ │ ├── d_genes │ │ │ ├── alpaca │ │ │ │ └── igh.fasta │ │ │ ├── mouse │ │ │ │ └── igh.fasta │ │ │ └── human │ │ │ │ └── igh.fasta │ │ └── j_genes │ │ │ ├── human │ │ │ ├── igk.fasta │ │ │ ├── igh.fasta │ │ │ └── igl.fasta │ │ │ ├── mouse │ │ │ ├── igh.fasta │ 
│ │ ├── igl.fasta │ │ │ └── igk.fasta │ │ │ └── alpaca │ │ │ └── igh.fasta │ └── scheme_mappings │ │ └── alpaca │ │ ├── kabat │ │ └── scheme_mapping.csv │ │ ├── martin │ │ └── scheme_mapping.csv │ │ └── chothia │ │ └── scheme_mapping.csv ├── config.py ├── __init__.py ├── riot_na.pyi ├── alignment │ ├── alignment_metrics.py │ ├── alignment_utils.py │ ├── skbio_alignment.py │ └── nt_gene_alignments.py └── cli.py ├── .flake8 ├── .mypy.ini ├── riot_prefiltering ├── lib.rs └── model.rs ├── Dockerfile ├── tests ├── test_skbio_alignment.py ├── test_utils.py ├── test_collapse_alignment.py ├── test_smooth_alignment │ ├── test_smooth_cdr_junctions_martin.py │ └── test_smooth_cdr_junctions_chothia.py ├── test_e2e.py └── test_aa_alignments.py ├── notebooks ├── final_benchmark │ ├── utils.py │ ├── time_measurments.ipynb │ ├── nt_gene_assignment_ngs.ipynb │ └── aa_gene_assignment_therapeutics.ipynb ├── data_processing │ ├── dict_merge.py │ ├── therapeutics_preprocess_and_filter_species.py │ ├── ngs_sample_filter_species.ipynb │ └── deduplicate_aa_genes.ipynb ├── utils.py └── prefiltering_grid_search │ └── utils.py ├── .gitignore ├── Cargo.toml ├── .pre-commit-config.yaml ├── .pylintrc └── pyproject.toml /riot_na/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /riot_na/airr/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /riot_na/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /riot_na/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/riot_na/schemes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /riot_na/data/constants.py: -------------------------------------------------------------------------------- 1 | AMINO_ACIDS = set(list("QWERTYIPASDFGHKLCVNM")) 2 | -------------------------------------------------------------------------------- /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 3 | ignore = F841,E722,W605,E265,W503,E203,E501 4 | per-file-ignores = __init__.py:F401 5 | -------------------------------------------------------------------------------- /riot_na/common/assert_never.py: -------------------------------------------------------------------------------- 1 | from typing import NoReturn 2 | 3 | 4 | def assert_never(value: NoReturn) -> NoReturn: 5 | raise ValueError(f"Value {value} not handled") 6 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/c_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKC IGK HOMO_SAPIENS 2 | RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC 3 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/c_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKC IGK HOMO_SAPIENS 2 | RTVAAPSVFIFPPSDEQLKSGTASVVCLLNNFYPREAKVQWKVDNALQSGNSQESVTEQDSKDSTYSLSSTLTLSKADYEKHKVYACEVTHQGLSSPVTKSFNRGEC 3 | -------------------------------------------------------------------------------- /riot_na/common/serialization_utils.py: -------------------------------------------------------------------------------- 1 | import base64 2 | 3 | 4 | def 
base64_encode(val: str) -> str: 5 | return base64.b64encode(val.encode("utf-8")).decode("utf-8") 6 | 7 | 8 | def base64_decode(val: str) -> str: 9 | return base64.b64decode(val.encode("utf-8")).decode("utf-8") 10 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ1*01 IGK HOMO_SAPIENS 2 | WTFGQGTKVEIK 3 | >IGKJ2*01 IGK HOMO_SAPIENS 4 | YTFGQGTKLEIK 5 | >IGKJ3*01 IGK HOMO_SAPIENS 6 | FTFGPGTKVDIK 7 | >IGKJ4*01 IGK HOMO_SAPIENS 8 | LTFGGGTKVEIK 9 | >IGKJ5*01 IGK HOMO_SAPIENS 10 | ITFGQGTRLEIK 11 | -------------------------------------------------------------------------------- /.mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | explicit_package_bases = True 3 | 4 | [mypy-Bio.*] 5 | ignore_missing_imports = True 6 | 7 | [mypy-skbio.*] 8 | ignore_missing_imports = True 9 | 10 | [mypy-blosum.*] 11 | ignore_missing_imports = True 12 | 13 | [mypy-pandas.*] 14 | ignore_missing_imports = True 15 | 16 | [mypy-cachetools.*] 17 | ignore_missing_imports = True -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/mouse/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ-2ERU IGL MUS_MUSCULUS 2 | YVFGSGTKVTVL 3 | >IGLJ-ACBW IGL MUS_MUSCULUS 4 | YVFGGGTKVTVL 5 | >IGLJ-FMEA IGL MUS_MUSCULUS 6 | WVFGGGTKLTVL 7 | >IGLJ-KL6B IGL MUS_MUSCULUS 8 | FIFGSGTKVTVL 9 | >IGLJ-PG3G IGL MUS_MUSCULUS 10 | WVFGGGTRLTVLD 11 | >IGLJ-UWMQ IGL MUS_MUSCULUS 12 | RFFFLKWPIVCR 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/mouse/igl.fasta: -------------------------------------------------------------------------------- 1 | 
>IGLJ-2ERU IGL MUS_MUSCULUS 2 | YVFGSGTKVTVL 3 | >IGLJ-ACBW IGL MUS_MUSCULUS 4 | YVFGGGTKVTVL 5 | >IGLJ-FMEA IGL MUS_MUSCULUS 6 | WVFGGGTKLTVL 7 | >IGLJ-KL6B IGL MUS_MUSCULUS 8 | FIFGSGTKVTVL 9 | >IGLJ-PG3G IGL MUS_MUSCULUS 10 | WVFGGGTRLTVLD 11 | >IGLJ-UWMQ IGL MUS_MUSCULUS 12 | RFFFLKWPIVCR 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/mouse/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ-O6GC IGH MUS_MUSCULUS 2 | YFDYWGQGTTITVSS 3 | >IGHJ-2FQV IGH MUS_MUSCULUS 4 | YFDYWGQGTTLTVSS 5 | >IGHJ-EGQA IGH MUS_MUSCULUS 6 | YYAMDYWGQGTSVTVSS 7 | >IGHJ-KZ3G IGH MUS_MUSCULUS 8 | WFAYWGQGTLVTVSA 9 | >IGHJ-32C2 IGH MUS_MUSCULUS 10 | YWYFDVWGTGTTVTVSS 11 | >IGHJ-G76U IGH MUS_MUSCULUS 12 | YWYFDVWGAGTTVTVSS 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/mouse/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ-2FQV IGH MUS_MUSCULUS 2 | YFDYWGQGTTLTVSS 3 | >IGHJ-32C2 IGH MUS_MUSCULUS 4 | YWYFDVWGTGTTVTVSS 5 | >IGHJ-EGQA IGH MUS_MUSCULUS 6 | YYAMDYWGQGTSVTVSS 7 | >IGHJ-G76U IGH MUS_MUSCULUS 8 | YWYFDVWGAGTTVTVSS 9 | >IGHJ-KZ3G IGH MUS_MUSCULUS 10 | WFAYWGQGTLVTVSA 11 | >IGHJ-O6GC IGH MUS_MUSCULUS 12 | YFDYWGQGTTITVSS 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/human/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ1*01 IGH HOMO_SAPIENS 2 | AEYFQHWGQGTLVTVSS 3 | >IGHJ2*01 IGH HOMO_SAPIENS 4 | YWYFDLWGRGTLVTVSS 5 | >IGHJ3*02 IGH HOMO_SAPIENS 6 | DAFDIWGQGTMVTVSS 7 | >IGHJ4*02 IGH HOMO_SAPIENS 8 | YFDYWGQGTLVTVSS 9 | >IGHJ5*02 IGH HOMO_SAPIENS 10 | NWFDPWGQGTLVTVSS 11 | >IGHJ6*02 IGH HOMO_SAPIENS 12 | YYYYYGMDVWGQGTTVTVSS 13 | 
-------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/mouse/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ-2FQV IGH MUS_MUSCULUS 2 | YFDYWGQGTTLTVSS 3 | >IGHJ-32C2 IGH MUS_MUSCULUS 4 | YWYFDVWGTGTTVTVSS 5 | >IGHJ-EGQA IGH MUS_MUSCULUS 6 | YYAMDYWGQGTSVTVSS 7 | >IGHJ-G76U IGH MUS_MUSCULUS 8 | YWYFDVWGAGTTVTVSS 9 | >IGHJ-KZ3G IGH MUS_MUSCULUS 10 | WFAYWGQGTLVTVSA 11 | >IGHJ-O6GC IGH MUS_MUSCULUS 12 | YFDYWGQGTTITVSS 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ3*01 IGK HOMO_SAPIENS 2 | FTFGPGTKVDIK 3 | >IGKJ5*01 IGK HOMO_SAPIENS 4 | ITFGQGTRLEIK 5 | >IGKJ4*01 IGK HOMO_SAPIENS 6 | LTFGGGTKVEIK 7 | >IGKJ1*01 IGK HOMO_SAPIENS 8 | WTFGQGTKVEIK 9 | >IGKJ2*01 IGK HOMO_SAPIENS 10 | YTFGQGTKLEIK 11 | >IGKJ2*03 IGK HOMO_SAPIENS 12 | YSFGQGTKLEIK 13 | >IGKJ2*04 IGK HOMO_SAPIENS 14 | CSFGQGTKLEIK 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/mouse/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ-UWMQ IGL MUS_MUSCULUS 2 | RFFFLKWPIVCR 3 | >IGLJ-FMEA IGL MUS_MUSCULUS 4 | WVFGGGTKLTVL 5 | >IGLJ-XVAO IGL MUS_MUSCULUS 6 | WVFGGGTKLTVL 7 | >IGLJ-KL6B IGL MUS_MUSCULUS 8 | FIFGSGTKVTVL 9 | >IGLJ-2ERU IGL MUS_MUSCULUS 10 | YVFGSGTKVTVL 11 | >IGLJ-ACBW IGL MUS_MUSCULUS 12 | YVFGGGTKVTVL 13 | >IGLJ-PG3G IGL MUS_MUSCULUS 14 | WVFGGGTRLTVLD 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ1*01 IGK HOMO_SAPIENS 2 | WTFGQGTKVEIK 3 | 
>IGKJ2*01 IGK HOMO_SAPIENS 4 | YTFGQGTKLEIK 5 | >IGKJ2*03 IGK HOMO_SAPIENS 6 | YSFGQGTKLEIK 7 | >IGKJ2*04 IGK HOMO_SAPIENS 8 | CSFGQGTKLEIK 9 | >IGKJ3*01 IGK HOMO_SAPIENS 10 | FTFGPGTKVDIK 11 | >IGKJ4*01 IGK HOMO_SAPIENS 12 | LTFGGGTKVEIK 13 | >IGKJ5*01 IGK HOMO_SAPIENS 14 | ITFGQGTRLEIK 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ1*01 IGL HOMO_SAPIENS 2 | YVFGTGTKVTVL 3 | >IGLJ2*01 IGL HOMO_SAPIENS 4 | VVFGGGTKLTVL 5 | >IGLJ3*02 IGL HOMO_SAPIENS 6 | WVFGGGTKLTVL 7 | >IGLJ4*01 IGL HOMO_SAPIENS 8 | FVFGGGTQLIIL 9 | >IGLJ5*01 IGL HOMO_SAPIENS 10 | WVFGEGTELTVL 11 | >IGLJ6*01 IGL HOMO_SAPIENS 12 | NVFGSGTKVTVL 13 | >IGLJ7*01 IGL HOMO_SAPIENS 14 | AVFGGGTQLTVL 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/c_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKC IGK HOMO_SAPIENS 2 | CGAACTGTGGCTGCACCATCTGTCTTCATCTTCCCGCCATCTGATGAGCAGTTGAAATCTGGAACTGCCTCTGTTGTGTG 3 | CCTGCTGAATAACTTCTATCCCAGAGAGGCCAAAGTACAGTGGAAGGTGGATAACGCCCTCCAATCGGGTAACTCCCAGG 4 | AGAGTGTCACAGAGCAGGACAGCAAGGACAGCACCTACAGCCTCAGCAGCACCCTGACGCTGAGCAAAGCAGACTACGAG 5 | AAACACAAAGTCTACGCCTGCGAAGTCACCCATCAGGGCCTGAGCTCGCCCGTCACAAAGAGCTTCAACAGGGGAGAGTG 6 | TTAG -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/alpaca/igh.fasta: -------------------------------------------------------------------------------- 1 | >ighJ-1 IGH VICUGNA_PACOS 2 | SPIAGAPGHLGTVSS 3 | >ighJ-2 IGH VICUGNA_PACOS 4 | YRYLEVWGQGTLVTVSS 5 | >ighJ-3 IGH VICUGNA_PACOS 6 | NALDAWGQGTLVTVSS 7 | >ighJ-4 IGH VICUGNA_PACOS 8 | EYDYWGQGTQVTVSS 9 | >ighJ-5 IGH VICUGNA_PACOS 10 | PQFEYWGQGTLVTVS 11 | >ighJ-6 IGH VICUGNA_PACOS 12 | DFGSWGQGTQVTVSS 13 
| >ighJ-7 IGH VICUGNA_PACOS 14 | YYGMDYWGKGTLVTVSS 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/alpaca/igh.fasta: -------------------------------------------------------------------------------- 1 | >ighJ-1 IGH VICUGNA_PACOS 2 | SPIAGAPGHLGTVSS 3 | >ighJ-2 IGH VICUGNA_PACOS 4 | YRYLEVWGQGTLVTVSS 5 | >ighJ-3 IGH VICUGNA_PACOS 6 | NALDAWGQGTLVTVSS 7 | >ighJ-4 IGH VICUGNA_PACOS 8 | EYDYWGQGTQVTVSS 9 | >ighJ-5 IGH VICUGNA_PACOS 10 | PQFEYWGQGTLVTVS 11 | >ighJ-6 IGH VICUGNA_PACOS 12 | DFGSWGQGTQVTVSS 13 | >ighJ-7 IGH VICUGNA_PACOS 14 | YYGMDYWGKGTLVTVSS 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/d_genes/alpaca/igh.fasta: -------------------------------------------------------------------------------- 1 | >ighd-1 VICUGNA_PACOS 2 | TATTACGCTTATTGGCTTGGAGATGCTGG 3 | >ighd-2 VICUGNA_PACOS 4 | ACATACTATAGTGGTAGTTACTACTACACC 5 | >ighd-3 VICUGNA_PACOS 6 | GTATTACTACTGCTCAGGCTATGGGTGTTATGAC 7 | >ighd-4 VICUGNA_PACOS 8 | TTACTATAGCGACTATGAC 9 | >ighd-5 VICUGNA_PACOS 10 | AGACTACGGGTTGGGGTAC 11 | >ighd-6 VICUGNA_PACOS 12 | GTACGGTAGTAGCTGGTAC 13 | >ighd-7 VICUGNA_PACOS 14 | CTAACTGGAGC 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/human/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ5*02 IGH HOMO_SAPIENS 2 | NWFDPWGQGTLVTVSS 3 | >IGHJ4*02 IGH HOMO_SAPIENS 4 | YFDYWGQGTLVTVSS 5 | >IGHJ6*02 IGH HOMO_SAPIENS 6 | YYYYYGMDVWGQGTTVTVSS 7 | >IGHJ6*03 IGH HOMO_SAPIENS 8 | YYYYYYMDVWGKGTTVTVSS 9 | >IGHJ2*01 IGH HOMO_SAPIENS 10 | YWYFDLWGRGTLVTVSS 11 | >IGHJ1*01 IGH HOMO_SAPIENS 12 | AEYFQHWGQGTLVTVSS 13 | >IGHJ3*02 IGH HOMO_SAPIENS 14 | DAFDIWGQGTMVTVSS 15 | -------------------------------------------------------------------------------- 
/riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/human/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ1*01 IGH HOMO_SAPIENS 2 | AEYFQHWGQGTLVTVSS 3 | >IGHJ2*01 IGH HOMO_SAPIENS 4 | YWYFDLWGRGTLVTVSS 5 | >IGHJ3*02 IGH HOMO_SAPIENS 6 | DAFDIWGQGTMVTVSS 7 | >IGHJ4*02 IGH HOMO_SAPIENS 8 | YFDYWGQGTLVTVSS 9 | >IGHJ5*02 IGH HOMO_SAPIENS 10 | NWFDPWGQGTLVTVSS 11 | >IGHJ6*02 IGH HOMO_SAPIENS 12 | YYYYYGMDVWGQGTTVTVSS 13 | >IGHJ6*03 IGH HOMO_SAPIENS 14 | YYYYYYMDVWGKGTTVTVSS 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ1*01 IGL HOMO_SAPIENS 2 | YVFGTGTKVTVL 3 | >IGLJ2*01 IGL HOMO_SAPIENS 4 | VVFGGGTKLTVL 5 | >IGLJ3*02 IGL HOMO_SAPIENS 6 | WVFGGGTKLTVL 7 | >IGLJ4*01 IGL HOMO_SAPIENS 8 | FVFGGGTQLIIL 9 | >IGLJ5*01 IGL HOMO_SAPIENS 10 | WVFGEGTELTVL 11 | >IGLJ6*01 IGL HOMO_SAPIENS 12 | NVFGSGTKVTVL 13 | >IGLJ7*01 IGL HOMO_SAPIENS 14 | AVFGGGTQLTVL 15 | >IGLJ7*02 IGL HOMO_SAPIENS 16 | AVFGGGTQLTAL 17 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ5*01 IGL HOMO_SAPIENS 2 | WVFGEGTELTVL 3 | >IGLJ5*02 IGL HOMO_SAPIENS 4 | WVFGEGTELTVL 5 | >IGLJ6*01 IGL HOMO_SAPIENS 6 | NVFGSGTKVTVL 7 | >IGLJ7*02 IGL HOMO_SAPIENS 8 | AVFGGGTQLTAL 9 | >IGLJ7*01 IGL HOMO_SAPIENS 10 | AVFGGGTQLTVL 11 | >IGLJ2*01 IGL HOMO_SAPIENS 12 | VVFGGGTKLTVL 13 | >IGLJ1*01 IGL HOMO_SAPIENS 14 | YVFGTGTKVTVL 15 | >IGLJ3*02 IGL HOMO_SAPIENS 16 | WVFGGGTKLTVL 17 | >IGLJ4*01 IGL HOMO_SAPIENS 18 | FVFGGGTQLIIL 19 | -------------------------------------------------------------------------------- /riot_na/config.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | 5 | from dotenv import load_dotenv 6 | 7 | if bool(json.loads(os.getenv("LOAD_DOTENV", "true"))): 8 | # This is somewhat of a hack: in production the code is run directly from a .zip file, so dotenv is not able 9 | # to find an .env file and will crash. The whole env is being set anyway, so the OSError is ignored. 10 | try: 11 | load_dotenv() 12 | except OSError: 13 | pass 14 | 15 | GENE_DB_DIR = Path(os.getenv("GENE_DB_DIR", Path(__file__).parent / "databases")) 16 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/j_genes/mouse/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ-4JXG IGK MUS_MUSCULUS 2 | WTFGGGTKLEIK 3 | >IGKJ-5NUZ IGK MUS_MUSCULUS 4 | TTFSDGTRLEIK 5 | >IGKJ-ABRI IGK MUS_MUSCULUS 6 | YTFGSGTKLEIK 7 | >IGKJ-AR7L IGK MUS_MUSCULUS 8 | YTFGSGTKLEMK 9 | >IGKJ-IIHF IGK MUS_MUSCULUS 10 | ITFSDGTRLEIK 11 | >IGKJ-LUDL IGK MUS_MUSCULUS 12 | FTFGTGTKLEIK 13 | >IGKJ-MXRV IGK MUS_MUSCULUS 14 | YTFGGGTKLEIK 15 | >IGKJ-O4CL IGK MUS_MUSCULUS 16 | LTFGAGTKLELK 17 | >IGKJ-WRAP IGK MUS_MUSCULUS 18 | PTFGGGTKLEIN 19 | >IGKJ-Z5J4 IGK MUS_MUSCULUS 20 | FTFGSGTKLEIK 21 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_first_allele/j_genes/mouse/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ-4JXG IGK MUS_MUSCULUS 2 | WTFGGGTKLEIK 3 | >IGKJ-5NUZ IGK MUS_MUSCULUS 4 | TTFSDGTRLEIK 5 | >IGKJ-ABRI IGK MUS_MUSCULUS 6 | YTFGSGTKLEIK 7 | >IGKJ-AR7L IGK MUS_MUSCULUS 8 | YTFGSGTKLEMK 9 | >IGKJ-IIHF IGK MUS_MUSCULUS 10 | ITFSDGTRLEIK 11 | >IGKJ-LUDL IGK MUS_MUSCULUS 12 | FTFGTGTKLEIK 13 | >IGKJ-MXRV IGK MUS_MUSCULUS 14 | YTFGGGTKLEIK 15 | >IGKJ-O4CL IGK MUS_MUSCULUS 16 | LTFGAGTKLELK 17 | >IGKJ-WRAP IGK MUS_MUSCULUS 18 | 
PTFGGGTKLEIN 19 | >IGKJ-Z5J4 IGK MUS_MUSCULUS 20 | FTFGSGTKLEIK 21 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/human/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ3*01 IGK 1 HOMO_SAPIENS 2 | ATTCACTTTCGGCCCTGGGACCAAAGTGGATATCAAAC 3 | >IGKJ5*01 IGK 1 HOMO_SAPIENS 4 | GATCACCTTCGGCCAAGGGACACGACTGGAGATTAAAC 5 | >IGKJ4*01 IGK 1 HOMO_SAPIENS 6 | GCTCACTTTCGGCGGAGGGACCAAGGTGGAGATCAAAC 7 | >IGKJ1*01 IGK 1 HOMO_SAPIENS 8 | GTGGACGTTCGGCCAAGGGACCAAGGTGGAAATCAAAC 9 | >IGKJ2*01 IGK 2 HOMO_SAPIENS 10 | TGTACACTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC 11 | >IGKJ2*03 IGK 2 HOMO_SAPIENS 12 | TGTACAGTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC 13 | >IGKJ2*04 IGK 2 HOMO_SAPIENS 14 | TGTGCAGTTTTGGCCAGGGGACCAAGCTGGAGATCAAAC 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/mouse/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ-O6GC IGH 2 MUS_MUSCULUS 2 | ACTACTTTGACTACTGGGGCCAAGGCACCACTATCACAGTCTCCTCAG 3 | >IGHJ-2FQV IGH 2 MUS_MUSCULUS 4 | ACTACTTTGACTACTGGGGCCAAGGCACCACTCTCACAGTCTCCTCAG 5 | >IGHJ-EGQA IGH 2 MUS_MUSCULUS 6 | ATTACTATGCTATGGACTACTGGGGTCAAGGAACCTCAGTCACCGTCTCCTCAG 7 | >IGHJ-KZ3G IGH 2 MUS_MUSCULUS 8 | CCTGGTTTGCTTACTGGGGCCAAGGGACTCTGGTCACTGTCTCTGCAG 9 | >IGHJ-32C2 IGH 1 MUS_MUSCULUS 10 | CTACTGGTACTTCGATGTCTGGGGCACAGGGACCACGGTCACCGTCTCCTCAG 11 | >IGHJ-G76U IGH 1 MUS_MUSCULUS 12 | CTACTGGTACTTCGATGTCTGGGGCGCAGGGACCACGGTCACCGTCTCCTCAG 13 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/mouse/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ-UWMQ IGL 0 MUS_MUSCULUS 2 | AGGTTCTTTTTCCTCAAATGGCCTATTGTATGCAGGAG 3 | >IGLJ-FMEA IGL 1 MUS_MUSCULUS 4 | CTGGGTGTTCGGTGGAGGAACCAAACTGACTGTCCTAG 5 | >IGLJ-XVAO IGL 1 MUS_MUSCULUS 6 | 
CTGGGTGTTCGGTGGAGGAACCAAATTGACTGTCCTAG 7 | >IGLJ-KL6B IGL 1 MUS_MUSCULUS 8 | GTTTATTTTCGGCAGTGGAACCAAGGTCACTGTCCTAG 9 | >IGLJ-2ERU IGL 1 MUS_MUSCULUS 10 | TTATGTTTTCGGCAGTGGAACCAAGGTCACTGTCCTAG 11 | >IGLJ-ACBW IGL 1 MUS_MUSCULUS 12 | TTATGTTTTCGGCGGTGGAACCAAGGTCACTGTCCTAG 13 | >IGLJ-PG3G IGL 1 MUS_MUSCULUS 14 | TTGGGTGTTCGGAGGTGGAACCAGATTGACTGTCCTAGATGA 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/j_genes/mouse/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ-5NUZ IGK MUS_MUSCULUS 2 | TTFSDGTRLEIK 3 | >IGKJ-IIHF IGK MUS_MUSCULUS 4 | ITFSDGTRLEIK 5 | >IGKJ-LUDL IGK MUS_MUSCULUS 6 | FTFGTGTKLEIK 7 | >IGKJ-Z5J4 IGK MUS_MUSCULUS 8 | FTFGSGTKLEIK 9 | >IGKJ-WRAP IGK MUS_MUSCULUS 10 | PTFGGGTKLEIN 11 | >IGKJ-O4CL IGK MUS_MUSCULUS 12 | LTFGAGTKLELK 13 | >IGKJ-4JXG IGK MUS_MUSCULUS 14 | WTFGGGTKLEIK 15 | >IGKJ-MXRV IGK MUS_MUSCULUS 16 | YTFGGGTKLEIK 17 | >IGKJ-ABRI IGK MUS_MUSCULUS 18 | YTFGSGTKLEIK 19 | >IGKJ-HVOR IGK MUS_MUSCULUS 20 | YTFGSGTKLEIK 21 | >IGKJ-AR7L IGK MUS_MUSCULUS 22 | YTFGSGTKLEMK 23 | -------------------------------------------------------------------------------- /riot_prefiltering/lib.rs: -------------------------------------------------------------------------------- 1 | mod model; 2 | mod prefiltering; 3 | 4 | #[cfg(test)] 5 | mod tests; 6 | 7 | use model::{GeneMatch, GeneSegment, PrefilteringResult, PrefilteringSegmentResult, SegmentMatch}; 8 | use prefiltering::Prefiltering; 9 | use pyo3::prelude::*; 10 | 11 | #[pymodule] 12 | fn riot_na(_py: Python, m: &PyModule) -> PyResult<()> { 13 | m.add_class::()?; 14 | m.add_class::()?; 15 | m.add_class::()?; 16 | m.add_class::()?; 17 | m.add_class::()?; 18 | m.add_class::()?; 19 | Ok(()) 20 | } 21 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | # 
EXAMPLE HOW TO INSTALL RIOT IN DOCKER IMAGE 2 | 3 | FROM python:3.11.11-slim 4 | 5 | WORKDIR /app 6 | 7 | # Copy the end-to-end test script into the container at /app 8 | COPY ./tests/test_e2e.py /app 9 | 10 | # GCC needed for scikit-bio installation. Can be removed after they provide wheels: https://github.com/scikit-bio/scikit-bio/issues/588 11 | RUN apt-get update && apt-get install -y gcc 12 | 13 | RUN --mount=type=cache,target=/root/.cache pip install riot_na 14 | 15 | # Define environment variable 16 | ENV BASE_PATH=/app 17 | 18 | # Test if riot is working correctly 19 | CMD ["python", "test_e2e.py"] 20 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/alpaca/igh.fasta: -------------------------------------------------------------------------------- 1 | >ighJ-1 IGH 2 VICUGNA_PACOS 2 | CTTCCCCCATTGCTGGGGCACCTGGGCACCTGGGCACCGTGTCCTCA 3 | >ighJ-2 IGH 2 VICUGNA_PACOS 4 | GCTACAGGTATCTCGAAGTTTGGGGCCAGGGCACCCTGGTCACTGTCTCCTCA 5 | >ighJ-3 IGH 1 VICUGNA_PACOS 6 | CAATGCTTTGGACGCATGGGGCCAGGGGACCCTGGTCACTGTCTCCTCA 7 | >ighJ-4 IGH 2 VICUGNA_PACOS 8 | ATGAGTATGACTACTGGGGCCAGGGGACCCAGGTCACCGTCTCCTCA 9 | >ighJ-5 IGH 2 VICUGNA_PACOS 10 | ACCCCCAGTTTGAATACTGGGGCCAGGGCACCCTGGTCACTGTCTCA 11 | >ighJ-6 IGH 2 VICUGNA_PACOS 12 | CTGACTTTGGTTCCTGGGGCCAGGGGACCCAGGTCACCGTCTCCTCGGGTGAGTCCTCA 13 | >ighJ-7 IGH 2 VICUGNA_PACOS 14 | ATTACTACGGCATGGACTACTGGGGCAAAGGGACCCTGGTCACCGTCTCCTCA 15 | -------------------------------------------------------------------------------- /riot_na/common/gene_match_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utility functions for handling gene matches and species information. 3 | 4 | This module provides shared functionality for converting between different gene match formats 5 | and extracting species information, avoiding the need for proxy classes and code duplication. 
6 | """ 7 | 8 | from typing import Sequence 9 | 10 | from riot_na.data.model import Gene, GeneAA 11 | 12 | 13 | def create_gene_lookup(genes: Sequence[Gene | GeneAA]) -> dict[str, Gene | GeneAA]: 14 | """Create a lookup dictionary for genes using species|locus|name as key.""" 15 | return {f"{gene.species.value}|{gene.locus.value}|{gene.name}": gene for gene in genes} 16 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/human/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHJ5*02 IGH 2 HOMO_SAPIENS 2 | ACAACTGGTTCGACCCCTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG 3 | >IGHJ4*02 IGH 2 HOMO_SAPIENS 4 | ACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG 5 | >IGHJ6*02 IGH 2 HOMO_SAPIENS 6 | ATTACTACTACTACTACGGTATGGACGTCTGGGGCCAAGGGACCACGGTCACCGTCTCCTCAG 7 | >IGHJ6*03 IGH 2 HOMO_SAPIENS 8 | ATTACTACTACTACTACTACATGGACGTCTGGGGCAAAGGGACCACGGTCACCGTCTCCTCAG 9 | >IGHJ2*01 IGH 1 HOMO_SAPIENS 10 | CTACTGGTACTTCGATCTCTGGGGCCGTGGCACCCTGGTCACTGTCTCCTCAG 11 | >IGHJ1*01 IGH 0 HOMO_SAPIENS 12 | GCTGAATACTTCCAGCACTGGGGCCAGGGCACCCTGGTCACCGTCTCCTCAG 13 | >IGHJ3*02 IGH 1 HOMO_SAPIENS 14 | TGATGCTTTTGATATCTGGGGCCAAGGGACAATGGTCACCGTCTCTTCAG 15 | -------------------------------------------------------------------------------- /tests/test_skbio_alignment.py: -------------------------------------------------------------------------------- 1 | from skbio.alignment import StripedSmithWaterman # type: ignore 2 | 3 | from riot_na.alignment.skbio_alignment import align 4 | 5 | 6 | def test_align(): 7 | # given 8 | query = "CTATACTACTATGGTTCGGGGAGTTATTATAGCCTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAGGGAGTGCATCCGCCCCAACCTCGT" 9 | target = "ACTACTTTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG" 10 | 11 | # when 12 | aligner = StripedSmithWaterman(query) 13 | alignment = align(aligner, "IGHJ4*02", target, 100, query) 14 | 15 | # then 16 | 17 | assert query[alignment.q_start : alignment.q_end] == 
"TTGACTACTGGGGCCAGGGAACCCTGGTCACCGTCTCCTCAG" 18 | 19 | 20 | if __name__ == "__main__": 21 | test_align() 22 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLJ5*01 IGL 1 HOMO_SAPIENS 2 | CTGGGTGTTTGGTGAGGGGACCGAGCTGACCGTCCTAG 3 | >IGLJ5*02 IGL 1 HOMO_SAPIENS 4 | CTGGGTGTTTGGTGAGGGGACGGAGCTGACCGTCCTAG 5 | >IGLJ6*01 IGL 1 HOMO_SAPIENS 6 | TAATGTGTTCGGCAGTGGCACCAAGGTGACCGTCCTCG 7 | >IGLJ7*02 IGL 1 HOMO_SAPIENS 8 | TGCTGTGTTCGGAGGAGGCACCCAGCTGACCGCCCTCG 9 | >IGLJ7*01 IGL 1 HOMO_SAPIENS 10 | TGCTGTGTTCGGAGGAGGCACCCAGCTGACCGTCCTCG 11 | >IGLJ2*01 IGL 1 HOMO_SAPIENS 12 | TGTGGTATTCGGCGGAGGGACCAAGCTGACCGTCCTAG 13 | >IGLJ1*01 IGL 1 HOMO_SAPIENS 14 | TTATGTCTTCGGAACTGGGACCAAGGTCACCGTCCTAG 15 | >IGLJ3*02 IGL 1 HOMO_SAPIENS 16 | TTGGGTGTTCGGCGGAGGGACCAAGCTGACCGTCCTAG 17 | >IGLJ4*01 IGL 1 HOMO_SAPIENS 18 | TTTTGTATTTGGTGGAGGAACCCAGCTGATCATTTTAG 19 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes/c_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLC1 IGL HOMO_SAPIENS 2 | GQPKANPTVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADGSPVKAGVETTKPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS 3 | >IGLC2 IGL HOMO_SAPIENS 4 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS 5 | >IGLC3 IGL HOMO_SAPIENS 6 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHKSYSCQVTHEGSTVEKTVAPTECS 7 | >IGLC6 IGL HOMO_SAPIENS 8 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVKVAWKADGSPVNTGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPAECS 9 | >IGLC7 IGL HOMO_SAPIENS 10 | GQPKAAPSVTLFPPSSEELQANKATLVCLVSDFNPGAVTVAWKADGSPVKVGVETTKPSKQSNNKYAASSYLSLTPEQWKSHRSYSCRVTHEGSTVEKTVAPAECS 
-------------------------------------------------------------------------------- /notebooks/final_benchmark/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Literal 2 | 3 | import pandas as pd 4 | 5 | 6 | def calculate_gene_allele_assignment_precision( 7 | ground_truth_df: pd.DataFrame, df: pd.DataFrame, gene: Literal["v", "j", "d", "c"] = "v" 8 | ) -> dict[str, float]: 9 | joined = ground_truth_df[[f"{gene}_call"]].join(df[[f"{gene}_call"]], lsuffix="_true", how="inner") 10 | return { 11 | "gene": (joined[f"{gene}_call_true"].str.split("*").str[0] == joined[f"{gene}_call"].str.split("*").str[0]) 12 | .value_counts(normalize=True) 13 | .loc[True] 14 | * 100, 15 | "allele": (joined[f"{gene}_call_true"] == joined[f"{gene}_call"]).value_counts(normalize=True).loc[True] * 100, 16 | } 17 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/c_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLC1 IGL HOMO_SAPIENS 2 | GQPKANPTVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADGSPVKAGVETTKPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS 3 | >IGLC2 IGL HOMO_SAPIENS 4 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPTECS 5 | >IGLC3 IGL HOMO_SAPIENS 6 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVTVAWKADSSPVKAGVETTTPSKQSNNKYAASSYLSLTPEQWKSHKSYSCQVTHEGSTVEKTVAPTECS 7 | >IGLC6 IGL HOMO_SAPIENS 8 | GQPKAAPSVTLFPPSSEELQANKATLVCLISDFYPGAVKVAWKADGSPVNTGVETTTPSKQSNNKYAASSYLSLTPEQWKSHRSYSCQVTHEGSTVEKTVAPAECS 9 | >IGLC7 IGL HOMO_SAPIENS 10 | GQPKAAPSVTLFPPSSEELQANKATLVCLVSDFNPGAVTVAWKADGSPVKVGVETTKPSKQSNNKYAASSYLSLTPEQWKSHRSYSCRVTHEGSTVEKTVAPAECS -------------------------------------------------------------------------------- /riot_na/__init__.py: -------------------------------------------------------------------------------- 1 | # pylint: 
skip-file 2 | # flake8: noqa 3 | # isort: skip_file 4 | __version__ = "4.0.7" 5 | from riot_na.riot_na import Prefiltering, GeneMatch 6 | from riot_na.api.riot_numbering import ( 7 | RiotNumberingAA, 8 | RiotNumberingNT, 9 | create_riot_aa, 10 | create_riot_nt, 11 | get_or_create_riot_aa, 12 | get_or_create_riot_nt, 13 | ) 14 | from riot_na.api.api_mp import run_on_file_mp 15 | from riot_na.data.scheme_regions import get_regions_definitions, get_region 16 | from riot_na.data.scheme_definitions import Regions, ChainRegions 17 | from riot_na.data.model import ( 18 | AirrRearrangementEntryNT, 19 | AirrRearrangementEntryAA, 20 | ShortRegion, 21 | ChainType, 22 | Scheme, 23 | Organism, 24 | Locus, 25 | ) 26 | from riot_na.api import utils 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Generated by Cargo 2 | # will have compiled files and executables 3 | debug/ 4 | target/ 5 | 6 | # Remove Cargo.lock from gitignore if creating an executable, leave it for libraries 7 | # More information here https://doc.rust-lang.org/cargo/guide/cargo-toml-vs-cargo-lock.html 8 | Cargo.lock 9 | 10 | # These are backup files generated by rustfmt 11 | **/*.rs.bk 12 | 13 | # MSVC Windows builds of rustc generate these, which store debugging information 14 | *.pdb 15 | 16 | __pycache__/ 17 | 18 | # Added by cargo 19 | 20 | /target 21 | data/ 22 | !/riot_na/data/ 23 | results/ 24 | 25 | venv 26 | scratchpad/ 27 | perf.* 28 | flamegraph* 29 | lib/ 30 | scratchpad* 31 | databases/* 32 | 33 | # Python 34 | __pycache__ 35 | .python-version 36 | .vscode/ 37 | cprofile.txt 38 | dist 39 | 40 | # C extensions 41 | *.so -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | edition = "2021" 3 | name = "riot_prefiltering" 4 | 
version = "1.0.0" 5 | 6 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 7 | 8 | [dependencies] 9 | ahash = "0.8.3" 10 | bio = "1.1.0" 11 | csv = "1.2.1" 12 | indicatif = "0.17.3" 13 | itertools = "0.10.5" 14 | pyo3 = { version = "0.19.2", features = ["extension-module"] } 15 | rand = "0.8.5" 16 | rayon = "1.7.0" 17 | serde = "1.0.164" 18 | serde_json = "1.0.99" 19 | 20 | [profile.release] 21 | debug = true 22 | 23 | [lib] 24 | # The name of the native library. This is the name which will be used in Python to import the 25 | # library (i.e. `import string_sum`). If you change this, you must also change the name of the 26 | # `#[pymodule]` in `src/lib.rs`. 27 | name = "riot_na" 28 | crate-type = ["cdylib"] 29 | path = "riot_prefiltering/lib.rs" -------------------------------------------------------------------------------- /riot_na/databases/gene_db/j_genes/mouse/igk.fasta: -------------------------------------------------------------------------------- 1 | >IGKJ-5NUZ IGK 1 MUS_MUSCULUS 2 | AACCACATTCAGTGATGGGACCAGACTGGAAATAAAAC 3 | >IGKJ-IIHF IGK 1 MUS_MUSCULUS 4 | AATCACATTCAGTGATGGGACCAGACTGGAAATAAAAC 5 | >IGKJ-LUDL IGK 1 MUS_MUSCULUS 6 | ATTCACGTTCGGCACGGGGACAAAATTGGAAATAAAAC 7 | >IGKJ-Z5J4 IGK 1 MUS_MUSCULUS 8 | ATTCACGTTCGGCTCGGGGACAAAGTTGGAAATAAAAC 9 | >IGKJ-WRAP IGK 0 MUS_MUSCULUS 10 | CCGACGTTCGGTGGAGGCACCAAGCTGGAAATCAATC 11 | >IGKJ-O4CL IGK 1 MUS_MUSCULUS 12 | GCTCACGTTCGGTGCTGGGACCAAGCTGGAGCTGAAAC 13 | >IGKJ-4JXG IGK 1 MUS_MUSCULUS 14 | GTGGACGTTCGGTGGAGGCACCAAGCTGGAAATCAAAC 15 | >IGKJ-MXRV IGK 2 MUS_MUSCULUS 16 | TGTACACGTTCGGAGGGGGGACCAAGCTGGAAATAAAAC 17 | >IGKJ-ABRI IGK 2 MUS_MUSCULUS 18 | TGTACACGTTCGGATCGGGGACCAAGCTGGAAATAAAAC 19 | >IGKJ-HVOR IGK 2 MUS_MUSCULUS 20 | TGTATACGTTCGGATCGGGGACCAAGCTGGAAATAAAAC 21 | >IGKJ-AR7L IGK 2 MUS_MUSCULUS 22 | TGTATACGTTCGGATCGGGGACCAAGCTGGAAATGAAAC 23 | -------------------------------------------------------------------------------- 
def get_sequence_alignment_with_masked_regions(record: pd.Series) -> str:
    """Return the record's ``sequence_alignment`` with every region masked out.

    Each non-empty value found under the ``REGIONS`` labels of ``record`` is
    replaced in the alignment by a run of mask characters of the same length:
    ``_`` when the region label contains ``"fwr"``, ``-`` otherwise.
    """
    masked = record["sequence_alignment"]
    # NaN entries are normalised to None so they can be skipped uniformly.
    region_values = record[REGIONS].replace(math.nan, None).to_dict()  # type: ignore
    for region_name, region in region_values.items():
        if region is None or len(region) == 0:
            continue
        mask_char = "_" if "fwr" in region_name else "-"
        # str.replace substitutes every occurrence of the region text.
        masked = masked.replace(region, mask_char * len(region))
    return masked


def get_asserion_error_msg(currentframe, rearrangement: AirrRearrangementEntryNT) -> str:
    """Build an assertion message of the form ``<file>:<line> sequence_id: <id>``.

    The id is the part of ``rearrangement.sequence_header`` before the first space.
    """
    location = getframeinfo(currentframe)
    sequence_id = rearrangement.sequence_header.split(" ", maxsplit=1)[0]
    return f"{location.filename}:{location.lineno} sequence_id: {sequence_id}"
def _count_fastq_records(lines) -> int:
    """Count FASTQ records in an iterable of lines, tolerating '@' in quality strings."""
    total = 0
    seq_len = 0
    qual_len = 0
    state = "header"
    for raw_line in lines:
        line = raw_line.rstrip("\r\n")
        if state == "header":
            if line.startswith("@"):
                total += 1
                seq_len = 0
                qual_len = 0
                state = "sequence"
        elif state == "sequence":
            if line.startswith("+"):
                # An empty read has no quality characters to consume.
                state = "quality" if seq_len else "header"
            else:
                seq_len += len(line)
        else:
            # Quality lines may legitimately begin with '@'; the quality block
            # ends once it is as long as the sequence it describes.
            qual_len += len(line)
            if qual_len >= seq_len:
                state = "header"
    return total


def count_fasta_records(input_fasta_path: Path, input_format: str = "fasta") -> int:
    """Count the records in a FASTA or FASTQ file without parsing the sequences.

    :param input_fasta_path: path of the file to scan
    :param input_format: ``"fasta"`` counts lines starting with ``>``; any other
        value is treated as FASTQ
    :return: number of records found (0 for an empty file)

    For FASTQ, only ``@`` lines in header position are counted. Counting every
    line that starts with ``@`` would over-count, because ``@`` is also a valid
    quality character and may open a quality line.
    """
    with open(input_fasta_path) as input_file:
        if input_format == "fasta":
            return sum(1 for line in input_file if line.startswith(">"))
        return _count_fastq_records(input_file)
def dict_merge(dct: dict, merge_dct: dict) -> dict:
    """Recursively merge ``merge_dct`` into ``dct`` in place.

    Inspired by :meth:``dict.update()``, but instead of updating only top-level
    keys, the merge recurses into dicts nested to an arbitrary depth. Where
    both sides hold a dict under the same key the two are merged; in every
    other case the value from ``merge_dct`` replaces the one in ``dct``.

    :param dct: dict onto which the merge is executed (mutated in place)
    :param merge_dct: dict whose entries are merged into ``dct``; values are
        stored by reference, not copied
    :return: ``dct`` itself, after the merge
    """
    for key, incoming in merge_dct.items():
        current = dct.get(key)
        if isinstance(current, dict) and isinstance(incoming, dict):
            dict_merge(current, incoming)
        else:
            dct[key] = incoming
    return dct
# Stub declarations for ``riot_na.riot_na``.
# NOTE(review): presumably this is the compiled extension module built from the
# Rust crate configured in Cargo.toml - confirm.
class GeneSegment:
    start_target: int
    end_target: int
    start_query: int
    end_query: int
    coverage: int
    match_count: int

class SegmentMatch:
    query_start: int
    query_end: int
    coverage: int
    match_count: int
    matching_genes: list[GeneMatch]
    segment_start: int

    def query_length(self) -> int: ...

# Result type of Prefiltering.calculate_top_matches*.
class PrefilteringResult:
    query: str
    rev_comp_query: str
    top_matches: list[GeneMatch]

# Result type of Prefiltering.calculate_segment_matches*.
class PrefilteringSegmentResult:
    query: str
    rev_comp_query: str
    segments: list[SegmentMatch]

    def domain_count(self) -> int: ...

class Prefiltering:
    # ``genes`` maps a gene identifier to its sequence.
    def __init__(
        self,
        genes: dict[str, str],
        kmer_size: int,
        distance_threshold: int,
        top_n: int,
        modulo_n: int,
        min_segment_length: int = 30,
        min_coverage: int = 20,
    ): ...
    def calculate_top_matches_with_rev_comp(self, query: str) -> PrefilteringResult: ...
    def calculate_top_matches(self, query: str) -> PrefilteringResult: ...
    def calculate_segment_matches(self, query: str) -> PrefilteringSegmentResult: ...
    def calculate_segment_matches_with_rev_comp(self, query: str) -> PrefilteringSegmentResult: ...
class SchemeMappingFacade:
    """In-memory lookup of per-gene scheme alignment strings loaded from CSV files.

    For every allowed species the file
    ``<db_dir>/scheme_mappings/<species>/<scheme>/scheme_mapping.csv`` is read;
    its ``gene_id`` and ``scheme_cigar`` columns populate :attr:`mappings`.
    """

    # Keys have the form "<species value>|<gene_id>".
    mappings: dict[GeneId, AlignmentString]

    def __init__(
        self, scheme: Scheme, allowed_species: Optional[tuple[Organism, ...]] = None, db_dir: Path = GENE_DB_DIR
    ):
        """Load the mappings of ``scheme`` for every species in ``allowed_species``.

        :param scheme: numbering scheme whose mapping files are read
        :param allowed_species: species to load; ``None`` or an empty tuple
            loads human, mouse and alpaca
        :param db_dir: root directory containing ``scheme_mappings``
        """
        if not allowed_species:
            allowed_species = (Organism.HOMO_SAPIENS, Organism.MUS_MUSCULUS, Organism.VICUGNA_PACOS)

        self.mappings = {}

        for species in allowed_species:
            file_path = db_dir / "scheme_mappings" / species.value / scheme.value / "scheme_mapping.csv"

            with file_path.open() as scheme_mapping_file:
                scheme_mapping_reader = csv.DictReader(scheme_mapping_file)
                for row in scheme_mapping_reader:
                    self.mappings[species.value + "|" + row["gene_id"]] = AlignmentString(row["scheme_cigar"])

    def get_mapping(self, organism: Organism, gene_id: str) -> AlignmentString:
        """Return the alignment string of ``gene_id`` for ``organism``.

        Only the part of ``gene_id`` after its last ``|`` is used, so both bare
        and ``|``-prefixed identifiers are accepted.

        :raises KeyError: if no mapping was loaded for that organism and gene
        """
        gene_id = gene_id.split("|")[-1]
        return AlignmentString(self.mappings[organism.value + "|" + gene_id])


if __name__ == "__main__":
    scheme_mapping_data_path = GENE_DB_DIR
    scheme_mapping_db = SchemeMappingFacade(
        scheme=Scheme.IMGT, allowed_species=(Organism.HOMO_SAPIENS,), db_dir=scheme_mapping_data_path
    )

    # Keys in ``mappings`` are species-prefixed, so a bare gene id must be
    # looked up through get_mapping rather than by indexing the dict.
    print(scheme_mapping_db.get_mapping(Organism.HOMO_SAPIENS, "IGHV1-2*07"))
    print(scheme_mapping_db.get_mapping(Organism.HOMO_SAPIENS, "IGKJ4*01"))
GGTATAGTGGGAGCTACTAC 35 | >IGHD5-24*01 HOMO_SAPIENS 36 | GTAGAGATGGCTACAATTAC 37 | >IGHD3-9*01 HOMO_SAPIENS 38 | GTATTACGATATTTTGACTGGTTATTATAAC 39 | >IGHD3-3*01 HOMO_SAPIENS 40 | GTATTACGATTTTTGGAGTGGTTATTATACC 41 | >IGHD3-22*01 HOMO_SAPIENS 42 | GTATTACTATGATAGTAGTGGTTATTACTAC 43 | >IGHD3-10*03 HOMO_SAPIENS 44 | GTATTACTATGGTTCAGGGAGTTATTATAAC 45 | >IGHD3-10*01 HOMO_SAPIENS 46 | GTATTACTATGGTTCGGGGAGTTATTATAAC 47 | >IGHD3-16*03 HOMO_SAPIENS 48 | GTATTATGATTACATTTGGGGGAGTTATCGTTATACC 49 | >IGHD3-16*02 HOMO_SAPIENS 50 | GTATTATGATTACGTTTGGGGGAGTTATCGTTATACC 51 | >IGHD5-18*02 HOMO_SAPIENS 52 | GTGAATATAGTGGCTACGATTAC 53 | >IGHD5-5*01 HOMO_SAPIENS 54 | GTGGATACAGCTATGGTTAC 55 | >IGHD5-12*01 HOMO_SAPIENS 56 | GTGGATATAGTGGCTACGATTAC 57 | >IGHD4-4*01 HOMO_SAPIENS 58 | TGACTACAGTAACTAC 59 | >IGHD4-17*01 HOMO_SAPIENS 60 | TGACTACGGTGACTAC 61 | >IGHD4-23*01 HOMO_SAPIENS 62 | TGACTACGGTGGTAACTCC 63 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/c_genes/human/igl.fasta: -------------------------------------------------------------------------------- 1 | >IGLC1 IGL HOMO_SAPIENS 2 | GGTCAGCCCAAGGCCAACCCCACTGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTCCAAGCCAACAAGGCCACACTAGT 3 | GTGTCTGATCAGTGACTTCTACCCGGGAGCTGTGACAGTGGCCTGGAAGGCAGATGGCAGCCCCGTCAAGGCGGGAGTGG 4 | AGACCACCAAACCCTCCAAACAGAGCAACAACAAGTACGCGGCCAGCAGCTACCTGAGCCTGACGCCCGAGCAGTGGAAG 5 | TCCCACAGAAGCTACAGCTGCCAGGTCACGCATGAAGGGAGCACCGTGGAGAAGACAGTGGCCCCTACAGAATGTTCATA 6 | G 7 | >IGLC2 IGL HOMO_SAPIENS 8 | GGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCGCCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGT 9 | GTGTCTCATAAGTGACTTCTACCCGGGAGCCGTGACAGTGGCCTGGAAGGCAGATAGCAGCCCCGTCAAGGCGGGAGTGG 10 | AGACCACCACACCCTCCAAACAAAGCAACAACAAGTACGCGGCCAGCAGCTATCTGAGCCTGACGCCTGAGCAGTGGAAG 11 | TCCCACAGAAGCTACAGCTGCCAGGTCACGCATGAAGGGAGCACCGTGGAGAAGACAGTGGCCCCTACAGAATGTTCATA 12 | G 13 | >IGLC3 IGL HOMO_SAPIENS 14 | GGTCAGCCCAAGGCTGCCCCCTCGGTCACTCTGTTCCCACCCTCCTCTGAGGAGCTTCAAGCCAACAAGGCCACACTGGT 15 | 
def melt_heavy_light(df: pd.DataFrame) -> pd.DataFrame:
    """Reshape one-row-per-therapeutic data into one row per chain.

    :param df: frame with ``therapeutic``, ``heavy_sequence`` and
        ``light_sequence`` columns; other columns are ignored
    :return: frame indexed by ``sequence_id`` (``<therapeutic>_heavy`` or
        ``<therapeutic>_light``) with ``therapeutic``, ``chain_type`` and
        ``sequence`` columns; rows whose sequence is the literal ``"na"`` are
        dropped
    """
    melted = df[["therapeutic", "heavy_sequence", "light_sequence"]].melt(
        ["therapeutic"], var_name="chain_type", value_name="sequence"
    )
    # Remove the literal "_sequence" suffix. ``str.strip("_sequence")`` would
    # strip any of those *characters* from both ends and only works by
    # coincidence for "heavy"/"light".
    chain_label = melted["chain_type"].str.replace(r"_sequence$", "", regex=True)
    melted["sequence_id"] = melted["therapeutic"] + "_" + chain_label
    melted = melted.set_index("sequence_id")
    return melted[melted["sequence"] != "na"]
def base64_decode_series(df, col):
    """Return a copy of ``df`` with ``col`` decoded, dropping rows where it is missing.

    Each remaining value is passed through ``base64_decode`` and then ``json.loads``.
    """
    decoded = df[df[col].notna()].copy()
    raw_json = decoded[col].apply(base64_decode)
    decoded[col] = raw_json.apply(json.loads)
    return decoded


def not_na_df(df):
    """Keep only the rows in which no cell is missing or an empty string."""
    is_filled = df.ne("") & df.notna()
    return df[is_filled.all(axis=1)]


def calculate_validation_flags_summary(df):
    """Count the values of each validation flag over productive rows with a V call.

    Returns one row per flag and one column per distinct flag value.
    """
    has_v_call = df["v_call"].notna() & (df["v_call"] != "")
    selected = df[has_v_call & df["productive"]]
    # Expanding with pd.Series turns each flags entry into one column per key.
    flags = selected["additional_validation_flags"].apply(pd.Series)
    counts = flags.apply(lambda column: column[column.notna()].value_counts())
    return counts.T


def validation_flags_comparison(
    df_1,
    df_2,
    df_1_name: str = "old",
    df_2_name: str = "new",
    additional_properties: Optional[list[str]] = None,
):
    """Compare validation-flag value counts between two result frames.

    Only productive rows with a non-empty ``v_call`` that are present in both
    frames (by index) are considered. The columns named in
    ``additional_properties`` (default ``productive`` and ``complete_vdj``) are
    counted alongside the flags. The result has one row per flag/property and
    the ``False``/``True`` counts of each frame plus their differences.
    """
    if additional_properties is None:
        additional_properties = ["productive", "complete_vdj"]

    def _productive_with_v_call(frame):
        has_v_call = frame["v_call"].notna() & (frame["v_call"] != "")
        return frame[has_v_call & frame["productive"]]

    df_1_notna = _productive_with_v_call(df_1)
    df_2_notna = _productive_with_v_call(df_2)
    common_index = df_1_notna.index.intersection(df_2_notna.index)

    def _flag_value_counts(frame):
        shared_rows = frame.loc[common_index]
        flags_as_columns = shared_rows["additional_validation_flags"].apply(pd.Series)
        for riot_property in additional_properties:
            flags_as_columns[riot_property] = frame.loc[common_index][riot_property]
        return flags_as_columns.apply(lambda column: column[column.notna()].value_counts()).T

    df_1_val_flags = _flag_value_counts(df_1_notna)
    df_2_val_flags = _flag_value_counts(df_2_notna)

    comp = df_1_val_flags.join(df_2_val_flags, lsuffix=f"_{df_1_name}", rsuffix=f"_{df_2_name}")
    comp = comp.fillna(0)

    false_1, false_2 = f"False_{df_1_name}", f"False_{df_2_name}"
    true_1, true_2 = f"True_{df_1_name}", f"True_{df_2_name}"
    false_diff = f"False_{df_1_name}-{df_2_name}"
    true_diff = f"True_{df_1_name}-{df_2_name}"

    comp[false_diff] = comp[false_1] - comp[false_2]
    comp[true_diff] = comp[true_1] - comp[true_2]
    return comp[[false_1, false_2, false_diff, true_1, true_2, true_diff]]
class MultiSpeciesPrefiltering:
    """Run ``Prefiltering`` over genes of several species and loci at once.

    Genes are registered under the composite id ``<species>|<locus>|<name>``,
    which is split back into its parts when results are mapped.
    """

    def __init__(
        self, all_genes: Sequence[Gene | GeneAA], kmer_size: int, distance_threshold: int, top_n: int, modulo_n: int
    ):
        """Index ``all_genes`` by composite id and build the underlying prefilter."""
        self.all_genes = {
            gene.species + "|" + gene.locus + "|" + gene.name: gene.sequence for gene in all_genes
        }

        self.prefiltering = Prefiltering(
            self.all_genes,
            top_n=top_n,
            kmer_size=kmer_size,
            distance_threshold=distance_threshold,
            modulo_n=modulo_n,
        )

    def calculate_top_matches_with_rev_comp(self, query: str) -> SpeciesPrefilteringResult:
        """Return top matches for ``query`` via the reverse-complement-aware search.

        A ``ValueError`` from the prefilter yields a result with no matches.
        """
        try:
            raw_result = self.prefiltering.calculate_top_matches_with_rev_comp(query)
        except ValueError:
            return SpeciesPrefilteringResult(query=query, rev_comp_query="", top_matches=[])
        else:
            return self._map_raw_result_to_matches(raw_result)

    def calculate_top_matches(self, query: str) -> SpeciesPrefilteringResult:
        """Return top matches for ``query``.

        A ``ValueError`` from the prefilter yields a result with no matches.
        """
        try:
            raw_result = self.prefiltering.calculate_top_matches(query)
        except ValueError:
            return SpeciesPrefilteringResult(query=query, rev_comp_query="", top_matches=[])
        else:
            return self._map_raw_result_to_matches(raw_result)

    def _map_raw_result_to_matches(self, raw_result: PrefilteringResult) -> SpeciesPrefilteringResult:
        """Convert raw matches into ``SpeciesGeneMatch`` objects by splitting the composite id."""

        def _to_species_match(match) -> SpeciesGeneMatch:
            # The composite id must contain exactly three "|"-separated parts.
            species, locus, name = match.gene_id.split("|")
            return SpeciesGeneMatch(
                species_gene_id=match.gene_id,
                gene_id=name,
                rev_comp=match.rev_comp,
                coverage=match.coverage,
                species=Organism(species),
                locus=Locus(locus),
            )

        mapped_matches = [_to_species_match(match) for match in raw_result.top_matches]
        return SpeciesPrefilteringResult(
            query=raw_result.query, rev_comp_query=raw_result.rev_comp_query, top_matches=mapped_matches
        )
STKAPSVYPLTARCGDTPGSTVAFGCLVWGYIPEPVTVTWNSGALSSGVHTFPSVFMSSGLYSLSSLVTLPTSSSTGKTFICNVAHPASSTKVDKRVEPHGGCTCPQCPAPELPGGPSVFVFPPKPKDVLSISGRPEVTCVVVDVGKEDPEVNFNWYIDGVEVRTANTKPKEEQFNSTYRVVSVLPIQHQDWLTGKEFKCKVNNKALPAPIERTISKAKGQTREPQVYTLAPHREELAKDTVSVTCLVKGFYPADINVEWQRNGQPESEGTYANTPPQLDNDGTYFLYSKLSVGKNTWQRGETLTCVVMHEALHNHYTQKSISQSPGK 9 | >ighgama2c IGH VICUGNA_PACOS 10 | AHHSEDPSSKCPKCPGPELLGGPTVFIFPPKPKDVLSITRKPEVTCVVVDVGKEDPEIEFSWSVDDTEVHTAETKPKEEQFNSTYRVVSVLPIQHQDWLTGKEFKCKVNNKALPAPIERTISKAKGQTREPQVYTLAPHREELAKDTVSVTCLVKGFFPADINVEWQRNGQPESEGTYATTPPQLDNDGTYFLYSKLSVGKNTWQQGEVFTCVVMHEALHNHSTQKSISQSPGK 11 | >ighepsilon IGH VICUGNA_PACOS 12 | ASTQKPTVFPLTCCKNTTDVTAVALGCLVTGYFPEPVTVTWDTGSLNSSTRTFPAIQNLESSLYTTSSQVTILGKWSKQKFTCSVAHPDSNITITKVVPGCFKDFPEPSVKLFHSSCNPDGDTHTTIQLLCLISGYTPGRIQVAWLEDGQAVTDRFPQTANDRPEGKLASTHSQLNITQEEWLSQKTYTCQVTYNGFTYEDHARKCTESDPRGVSAYLIPPTPLDLYVHKSPKITCLVVDLARKEGMNLTWFRENRGPAQPDSLVIKTQFNKTVTATSTLLVDVQDWIEGETYYCKVTHPDLPRSILRSISKAPGKRLAPEVYVLSPRKEERAAKDKLTLTCLAQNFFPEDISVQWLRNNALIQTDQHSTTKPHKANGPSPAFFVYSRLVVSRADWEQKNKFTCRVVHEALPGSRTLEKSVSSDLGK 13 | >ighalpha IGH VICUGNA_PACOS 14 | SEPATSPSVFPLGPSYDKASRQVVLACLVHGFFPPAPLKVTWGLSGQNVSVMDFPTVQPASGVLYTMSSQLTTPVEQCPDSEIVTCQVQHLSSSSQTVNVPCRAPTPQPLCCKPSLALHPPALEDLLLGSNASLTCTLSGLRNPEGAQFTWTPSGGKVAVQQSPKHDPCGCFSVSSVLPGCAEQWNSKTTFSCSATHPESENTLTATITKSLEDPIRPQVHLLPPPSEELALNEMVTLTCVVRGFSPKDVLVRWLHGNQELPREKYLTWRPLPEPEQSITTYAVTSLLRVEAEAWKQGDNYSCMVGHEALPLAFTQKTIDRLSGKPTHVNVSVVMAEAEGVCY 15 | -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/c_genes/alpaca/igh.fasta: -------------------------------------------------------------------------------- 1 | >ighmu IGH VICUGNA_PACOS 2 | 
SSSAPTLFPLASCESPVSDESPVALGCLARDFLPGSITFSWSYPNGIAVSSQSIKTFPSVLREGKYVATSQVLLPSQSVLQGSELICKVQHSKGNSDMVVPLPILDLPPSVTLFMPPRDGFSGTSKRTSKLICQATDFSPREISVSWFREGKRLVSGFITEDVEASKSNPGTFSVISMLTITDGDWFSQAVYTCQVEHRGMVIEKNVSSQCNPPSPGIEVFAIPPSFSDIFLNKSAKLTCLVTGLVTYDSLRISWTRQGEKAVDSQIIDSTILPNGTFSATCVASVCVEDWESGDRFTCTVTHLDLPSPLKRSIFKPEVHKHMPSVYVLPPAREQLSLRESASITCLVKGFSPPDVFVQWLKKGEQEPLSPDNYVTSAPVPEPNSPGYYFVHSVLTVSEKDWSAGATYTCVVGHEALPHLVTERTVDKSTGKPTLYNVSLVMSDTASTCY 3 | >ighgama2b IGH VICUGNA_PACOS 4 | EPKTPKPQPQPQPQPQPNPTTESKCPKCPAPELLGGPSVFIFPPKPKDVLSISGRPEVTCVVVDVGQEDPEVSFNWYIDGAEVRTANTRPKEEQFNSTYRVVSVLPIQHQDWLTGKEFKCKVNNKALPAPIEKTISKAKGQTREPQVYTLAPHREELAKDTVSVTCLVKGFYPPDINVEWQRNRQPEPEGTYATTPPQLDNDGTYFLYSKLSVGKNTWQRGETFTCVVMHEALHNHYTQKSITQSSGK 5 | >ighgama1a IGH VICUGNA_PACOS 6 | STKAPSVYPLTARCGDTPGSTVAFGCLVWGYIPEPVTVTWNSGALSSGVHTFPSVFMSSGLYTLSSLVTMPASSSTGKTFICNVAHPASSTKVDKRVELKTPQPQSQPECRCPKCPAPELLGGPSVFIFPPKPKDVLSISGRPEVTCVVVDVGQEDPEVSFNWYIDGAEVRTANTKPKEEQFNSTYRVVSVLPIRHQDWLTGKEFKCKVNNKALPAPIERTISKAKGQTREPQVYALAPHREELAKDTVSVTCLVKDFYPVDINIEWQRNGQPESEGTYATTPPQLDNDGTYFLYSKLSVGKNTWQRGETFTCVVMHEALPNHYTQKSITQSSGK 7 | >ighgama1b IGH VICUGNA_PACOS 8 | STKAPSVYPLTARCGDTPGSTVAFGCLVWGYIPEPVTVTWNSGALSSGVHTFPSVFMSSGLYSLSSLVTLPTSSSTGKTFICNVAHPASSTKVDKRVEPHGGCTCPQCPAPELPGGPSVFVFPPKPKDVLSISGRPEVTCVVVDVGKEDPEVNFNWYIDGVEVRTANTKPKEEQFNSTYRVVSVLPIQHQDWLTGKEFKCKVNNKALPAPIERTISKAKGQTREPQVYTLAPHREELAKDTVSVTCLVKGFYPADINVEWQRNGQPESEGTYANTPPQLDNDGTYFLYSKLSVGKNTWQRGETLTCVVMHEALHNHYTQKSISQSPGK 9 | >ighgama2c IGH VICUGNA_PACOS 10 | AHHSEDPSSKCPKCPGPELLGGPTVFIFPPKPKDVLSITRKPEVTCVVVDVGKEDPEIEFSWSVDDTEVHTAETKPKEEQFNSTYRVVSVLPIQHQDWLTGKEFKCKVNNKALPAPIERTISKAKGQTREPQVYTLAPHREELAKDTVSVTCLVKGFFPADINVEWQRNGQPESEGTYATTPPQLDNDGTYFLYSKLSVGKNTWQQGEVFTCVVMHEALHNHSTQKSISQSPGK 11 | >ighepsilon IGH VICUGNA_PACOS 12 | 
def collapse_ins_del(ops: Sequence[str]) -> AlignmentString:
    """Collapse a run of alignment ops, pairing insertions with deletions regardless of order.

    Every insertion/deletion pair is counted as one extra match.  The surplus
    indels (all of one kind) are placed in the middle of the matches; when the
    match count is odd the extra match goes to the left side.  Symbols other
    than ``M``, ``D`` and ``I`` are not counted.
    """
    n_deletions = ops.count("D")
    n_insertions = ops.count("I")
    total_matches = ops.count("M") + min(n_deletions, n_insertions)

    surplus = n_deletions - n_insertions
    if surplus == 0:
        return AlignmentString("M" * total_matches)

    head = ceil(total_matches / 2)
    tail = total_matches - head
    gap = "D" * surplus if surplus > 0 else "I" * -surplus
    return AlignmentString("M" * head + gap + "M" * tail)


def _collapse_ins_del_ordered(ops: Sequence[str]) -> AlignmentString:
    """Collapse insertion/deletion pairs while keeping the surviving ops in their original order.

    Each ``D`` is paired with the most recent still-pending ``I`` (and vice
    versa).  A paired deletion is rewritten to ``M`` and its partner insertion
    is dropped.  Symbols other than ``D`` and ``I`` are passed through and do
    not update the "previous op" state.
    """
    pending_deletions: list[int] = []
    pending_insertions: list[int] = []
    promote_to_match = set()
    drop_insertions = set()
    previous = ""

    for idx, symbol in enumerate(ops):
        if symbol == "D":
            # A deletion that directly follows an insertion starts a new deletion stack.
            if previous == "I":
                pending_deletions = []
            if pending_insertions:
                promote_to_match.add(idx)
                drop_insertions.add(pending_insertions.pop())
            else:
                pending_deletions.append(idx)
            previous = symbol
        elif symbol == "I":
            # An insertion that directly follows a deletion starts a new insertion stack.
            if previous == "D":
                pending_insertions = []
            if pending_deletions:
                promote_to_match.add(pending_deletions.pop())
                drop_insertions.add(idx)
            else:
                pending_insertions.append(idx)
            previous = symbol

    collapsed = []
    for idx, symbol in enumerate(ops):
        if idx in drop_insertions:
            continue
        collapsed.append("M" if idx in promote_to_match else symbol)
    return AlignmentString("".join(collapsed))


def collapse_alignment_str(alignment_str: AlignmentString, ordered: bool = False) -> AlignmentString:
    """Collapse every non-match run of an alignment string.

    Runs of ``M`` are copied unchanged; every other run is rewritten by
    ``_collapse_ins_del_ordered`` when ``ordered`` is true, otherwise by
    ``collapse_ins_del``.
    """
    collapse_run = _collapse_ins_del_ordered if ordered else collapse_ins_del
    pieces = []
    for is_match_run, run in groupby(alignment_str, lambda x: x == "M"):
        symbols = list(run)
        pieces.append("".join(symbols) if is_match_run else collapse_run(symbols))
    return AlignmentString("".join(pieces))
Path().absolute().parent / \"data\" / \"ngs_stratified\"\n", 20 | "METADATA_DIR = DATA_DIR / \"metadata\"" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 21, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "read_run_df = pd.read_csv(METADATA_DIR / \"read_run.csv\")\n", 30 | "sample_df = pd.read_csv(METADATA_DIR / \"sample.csv\")\n", 31 | "study_df = pd.read_csv(METADATA_DIR / \"study.csv\").set_index(\"study_accession\")\n", 32 | "ngs_sample_df = pd.read_csv(DATA_DIR / \"ngs_sample_clean.csv\")" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 47, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "ngs_sample_df[\"run_accession\"] = ngs_sample_df[\"sequence_id\"].str.split(\".\").str[0]\n", 42 | "metadata_df = read_run_df[[\"sample_accession\", \"study_accession\", \"run_accession\", \"library_source\"]].merge(\n", 43 | " sample_df[[\"sample_accession\", \"scientific_name\"]], on=\"sample_accession\"\n", 44 | ")\n", 45 | "filtered_metadata_df = metadata_df[\n", 46 | " (metadata_df[\"library_source\"] == \"TRANSCRIPTOMIC\") & (metadata_df[\"scientific_name\"] == \"Homo sapiens\")\n", 47 | "]\n", 48 | "filtered_ngs_sample_df = ngs_sample_df.merge(filtered_metadata_df.drop(columns=[\"study_accession\"]), on=\"run_accession\")\n", 49 | "\n", 50 | "filtered_ngs_sample_df = filtered_ngs_sample_df.drop_duplicates(subset=[\"sequence\"]) \n", 51 | "filtered_ngs_sample_df.to_csv(DATA_DIR / \"ngs_sample_human.csv\", index=False)\n", 52 | "with open(DATA_DIR / \"ngs_sample_human_nt.fasta\", \"w\") as file:\n", 53 | " for record in filtered_ngs_sample_df[[\"sequence_id\", \"sequence\"]].itertuples():\n", 54 | " file.write(f\">{record.sequence_id}\\n\")\n", 55 | " file.write(f\"{record.sequence}\\n\")" 56 | ] 57 | } 58 | ], 59 | "metadata": { 60 | "kernelspec": { 61 | "display_name": "riot-ZhbTWDtr-py3.10", 62 | "language": "python", 63 | "name": "python3" 64 | }, 65 | "language_info": { 66 | 
def test_collapse_alignment_str():
    """collapse_alignment_str rewrites each indel run, unordered by default and ordered on request."""
    unordered_cases = [
        ("MDDIM", "MMDM"),
        ("MDDDIIIIIIM", "MMMIIIMM"),
        ("MDDDMIDIDIIM", "MDDDMMIIMM"),
        # indels in the middle
        ("MMMIIIIDDDMMM", "MMMMMIMMMM"),
    ]
    for given, expected in unordered_cases:
        assert collapse_alignment_str(AlignmentString(given)) == expected

    # indels ordered
    assert collapse_alignment_str(AlignmentString("MMMIIIIDDDMMM"), ordered=True) == "MMMIMMMMMM"


def test_collapse_ins_del_ordered():
    """_collapse_ins_del_ordered keeps unpaired indels at their original position."""
    cases = [
        ("DDDIIII", "MMMI"),
        ("IIIDDDD", "MMMD"),
        ("DDDDIII", "DMMM"),
        ("IIIIDDD", "IMMM"),
        ("IDDDI", "MDM"),
        ("DIIID", "MIM"),
        ("DDDIII", "MMM"),
        ("IIIDDD", "MMM"),
        ("IDIDID", "MMM"),
        ("IDIIDIDDI", "MIMMM"),
        ("IIIIDDDIIDIDDDIID", "IMMMIMMMMD"),
        ("DDIDIDID", "DMMMD"),
        ("IIDIDIDI", "IMMMI"),
    ]
    for given, expected in cases:
        assert _collapse_ins_del_ordered(list(given)) == expected


def test_collapse_insdel_insertions_in_junctions():
    """Isolated insertions inside long deletion runs, for both collapsing strategies."""
    unordered_cases = [
        ("IDDDDDDDD ", "MDDDDDDD"),
        ("DIDDDDDDD ", "MDDDDDDD"),
        ("DIDDDDDID ", "MDDDDDM"),
        ("IDDDDDDID ", "MDDDDDM"),
        ("IDDDDDDDI ", "MDDDDDM"),
    ]
    for given, expected in unordered_cases:
        assert collapse_ins_del(list(given)) == expected

    ordered_cases = [
        ("DIDDDDDID", "MDDDDMD"),
        ("IMDDDDDDD", "MMDDDDDD"),
        ("IMDDDDDMI", "MMDDDMM"),
    ]
    for given, expected in ordered_cases:
        assert _collapse_ins_del_ordered(list(given)) == expected


if __name__ == "__main__":
    for run_test in (
        test_collapse_alignment_str,
        test_collapse_ins_del_ordered,
        test_collapse_insdel_insertions_in_junctions,
    ):
        run_test()
~=1.9", 15 | "tqdm ~= 4.66", 16 | "cachetools (>=5.5.2,<6.0.0)", 17 | ] 18 | description = "Antibody numbering software" 19 | name = "riot_na" 20 | readme = "README.md" 21 | requires-python = ">=3.10, <4.0" 22 | version = "4.0.7" 23 | 24 | [project.urls] 25 | Homepage = "https://github.com/NaturalAntibody/riot_na" 26 | 27 | [project.scripts] 28 | riot_na = "riot_na.cli:run_riot" 29 | 30 | # Poetry setup: for local development 31 | [tool.poetry] 32 | include = [{ path = "riot_na/riot_na.*.so", format = ["sdist", "wheel"] }] 33 | 34 | [tool.poetry.group.dev.dependencies] 35 | black = "^24.0" 36 | cutadapt = "^4.5" 37 | flake8 = "^7.0" 38 | fsspec = "^2023.9.2" 39 | isort = "^5.12" 40 | maturin = "^1.8.2" 41 | mypy = "^1.7" 42 | pre-commit = "^3.5.0" 43 | presto = "^0.7.1" 44 | pyarrow = "^18.0" 45 | pylint = "^3.0" 46 | pytest = "^7.4.2" 47 | pytest-cov = "^4.1.0" 48 | pytest-icdiff = "^0.9" 49 | s3fs = "^2023.9.2" 50 | tabulate = "^0.9.0" 51 | types-psutil = "^5.9.5.17" 52 | types-tqdm = "^4.66.0.5" 53 | commitizen = "^4.4.1" 54 | 55 | [tool.poetry.group.release.dependencies] 56 | python-semantic-release = "^9.4.0" 57 | 58 | [tool.poetry.scripts] 59 | riot_na = "riot_na.cli:run_riot" 60 | 61 | [build-system] 62 | build-backend = "maturin" 63 | requires = ["maturin>=1,<2"] 64 | 65 | # Maturin setup: for wheel building 66 | [tool.maturin] 67 | include = [{ path = "riot", format = "wheel" }] 68 | 69 | [tool.black] 70 | line-length = 120 71 | 72 | [tool.isort] 73 | profile = "black" 74 | 75 | [tool.pytest.ini_options] 76 | testpaths = ["tests"] 77 | 78 | [tool.semantic_release] 79 | build_command = "maturin build -r --out dist/" 80 | changelog_file = "CHANGELOG.md" 81 | commit_message = "bump: {version} [skip ci]\n\nAutomatically generated by python-semantic-release" 82 | commit_version_number = true 83 | hvcs = "github" 84 | tag_format = "v{version}" 85 | upload_to_pypi = false 86 | version_toml = ["pyproject.toml:project.version"] 87 | version_variables = 
SCHEME = Scheme.MARTIN


def test_smooth_cdr_junctions_martin_insertions_and_deletions():
    """Heavy chain: smooth_cdr_junctions must turn the given per-region alignment into the expected one."""
    given_regions = {
        "fwr1": "MMDMDMIIMMDMDDMMDMMDMMIIMMMMIMII",
        "cdr1": "DMDMIIMDM",
        "fwr2": "IMIMMDMDMMMDMMDDDMMMIMI",
        "cdr2": "MDDDM",
        "fwr3": "MDMMMMMDMMMMMMMMDMMMIIMMMMDMMIIMMMMIMMDMDMMIMIII",
        "cdr3": "MDMMMM",
        "fwr4": "IIMMMMDMMMDMMIIM",
    }
    expected_regions = {
        "fwr1": "MMDMDMIIMMDMDDMMDMMDMMIIMMMMIM",
        "cdr1": "MMMMMMIIM",
        "fwr2": "MIMMDMDMMMDMMDDDMMMIM",
        "cdr2": "DDMMM",
        "fwr3": "MDMMMMMDMMMMMMMMDMMMIIMMMMDMMIIMMMMIMMDMDMMIM",
        "cdr3": "MMMMMIIIIM",
        "fwr4": "MMMMDMMMDMMIIM",
    }
    given_alignment_str = AlignmentString("".join(given_regions.values()))
    then_alignment_str = AlignmentString("".join(expected_regions.values()))

    # Sanity check on the fixtures: the M/D/N count must equal the heavy-chain position count.
    for alignment in (given_alignment_str, then_alignment_str):
        assert sum(alignment.count(op) for op in "MDN") == MARTIN_POSITIONS_HEAVY

    smoothed = smooth_cdr_junctions(alignment_str=given_alignment_str, chain_type=ChainType.HEAVY, scheme=SCHEME)
    assert smoothed == then_alignment_str


def test_smooth_cdr_junctions_martin_insertions_and_deletions_light_chain():
    """Light chain: smooth_cdr_junctions must turn the given per-region alignment into the expected one."""
    given_regions = {
        "fwr1": "MMMMIMMMMDMMMMIIMMDDMMMMMMDMI",
        "cdr1": "MDMMIMMIMI",
        "fwr2": "MDMDMMMMIIMMDMMMMMD",
        "cdr2": "MDM",
        "fwr3": "IMIMMDDMMMMMIIMMMDMMMDMMMMMIIIMMMMDMMMMDMMMMMII",
        "cdr3": "MDMDDD",
        "fwr4": "MIMMMMIIMMDMDM",
    }
    expected_regions = {
        "fwr1": "MMMMIMMMMDMMMMIIMMDDMMMMMMDM",
        "cdr1": "MMMMMIIIMM",
        "fwr2": "MDMDMMMMIIMMDMMMMMD",
        "cdr2": "MMM",
        "fwr3": "MIMMDDMMMMMIIMMMDMMMDMMMMMIIIMMMMDMMMMDMMMMM",
        "cdr3": "MMMDDM",
        "fwr4": "MIMMMMIIMMDMDM",
    }
    given_alignment_str = AlignmentString("".join(given_regions.values()))
    then_alignment_str = AlignmentString("".join(expected_regions.values()))

    # Sanity check on the fixtures: the M/D/N count must equal the light-chain position count.
    for alignment in (given_alignment_str, then_alignment_str):
        assert sum(alignment.count(op) for op in "MDN") == MARTIN_POSITIONS_LIGHT

    smoothed = smooth_cdr_junctions(alignment_str=given_alignment_str, chain_type=ChainType.LIGHT, scheme=SCHEME)
    assert smoothed == then_alignment_str


if __name__ == "__main__":
    for run_test in (
        test_smooth_cdr_junctions_martin_insertions_and_deletions,
        test_smooth_cdr_junctions_martin_insertions_and_deletions_light_chain,
    ):
        run_test()
"IMIMMDMDMMMDMMDDDMMMIMI", 16 | "cdr2": "MDDDM", 17 | "fwr3": "MDMMMMMDMMMMMMMMDMMMIIMMMMDMMIIMMMMIMMDMDMMIMIII", 18 | "cdr3": "MDMMMM", 19 | "fwr4": "IIMMMMDMMMDMMIIM", 20 | } 21 | given_alignment_str = AlignmentString("".join(regions.values())) 22 | assert sum(map(given_alignment_str.count, ["M", "D", "N"])) == CHOTHIA_POSITIONS_HEAVY 23 | 24 | regions = { 25 | "fwr1": "MMDMDMIIMMDMDDMMDMMDMMIIMMMMIM", 26 | "cdr1": "MMMMMMIIM", 27 | "fwr2": "MIMMDMDMMMDMMDDDMMMIM", 28 | "cdr2": "DDMMM", 29 | "fwr3": "MDMMMMMDMMMMMMMMDMMMIIMMMMDMMIIMMMMIMMDMDMMIM", 30 | "cdr3": "MMMMMIIIIM", 31 | "fwr4": "MMMMDMMMDMMIIM", 32 | } 33 | then_alignment_str = AlignmentString("".join(regions.values())) 34 | assert sum(map(then_alignment_str.count, ["M", "D", "N"])) == CHOTHIA_POSITIONS_HEAVY 35 | 36 | assert ( 37 | smooth_cdr_junctions(alignment_str=given_alignment_str, chain_type=ChainType.HEAVY, scheme=SCHEME) 38 | == then_alignment_str 39 | ) 40 | 41 | 42 | def test_smooth_cdr_junctions_chothia_insertions_and_deletions_light_chain(): 43 | regions = { 44 | "fwr1": "MMMMIMMMMDMMMMIIMMDDMMMMMMDMI", 45 | "cdr1": "MDMMIMMIMI", 46 | "fwr2": "MDMDMMMMIIMMDMMMMMD", 47 | "cdr2": "MDM", 48 | "fwr3": "IMIMMDDMMMMMIIMMMDMMMDMMMMMIIIMMMMDMMMMDMMMMMII", 49 | "cdr3": "MDMDDD", 50 | "fwr4": "MIMMMMIIMMDMDM", 51 | } 52 | given_alignment_str = AlignmentString("".join(regions.values())) 53 | assert sum(map(given_alignment_str.count, ["M", "D", "N"])) == CHOTHIA_POSITIONS_LIGHT 54 | 55 | regions = { 56 | "fwr1": "MMMMIMMMMDMMMMIIMMDDMMMMMMDM", 57 | "cdr1": "MMMMMIIIMM", 58 | "fwr2": "MDMDMMMMIIMMDMMMMMD", 59 | "cdr2": "MMM", 60 | "fwr3": "MIMMDDMMMMMIIMMMDMMMDMMMMMIIIMMMMDMMMMDMMMMM", 61 | "cdr3": "MMMDDM", 62 | "fwr4": "MIMMMMIIMMDMDM", 63 | } 64 | then_alignment_str = AlignmentString("".join(regions.values())) 65 | assert sum(map(then_alignment_str.count, ["M", "D", "N"])) == CHOTHIA_POSITIONS_LIGHT 66 | 67 | assert ( 68 | smooth_cdr_junctions(alignment_str=given_alignment_str, 
chain_type=ChainType.LIGHT, scheme=SCHEME) 69 | == then_alignment_str 70 | ) 71 | 72 | 73 | if __name__ == "__main__": 74 | test_smooth_cdr_junctions_chothia_insertions_and_deletions() 75 | test_smooth_cdr_junctions_chothia_insertions_and_deletions_light_chain() 76 | -------------------------------------------------------------------------------- /riot_na/api/utils.py: -------------------------------------------------------------------------------- 1 | from functools import cache 2 | 3 | from riot_na.data.model import AirrRearrangementEntryAA, ShortRegion 4 | 5 | 6 | def get_region_position_indices(airr: AirrRearrangementEntryAA, region: ShortRegion) -> list[int]: 7 | match region: 8 | case ShortRegion.CDR1: 9 | assert airr.cdr1_start_aa and airr.cdr1_end_aa 10 | return list(range(airr.cdr1_start_aa - 1, airr.cdr1_end_aa)) 11 | case ShortRegion.CDR2: 12 | assert airr.cdr2_start_aa and airr.cdr2_end_aa 13 | return list(range(airr.cdr2_start_aa - 1, airr.cdr2_end_aa)) 14 | case ShortRegion.CDR3: 15 | assert airr.cdr3_start_aa and airr.cdr3_end_aa 16 | return list(range(airr.cdr3_start_aa - 1, airr.cdr3_end_aa)) 17 | case ShortRegion.FW1: 18 | assert airr.fwr1_start_aa and airr.fwr1_end_aa 19 | return list(range(airr.fwr1_start_aa - 1, airr.fwr1_end_aa)) 20 | case ShortRegion.FW2: 21 | assert airr.fwr2_start_aa and airr.fwr2_end_aa 22 | return list(range(airr.fwr2_start_aa - 1, airr.fwr2_end_aa)) 23 | case ShortRegion.FW3: 24 | assert airr.fwr3_start_aa and airr.fwr3_end_aa 25 | return list(range(airr.fwr3_start_aa - 1, airr.fwr3_end_aa)) 26 | case ShortRegion.FW4: 27 | assert airr.fwr4_start_aa and airr.fwr4_end_aa 28 | return list(range(airr.fwr4_start_aa - 1, airr.fwr4_end_aa)) 29 | 30 | 31 | def get_regions_position_indices(airr: AirrRearrangementEntryAA, regions: list[ShortRegion]) -> list[int]: 32 | indices = set() 33 | for region in regions: 34 | indices.update(get_region_position_indices(airr, region)) 35 | return sorted(list(indices)) 36 | 37 | 38 | def 
scheme_positions_to_index(airr: AirrRearrangementEntryAA, scheme_positions: list[str]) -> list[int]: 39 | assert airr.positional_scheme_mapping 40 | scheme_positional_mapping = {v: k for k, v in airr.positional_scheme_mapping.items()} 41 | return [scheme_positional_mapping[pos] for pos in scheme_positions] 42 | 43 | 44 | def get_primary_seq(airr: AirrRearrangementEntryAA) -> str: 45 | assert airr.scheme_residue_mapping is not None 46 | return "".join(airr.scheme_residue_mapping.values()) 47 | 48 | 49 | @cache 50 | def int_to_str_insertion(n: int) -> str: 51 | """ 52 | Converts an integer (1-based) to an IMGT-style insertion letter. 53 | Example: 54 | 1 -> 'A', 26 -> 'Z', 27 -> 'AA', 28 -> 'AB', ..., 52 -> 'AZ', 53 -> 'BA' 55 | """ 56 | if n < 1: 57 | raise ValueError("Input must be a positive integer.") 58 | 59 | result = "" 60 | while n > 0: 61 | n -= 1 # Adjust for 1-based indexing 62 | result = chr(ord("A") + (n % 26)) + result 63 | n //= 26 64 | return result 65 | 66 | 67 | def map_insertion_number_to_letter(position: str) -> str: 68 | # Translates 111.1, 111.2 to 111A, 111B ETC 69 | if "." 
not in position: 70 | return position 71 | position_number, insertion_number = position.split(".") 72 | return f"{position_number}{int_to_str_insertion(int(insertion_number))}" 73 | -------------------------------------------------------------------------------- /riot_na/alignment/alignment_metrics.py: -------------------------------------------------------------------------------- 1 | import math 2 | from enum import Enum 3 | 4 | import blosum # type: ignore 5 | 6 | from riot_na.alignment.alignment_utils import get_cigar_op_groups, unfold_cigar 7 | from riot_na.data.model import Cigar 8 | 9 | # L (Lambda) and K constants are depended on scoring matrix and gap penalties 10 | # and are calculated by ALP (Ascending Ladder Program) - https://doi.org/10.1093%2Fbioinformatics%2Fbtv575 11 | # The present values were calculated for the following parameters (default SSW): 12 | # Match score: 2 13 | # Mismatch score: -3 14 | # Gap open penalty: 5 15 | # Gap extend penalty: 2 16 | # Background probabilities: A, T, G, C 0.2499975, N 0.00001 17 | 18 | 19 | class GumbellParams(Enum): 20 | IGBLAST = {"L": 1.08, "K": 0.28} 21 | AA = {"L": 0.26453605633241922, "K": 0.043186874595437463} 22 | 23 | 24 | def compute_raw_score_aa( 25 | query: str, 26 | target: str, 27 | cigar: Cigar, 28 | gap_open_penalty=11, 29 | gap_extend_penalty=1, 30 | substitution_matrix=blosum.BLOSUM(62), 31 | ) -> float: 32 | raw_score = 0 33 | query_pos = 0 34 | target_pos = 0 35 | prev_match = True 36 | for op in unfold_cigar(cigar): 37 | if op == "M": 38 | raw_score += substitution_matrix[query[query_pos]][target[target_pos]] 39 | 40 | query_pos += 1 41 | target_pos += 1 42 | prev_match = True 43 | elif op == "I": 44 | if prev_match: 45 | raw_score -= gap_open_penalty 46 | prev_match = False 47 | else: 48 | raw_score -= gap_extend_penalty 49 | query_pos += 1 50 | elif op == "D": 51 | if prev_match: 52 | raw_score -= gap_open_penalty 53 | prev_match = False 54 | else: 55 | raw_score -= gap_extend_penalty 56 
| target_pos += 1 57 | return raw_score 58 | 59 | 60 | def compute_bit_score(raw_score: int, gumbell_params: GumbellParams = GumbellParams.IGBLAST) -> float: 61 | return (gumbell_params.value["L"] * raw_score - math.log(gumbell_params.value["K"])) / math.log(2) 62 | 63 | 64 | def compute_raw_score_from_bit_score(bit_score: float, gumbell_params: GumbellParams = GumbellParams.IGBLAST) -> float: 65 | return (math.log(gumbell_params.value["K"]) + bit_score * math.log(2)) / gumbell_params.value["L"] 66 | 67 | 68 | def compute_evalue(query_length: int, db_length: int, bit_score: float) -> float: 69 | return query_length * db_length * 2 ** (-bit_score) 70 | 71 | 72 | def calculate_seq_identity(cigar: Cigar, query: str, target: str, query_start: int = 0, target_start: int = 0) -> float: 73 | cigar_items = get_cigar_op_groups(cigar) 74 | query_pos = query_start 75 | target_pos = target_start 76 | match_cnt = 0 77 | total = 0 78 | for op_cnt, op in cigar_items: 79 | match op: 80 | case "M": 81 | for i in range(op_cnt): 82 | if query[query_pos + i] == target[target_pos + i]: 83 | match_cnt += 1 84 | query_pos += op_cnt 85 | target_pos += op_cnt 86 | case "I": 87 | query_pos += op_cnt 88 | case "D": 89 | target_pos += op_cnt 90 | total += op_cnt 91 | return match_cnt / total 92 | -------------------------------------------------------------------------------- /riot_na/common/multi_species_segment_prefiltering.py: -------------------------------------------------------------------------------- 1 | from typing import Sequence 2 | 3 | from riot_na import Prefiltering 4 | from riot_na.data.model import ( 5 | Gene, 6 | GeneAA, 7 | Locus, 8 | Organism, 9 | SegmentMatch, 10 | SpeciesGeneMatch, 11 | SpeciesPrefilteringSegmentResult, 12 | ) 13 | from riot_na.riot_na import ( # pylint: disable=no-name-in-module 14 | PrefilteringSegmentResult, 15 | ) 16 | 17 | 18 | class MultiSpeciesSegmentPrefiltering: 19 | def __init__( 20 | self, 21 | all_genes: Sequence[Gene | GeneAA], 22 | 
kmer_size: int, 23 | distance_threshold: int, 24 | top_n: int, 25 | modulo_n: int, 26 | min_segment_length: int = 30, 27 | min_coverage: int = 20, 28 | ): 29 | self.all_genes = dict( 30 | map(lambda gene: (gene.species + "|" + gene.locus + "|" + gene.name, gene.sequence), all_genes) 31 | ) 32 | 33 | self.prefiltering = Prefiltering( 34 | self.all_genes, 35 | top_n=top_n, 36 | kmer_size=kmer_size, 37 | distance_threshold=distance_threshold, 38 | modulo_n=modulo_n, 39 | min_segment_length=min_segment_length, 40 | min_coverage=min_coverage, 41 | ) 42 | 43 | def calculate_segment_matches_with_rev_comp(self, query: str) -> SpeciesPrefilteringSegmentResult: 44 | try: 45 | raw_result = self.prefiltering.calculate_segment_matches_with_rev_comp(query) 46 | except ValueError: 47 | return SpeciesPrefilteringSegmentResult(query=query, rev_comp_query="", segments=[]) 48 | 49 | return self._map_raw_result_to_segments(raw_result) 50 | 51 | def calculate_segment_matches(self, query: str) -> SpeciesPrefilteringSegmentResult: 52 | try: 53 | raw_result = self.prefiltering.calculate_segment_matches(query) 54 | except ValueError: 55 | return SpeciesPrefilteringSegmentResult(query=query, rev_comp_query="", segments=[]) 56 | 57 | return self._map_raw_result_to_segments(raw_result) 58 | 59 | def _map_raw_result_to_segments(self, raw_result: PrefilteringSegmentResult) -> SpeciesPrefilteringSegmentResult: 60 | segments = [] 61 | for segment in raw_result.segments: 62 | mapped_matches = [] 63 | for match in segment.matching_genes: 64 | (species, locus, name) = match.gene_id.split("|") 65 | mapped_matches.append( 66 | SpeciesGeneMatch( 67 | species_gene_id=match.gene_id, 68 | gene_id=name, 69 | rev_comp=match.rev_comp, 70 | coverage=match.coverage, 71 | species=Organism(species), 72 | locus=Locus(locus), 73 | ) 74 | ) 75 | segments.append( 76 | SegmentMatch( 77 | segment_start=segment.segment_start, 78 | segment_length=segment.query_length(), 79 | query_start=segment.query_start, 80 | 
def get_cigar_op_groups(cigar: Cigar) -> Iterator[tuple[int, str]]:
    """Yield ``(count, operation)`` pairs from a CIGAR string such as ``"10M2D"``."""
    # groupby alternates between digit runs and operation runs; zipping the
    # generator with itself consumes them two at a time as (count, op).
    cigar_groups = (list(grouper) for _, grouper in groupby(cigar, lambda character: character.isdigit()))
    for group_size_grouper, op_grouper in zip(cigar_groups, cigar_groups):
        yield int("".join(group_size_grouper)), op_grouper[0]


def unfold_cigar(cigar: Cigar) -> AlignmentString:
    """Expand a CIGAR string into one operation character per position (``"2M1I"`` -> ``"MMI"``)."""
    return AlignmentString("".join(operation * group_size for group_size, operation in get_cigar_op_groups(cigar)))


def fold_cigar(alignment_str: AlignmentString) -> Cigar:
    """Run-length encode an alignment string into a CIGAR string (``"MMI"`` -> ``"2M1I"``)."""
    return Cigar("".join(f"{sum(1 for _ in op_group)}{op}" for op, op_group in groupby(alignment_str)))


def has_frameshift(cigar: Cigar) -> bool:
    """Return True when the net indel length before some ``M`` group is not a multiple of 3.

    Insertions subtract from and deletions add to a running counter.  The
    counter is checked and reset at every ``M`` group, so indels after the
    last ``M`` group are never checked.
    """
    counter = 0

    for size, operation in get_cigar_op_groups(cigar):
        if operation == "M":
            if counter % 3 != 0:
                return True
            counter = 0

        if operation == "I":
            counter -= size
        elif operation == "D":
            counter += size

    return False


def translate(query_sequence: str, coding_frame: int) -> str:
    """Translate ``query_sequence`` starting at ``coding_frame`` (0, 1 or 2), dropping any trailing partial codon."""
    assert coding_frame in [0, 1, 2]

    query_sequence = query_sequence[coding_frame:]
    partial_codon_len = len(query_sequence) % 3
    if partial_codon_len:
        query_sequence = query_sequence[:-partial_codon_len]  # To avoid BioPython "Partial codon" warning.
    coding_dna = Seq(query_sequence)
    return str(coding_dna.translate(gap="."))


def infer_reading_frame(t_start: int, t_frame: int) -> int:
    """Derive the query reading frame from the target alignment start and the target reading frame.

    Worked example: with t_start = 11 and t_frame = 1,
    frame_offset = (11 - 1) % 3 = 1 and q_frame = (3 - 1) % 3 = 2.
    """
    # offset from the reading frame of the first aligned position
    frame_offset = (t_start - t_frame) % 3
    q_frame = (3 - frame_offset) % 3
    return q_frame


def offset_alignments(offset: int, aln: AlignmentEntry) -> AlignmentEntry:
    """Return a copy of ``aln`` with ``q_start`` and ``q_end`` shifted by ``offset``."""
    return replace(aln, q_start=aln.q_start + offset, q_end=aln.q_end + offset)


def align_sequences(query: str, target: str, alignment: AlignmentString) -> tuple[str, str]:
    """Render ``query`` and ``target`` as two gapped strings following ``alignment``.

    ``M`` consumes one character from both sequences, ``I`` consumes one from
    the query and puts ``-`` in the target row, ``D`` consumes one from the
    target and puts ``-`` in the query row.  Any other symbol is ignored.
    Raises ``IndexError`` when the alignment consumes more characters than a
    sequence has.
    """
    query_aln = []
    target_aln = []
    # Walk both sequences with cursors rather than re-slicing them on every
    # op, which copied the remaining string each time.
    query_pos = 0
    target_pos = 0

    for op in alignment:
        if op == "M":
            query_aln.append(query[query_pos])
            target_aln.append(target[target_pos])
            query_pos += 1
            target_pos += 1
        elif op == "I":
            query_aln.append(query[query_pos])
            target_aln.append("-")
            query_pos += 1
        elif op == "D":
            query_aln.append("-")
            target_aln.append(target[target_pos])
            target_pos += 1

    return "".join(query_aln), "".join(target_aln)
GSASAPTLFPLVSCENSPSDTSSVAVGCLAQDFLPDSITFSWKYKNNSDISSTRGFPSVLRGGKYAATSQVLLPSKDVMQGTDEHVVCKVQHPNGNKEKNVPLPVIAELPPKVSVFVPPRDGFFGNPRKSKLICQATGFSPRQIQVSWLREGKQVGSGVTTDQVQAEAKESGPTTYKVTSTLTIKESDWLGQSMFTCRVDHRGLTFQQNASSMCVPDQDTAIRVFAIPPSFASIFLTKSTKLTCLVTDLTTYDSVTISWTRQNGEAVKTHTNISESHPNATFSAVGEASICEDDWNSGERFTCTVTHTDLPSPLKQTISRPKGVALHRPDVYLLPPAREQLNLRESATITCLVTGFSPADVFVQWMQRGQPLSPEKYVTSAPMPEPQAPGRYFAHSILTVSEEEWNTGETYTCVVAHEALPNRVTERTVDKSTEGEVSADEEGFENLWATASTFIVLFLLSLFYSTTVTLFKVK 3 | >IGHG1 IGH HOMO_SAPIENS 4 | ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIIPDYRNMIGQGA 5 | >IGHG2 IGH HOMO_SAPIENS 6 | ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSNFGTQTYTCNVDHKPSNTKVDKTVERKCCVECPPCPAPPVAGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVQFNWYVDGVEVHNAKTKPREEQFNSTFRVVSVLTVVHQDWLNGKEYKCKVSNKGLPAPIEKTISKTKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDISVEWESNGQPENNYKTTPPMLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATITFFKVKWIFSSVVDLKQTIVPDYRNMIRQGA 7 | >IGHG3 IGH HOMO_SAPIENS 8 | ASTKGPSVFPLAPCSRSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYTCNVNHKPSNTKVDKRVELKTPLGDTTHTCPRCPEPKSCDTPPPCPRCPEPKSCDTPPPCPRCPEPKSCDTPPPCPRCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVQFKWYVDGVEVHNAKTKPREEQYNSTFRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKTKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESSGQPENNYNTTPPMLDSDGSFFLYSKLTVDKSRWQQGNIFSCSVMHEALHNRFTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIIPDYRNMIGQGA 9 | >IGHG4 IGH HOMO_SAPIENS 10 | 
ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTKTYTCNVDHKPSNTKVDKRVESKYGPPCPSCPAPEFLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSQEDPEVQFNWYVDGVEVHNAKTKPREEQFNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKGLPSSIEKTISKAKGQPREPQVYTLPPSQEEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSRLTVDKSRWQEGNVFSCSVMHEALHNHYTQKSLSLSLELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIVPDYRNMIRQGA 11 | >IGHA1 IGH HOMO_SAPIENS 12 | ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCPVPSTPPTPSPSTPPTPSPSCCHPRLSLHRPALEDLLLGSEANLTCTLTGLRDASGVTFTWTPSSGKSAVQGPPERDLCGCYSVSSVLPGCAEPWNHGKTFTCTAAYPESKTPLTATLSKSGNTFRPEVHLLPPPSEELALNELVTLTCLARGFSPKDVLVRWLQGSQELPREKYLTWASRQEPSQGTTTFAVTSILRVAAEDWKKGDTFSCMVGHEALPLAFTQKTIDRLADWQMPPPYVVLDLPQETLEEETPGANLWPTTITFLTLFLLSLFYSTALTVTSVRGPSGNREGPQY 13 | >IGHA2 IGH HOMO_SAPIENS 14 | ASPTSPKVFPLSLDSTPQDGNVVVACLVQGFFPQEPLSVTWSESGQNVTARNFPPSQDASGDLYTTSSQLTLPATQCPDGKSVTCHVKHYTNSSQDVTVPCRVPPPPPCCHPRLSLHRPALEDLLLGSEANLTCTLTGLRDASGATFTWTPSSGKSAVQGPPERDLCGCYSVSSVLPGCAQPWNHGETFTCTAAHPELKTPLTANITKSGNTFRPEVHLLPPPSEELALNELVTLTCLARGFSPKDVLVRWLQGSQELPREKYLTWASRQEPSQGTTTYAVTSILRVAAEDWKKGETFSCMVGHEALPLAFTQKTIDRMAGSCCVADWQMPPPYVVLDLPQETLEEETPGANLWPTTITFLTLFLLSLFYSTALTVTSVRGPSGKREGPQY 15 | >IGHD IGH HOMO_SAPIENS 16 | APTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWYMGTQSQPQRTFPEIQRRDSYYMTSSQLSTPLQQWRQGEYKCVVQHTASKSKKEIFRWPESPKAQASSVPTAQPQAEGSLAKATTAPATTRNTGRGGEEKKKEKEKEEQEERETKTPECPSHTQPLGVYLLTPAVQDLWLRDKATFTCFVVGSDLKDAHLTWEVAGKVPTGGVEEGLLERHSNGSQSQHSRLTLPRSLWNAGTSVTCTLNHPSLPPQRLMALREPAAQAPVKLSLNLLASSDPPEAASWLLCEVSGFSPPNILLMWLEDQREVNTSGFAPARPPPQPRSTTFWAWSVLRVPAPPSPQPATYTCVVSHEDSRTLLNASRSLEVSYLAMTPLIPQSKDENSDDYTTFDDVGSLWTTLSTFVALFILTLLYSGIVTFIKVK 17 | >IGHE IGH HOMO_SAPIENS 18 | 
ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTWDTGSLNGTTMTLPATTLTLSGHYATISLLTVSGAWAKQMFTCRVAHTPSSTDWVDNKTFSVCSRDFTPPTVKILQSSCDGGGHFPPTIQLLCLVSGYTPGTINITWLEDGQVMDVDLSTASTTQEGELASTQSELTLSQKHWLSDRTYTCQVTYQGHTFEDSTKKCADSNPRGVSAYLSRPSPFDLFIRKSPTITCLVVDLAPSKGTVNLTWSRASGKPVNHSTRKEEKQRNGTLTVTSTLPVGTRDWIEGETYQCRVTHPHLPRALMRSTTKTSGPRAAPEVYAFATPEWPGSRDKRTLACLIQNFMPEDISVQWLHNEVQLPDARHSTTQPRKTKGSGFFVFSRLEVTRAEWEQKDEFICRAVHEAASPSQTVQRAVSVNPGLAGGSAQSQRAPDRVLCHSGQQQGLPRAAGGSVPHPRCHCGAGRADWPGPPELDVCVEEAEGEAPWTWTGLCIFAALFLLSVSYSAAITLLMVQRFLSATRQGRPQTSLDYTNVLQPHA -------------------------------------------------------------------------------- /riot_na/databases/gene_db/aa_genes_deduplicated/c_genes/human/igh.fasta: -------------------------------------------------------------------------------- 1 | >IGHM IGH HOMO_SAPIENS 2 | GSASAPTLFPLVSCENSPSDTSSVAVGCLAQDFLPDSITFSWKYKNNSDISSTRGFPSVLRGGKYAATSQVLLPSKDVMQGTDEHVVCKVQHPNGNKEKNVPLPVIAELPPKVSVFVPPRDGFFGNPRKSKLICQATGFSPRQIQVSWLREGKQVGSGVTTDQVQAEAKESGPTTYKVTSTLTIKESDWLGQSMFTCRVDHRGLTFQQNASSMCVPDQDTAIRVFAIPPSFASIFLTKSTKLTCLVTDLTTYDSVTISWTRQNGEAVKTHTNISESHPNATFSAVGEASICEDDWNSGERFTCTVTHTDLPSPLKQTISRPKGVALHRPDVYLLPPAREQLNLRESATITCLVTGFSPADVFVQWMQRGQPLSPEKYVTSAPMPEPQAPGRYFAHSILTVSEEEWNTGETYTCVVAHEALPNRVTERTVDKSTEGEVSADEEGFENLWATASTFIVLFLLSLFYSTTVTLFKVK 3 | >IGHG1 IGH HOMO_SAPIENS 4 | ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIIPDYRNMIGQGA 5 | >IGHG2 IGH HOMO_SAPIENS 6 | 
ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSNFGTQTYTCNVDHKPSNTKVDKTVERKCCVECPPCPAPPVAGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVQFNWYVDGVEVHNAKTKPREEQFNSTFRVVSVLTVVHQDWLNGKEYKCKVSNKGLPAPIEKTISKTKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDISVEWESNGQPENNYKTTPPMLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATITFFKVKWIFSSVVDLKQTIVPDYRNMIRQGA 7 | >IGHG3 IGH HOMO_SAPIENS 8 | ASTKGPSVFPLAPCSRSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYTCNVNHKPSNTKVDKRVELKTPLGDTTHTCPRCPEPKSCDTPPPCPRCPEPKSCDTPPPCPRCPEPKSCDTPPPCPRCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVQFKWYVDGVEVHNAKTKPREEQYNSTFRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKTKGQPREPQVYTLPPSREEMTKNQVSLTCLVKGFYPSDIAVEWESSGQPENNYNTTPPMLDSDGSFFLYSKLTVDKSRWQQGNIFSCSVMHEALHNRFTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIIPDYRNMIGQGA 9 | >IGHG4 IGH HOMO_SAPIENS 10 | ASTKGPSVFPLAPCSRSTSESTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTKTYTCNVDHKPSNTKVDKRVESKYGPPCPSCPAPEFLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSQEDPEVQFNWYVDGVEVHNAKTKPREEQFNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKGLPSSIEKTISKAKGQPREPQVYTLPPSQEEMTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSRLTVDKSRWQEGNVFSCSVMHEALHNHYTQKSLSLSLELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIVPDYRNMIRQGA 11 | >IGHA1 IGH HOMO_SAPIENS 12 | ASPTSPKVFPLSLCSTQPDGNVVIACLVQGFFPQEPLSVTWSESGQGVTARNFPPSQDASGDLYTTSSQLTLPATQCLAGKSVTCHVKHYTNPSQDVTVPCPVPSTPPTPSPSTPPTPSPSCCHPRLSLHRPALEDLLLGSEANLTCTLTGLRDASGVTFTWTPSSGKSAVQGPPERDLCGCYSVSSVLPGCAEPWNHGKTFTCTAAYPESKTPLTATLSKSGNTFRPEVHLLPPPSEELALNELVTLTCLARGFSPKDVLVRWLQGSQELPREKYLTWASRQEPSQGTTTFAVTSILRVAAEDWKKGDTFSCMVGHEALPLAFTQKTIDRLADWQMPPPYVVLDLPQETLEEETPGANLWPTTITFLTLFLLSLFYSTALTVTSVRGPSGNREGPQY 13 | >IGHA2 IGH HOMO_SAPIENS 14 | 
ASPTSPKVFPLSLDSTPQDGNVVVACLVQGFFPQEPLSVTWSESGQNVTARNFPPSQDASGDLYTTSSQLTLPATQCPDGKSVTCHVKHYTNSSQDVTVPCRVPPPPPCCHPRLSLHRPALEDLLLGSEANLTCTLTGLRDASGATFTWTPSSGKSAVQGPPERDLCGCYSVSSVLPGCAQPWNHGETFTCTAAHPELKTPLTANITKSGNTFRPEVHLLPPPSEELALNELVTLTCLARGFSPKDVLVRWLQGSQELPREKYLTWASRQEPSQGTTTYAVTSILRVAAEDWKKGETFSCMVGHEALPLAFTQKTIDRMAGSCCVADWQMPPPYVVLDLPQETLEEETPGANLWPTTITFLTLFLLSLFYSTALTVTSVRGPSGKREGPQY 15 | >IGHD IGH HOMO_SAPIENS 16 | APTKAPDVFPIISGCRHPKDNSPVVLACLITGYHPTSVTVTWYMGTQSQPQRTFPEIQRRDSYYMTSSQLSTPLQQWRQGEYKCVVQHTASKSKKEIFRWPESPKAQASSVPTAQPQAEGSLAKATTAPATTRNTGRGGEEKKKEKEKEEQEERETKTPECPSHTQPLGVYLLTPAVQDLWLRDKATFTCFVVGSDLKDAHLTWEVAGKVPTGGVEEGLLERHSNGSQSQHSRLTLPRSLWNAGTSVTCTLNHPSLPPQRLMALREPAAQAPVKLSLNLLASSDPPEAASWLLCEVSGFSPPNILLMWLEDQREVNTSGFAPARPPPQPRSTTFWAWSVLRVPAPPSPQPATYTCVVSHEDSRTLLNASRSLEVSYLAMTPLIPQSKDENSDDYTTFDDVGSLWTTLSTFVALFILTLLYSGIVTFIKVK 17 | >IGHE IGH HOMO_SAPIENS 18 | ASTQSPSVFPLTRCCKNIPSNATSVTLGCLATGYFPEPVMVTWDTGSLNGTTMTLPATTLTLSGHYATISLLTVSGAWAKQMFTCRVAHTPSSTDWVDNKTFSVCSRDFTPPTVKILQSSCDGGGHFPPTIQLLCLVSGYTPGTINITWLEDGQVMDVDLSTASTTQEGELASTQSELTLSQKHWLSDRTYTCQVTYQGHTFEDSTKKCADSNPRGVSAYLSRPSPFDLFIRKSPTITCLVVDLAPSKGTVNLTWSRASGKPVNHSTRKEEKQRNGTLTVTSTLPVGTRDWIEGETYQCRVTHPHLPRALMRSTTKTSGPRAAPEVYAFATPEWPGSRDKRTLACLIQNFMPEDISVQWLHNEVQLPDARHSTTQPRKTKGSGFFVFSRLEVTRAEWEQKDEFICRAVHEAASPSQTVQRAVSVNPGLAGGSAQSQRAPDRVLCHSGQQQGLPRAAGGSVPHPRCHCGAGRADWPGPPELDVCVEEAEGEAPWTWTGLCIFAALFLLSVSYSAAITLLMVQRFLSATRQGRPQTSLDYTNVLQPHA -------------------------------------------------------------------------------- /riot_na/alignment/skbio_alignment.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from skbio.alignment import StripedSmithWaterman # type: ignore 4 | 5 | from riot_na.alignment.alignment_metrics import ( 6 | GumbellParams, 7 | calculate_seq_identity, 8 | compute_bit_score, 9 | compute_evalue, 10 | ) 11 | from riot_na.data.model import Cigar, InternalAlignmentEntry, InternalAlignmentEntryAA 12 | 13 | 
# Extracted aligner call for profiling purposes.
def _align(aligner, target):
    """Call ``aligner`` on ``target`` and return its result unchanged."""
    return aligner(target)


def align(
    aligner: StripedSmithWaterman,
    target_id: str,
    target: str,
    db_length: int,
    query: str,
    rev_comp: bool = False,
) -> InternalAlignmentEntry:
    """Align ``target`` with a prepared aligner and package the scored result.

    Args:
        aligner: Callable aligner; it is invoked with ``target`` only, so the
            query is expected to be bound to it already.
        target_id: Identifier stored on the returned entry.
        target: Target sequence to align against.
        db_length: Database length passed to the e-value computation.
        query: Query sequence; used for its length, for sequence identity and
            stored on the returned entry.
        rev_comp: Stored on the returned entry as-is.

    Returns:
        An ``InternalAlignmentEntry`` with half-open ``[start, end)`` query and
        target coordinates, bit score, e-value, identity and CIGAR.
    """
    res = _align(aligner, target)
    raw_score = res["optimal_alignment_score"]
    bit_score = compute_bit_score(raw_score)
    e_value = compute_evalue(len(query), db_length, raw_score)

    # Need to add 1 to the end coordinates because of:
    # https://github.com/biocore/scikit-bio/issues/1340
    q_start = res["query_begin"]
    q_end = res["query_end"] + 1
    t_start = res["target_begin"]
    t_end = res["target_end_optimal"] + 1

    return InternalAlignmentEntry(
        target_id=target_id,
        alignment_score=bit_score,
        seq_identity=calculate_seq_identity(res["cigar"], query, target, q_start, t_start),
        e_value=e_value,
        q_start=q_start,
        q_end=q_end,
        q_len=q_end - q_start,
        t_start=t_start,
        t_end=t_end,
        t_len=t_end - t_start,
        cigar=Cigar(res["cigar"]),
        query=query,
        rev_comp=rev_comp,
    )


def align_aa(
    aligner: StripedSmithWaterman,
    query: str,
    target_id: str,
    target: str,
    db_length: Optional[int] = None,
    calculate_score: bool = True,
    extend_c_terminus: bool = False,
    extend_n_terminus: bool = False,
) -> InternalAlignmentEntryAA:
    """Align an amino-acid ``target`` and optionally stretch the alignment to the termini.

    Args:
        aligner: Callable aligner; it is invoked with ``target`` only.
        query: Query sequence; used for its length and for sequence identity.
        target_id: Identifier stored on the returned entry.
        target: Target sequence to align against.
        db_length: Database length for the e-value; required when
            ``calculate_score`` is true.
        calculate_score: When true, compute bit score, e-value and identity;
            otherwise all three are ``None`` on the returned entry.
        extend_c_terminus: Append match columns after the local alignment for
            as many residues as both sequences still have.
        extend_n_terminus: Prepend match columns before the local alignment
            for as many residues as both sequences have in front of it.

    Returns:
        An ``InternalAlignmentEntryAA`` with half-open ``[start, end)``
        coordinates and the (possibly extended) CIGAR.

    Raises:
        AssertionError: If ``calculate_score`` is true and ``db_length`` is
            ``None`` or either extension flag is set.
    """
    # Go through the same hook as `align` so profiling sees both code paths.
    res = _align(aligner, target)

    # End coordinates need +1, see https://github.com/biocore/scikit-bio/issues/1340
    q_start = res["query_begin"]
    q_end = res["query_end"] + 1
    t_start = res["target_begin"]
    t_end = res["target_end_optimal"] + 1

    if calculate_score:
        assert db_length is not None
        assert extend_c_terminus is False, "Score calculation not supported for extended C-terminus"
        assert extend_n_terminus is False, "Score calculation not supported for extended N-terminus"
        bit_score = compute_bit_score(res["optimal_alignment_score"], gumbell_params=GumbellParams.AA)
        # NOTE(review): the bit score uses GumbellParams.AA but the e-value call
        # passes no parameters -- confirm compute_evalue's default is right for AA.
        e_value = compute_evalue(len(query), db_length, res["optimal_alignment_score"])
        seq_identity = calculate_seq_identity(res["cigar"], query, target, q_start, t_start) if query else 0
    else:
        bit_score = None
        e_value = None
        seq_identity = None

    fixed_cigar = Cigar(res["cigar"])

    if extend_n_terminus:
        n_term_number_of_matches = min(q_start, t_start)
        # Nothing to prepend when the alignment already starts at position 0
        # of either sequence; avoid emitting a zero-length "0M" operation.
        if n_term_number_of_matches > 0:
            fixed_cigar = Cigar(f"{n_term_number_of_matches}M" + fixed_cigar)
            q_start = q_start - n_term_number_of_matches
            t_start = t_start - n_term_number_of_matches

    if extend_c_terminus:
        c_term_number_of_matches = min(len(query) - q_end, len(target) - t_end)
        # Same guard as above for an alignment that already reaches an end.
        if c_term_number_of_matches > 0:
            fixed_cigar = Cigar(fixed_cigar + f"{c_term_number_of_matches}M")
            q_end = q_end + c_term_number_of_matches
            t_end = t_end + c_term_number_of_matches

    return InternalAlignmentEntryAA(
        target_id=target_id,
        alignment_score=bit_score,
        seq_identity=seq_identity,
        e_value=e_value,
        q_start=q_start,
        q_end=q_end,
        t_start=t_start,
        t_end=t_end,
        cigar=fixed_cigar,
    )
output_file.write(f\">{row.description}\\n\")\n", 19 | " output_file.write(f\"{row.sequence}\\n\")\n", 20 | "\n", 21 | "\n", 22 | "def deduplicate_genes(input_path) -> tuple[pd.DataFrame, pd.DataFrame]:\n", 23 | " df = pd.DataFrame.from_records(\n", 24 | " (\n", 25 | " {\"allele_id\": record.id, \"description\": record.description, \"sequence\": str(record.seq)}\n", 26 | " for record in SeqIO.parse(input_path, \"fasta\")\n", 27 | " )\n", 28 | " )\n", 29 | " df[\"allele\"] = df[\"allele_id\"].str.split(\"*\").str[1]\n", 30 | " df[\"gene_id\"] = df[\"allele_id\"].str.split(\"*\").str[0]\n", 31 | "\n", 32 | " df = df.sort_values([\"gene_id\", \"allele\"])\n", 33 | " deduplicated_df = df.drop_duplicates(subset=[\"sequence\"])\n", 34 | " first_allele_df = deduplicated_df.groupby(\"gene_id\").first()\n", 35 | " return deduplicated_df, first_allele_df\n", 36 | "\n", 37 | "\n", 38 | "AA_GENES_DIR = GENE_DB_DIR / \"gene_db\" / \"aa_genes\"\n", 39 | "OUTPUT_GENES_DEDUP_DIR = GENE_DB_DIR / \"gene_db\" / \"aa_genes_deduplicated\"\n", 40 | "OUTPUT_GENES_FIRST_ALLELE_DIR = GENE_DB_DIR / \"gene_db\" / \"aa_genes_first_allele\"\n", 41 | "\n", 42 | "\n", 43 | "for organism in [\"human\", \"mouse\"]:\n", 44 | "\n", 45 | " input_path = AA_GENES_DIR / \"v_genes\" / f\"{organism}.fasta\"\n", 46 | "\n", 47 | " deduplicated_df, first_allele_df = deduplicate_genes(input_path)\n", 48 | " output_dir = OUTPUT_GENES_DEDUP_DIR / \"v_genes\"\n", 49 | " output_dir.mkdir(exist_ok=True, parents=True)\n", 50 | " df_to_fasta(deduplicated_df, output_dir / f\"{organism}.fasta\")\n", 51 | " output_dir = OUTPUT_GENES_FIRST_ALLELE_DIR / \"v_genes\"\n", 52 | " output_dir.mkdir(exist_ok=True, parents=True)\n", 53 | " df_to_fasta(first_allele_df, output_dir / f\"{organism}.fasta\")\n", 54 | "\n", 55 | "with (\n", 56 | " open(OUTPUT_GENES_DEDUP_DIR / \"v_genes\" / \"human.fasta\") as human_file,\n", 57 | " open(OUTPUT_GENES_DEDUP_DIR / \"v_genes\" / \"mouse.fasta\") as mouse_file,\n", 58 | " 
GRID_SEARCH_PARAMS = ["top", "kmer_size", "distance_threshold", "modulo", "gap_open", "gap_ext", "x_drop"]


def parse_file_name(path, params=None):
    """Extract grid-search parameter values from a result file name.

    The file stem is expected to contain ``<param>_<digits>`` for every entry
    of ``params``, in order, joined by underscores.

    Args:
        path: Path-like object with a ``stem`` attribute.
        params: Parameter names to look for; defaults to ``GRID_SEARCH_PARAMS``.

    Returns:
        A tuple of the captured digit strings, one per parameter.

    Raises:
        ValueError: If the stem does not contain the expected pattern.
    """
    if params is None:
        params = GRID_SEARCH_PARAMS
    regex = "_".join([f"{param}_([0-9]*)" for param in params])
    match = re.search(regex, path.stem)
    if match is None:
        # Previously this surfaced as an AttributeError on None.
        raise ValueError(f"File name {path.stem!r} does not match grid search pattern {regex!r}")
    return match.groups()


def _extract_gene_match(matches_string, top_n, with_rev_comp):
    """Decode a JSON list of matches and return the identifiers of the first ``top_n``.

    Returns a set of ``gene_id`` values, or of ``(gene_id, rev_comp)`` tuples
    when ``with_rev_comp`` is true.
    """
    matches = json.loads(matches_string)[:top_n]
    if with_rev_comp:
        return {(match["gene_id"], match["rev_comp"]) for match in matches}
    return {match["gene_id"] for match in matches}


def _agg_grid_search_worker(path, params, top_n, ground_truth_df, with_rev_comp):
    """Compute the fraction of sequences whose true target is among the top matches.

    Reads ``sequence_id`` and ``best_genes`` from the CSV at ``path``, joins
    them with ``ground_truth_df`` on the index and checks, row by row, whether
    ``target_id`` is contained in the extracted best genes.

    Returns:
        A dict of the parameters parsed from the file name plus
        ``"match_percent"`` (a fraction between 0 and 1; NaN when the join is
        empty).
    """
    df = pd.read_csv(path, index_col=0, usecols=["sequence_id", "best_genes"])
    df["best_genes"] = df["best_genes"].apply(partial(_extract_gene_match, top_n=top_n, with_rev_comp=with_rev_comp))
    df = df.join(ground_truth_df, rsuffix="_true", how="inner")
    # NOTE(review): with with_rev_comp=True the sets hold (gene_id, rev_comp)
    # tuples, so target_id must be such a tuple to ever match -- confirm the
    # shape of ground_truth_df["target_id"] in that mode.
    df["is_match"] = [target_id in best_genes for target_id, best_genes in zip(df["target_id"], df["best_genes"])]
    # The mean of a boolean column is the share of True values. The previous
    # value_counts(normalize=True).loc[True] raised KeyError when nothing matched.
    match_percent = df["is_match"].mean()
    return {**dict(zip(params, parse_file_name(path, params))), "match_percent": match_percent}


def aggregate_grid_search_results(results_dir, params, top_n, ground_truth_df, with_rev_comp, *, n_processes=4):
    """Aggregate match rates over every ``top_*`` result file in ``results_dir``.

    Args:
        results_dir: Directory (``Path``) containing the result files.
        params: Parameter names encoded in the file names.
        top_n: How many best matches per sequence count as a hit; also written
            to the ``"top"`` column of the result.
        ground_truth_df: Frame with the expected ``target_id`` per sequence.
        with_rev_comp: Forwarded to the per-file worker.
        n_processes: Size of the worker pool (defaults to the previous
            hard-coded value of 4).

    Returns:
        A DataFrame with one row per file: the integer parameter values,
        ``match_percent`` and ``top``.
    """
    worker_partial = partial(
        _agg_grid_search_worker,
        params=params,
        top_n=top_n,
        ground_truth_df=ground_truth_df,
        with_rev_comp=with_rev_comp,
    )
    paths = list(results_dir.glob("top_*"))
    with mp.Pool(n_processes) as pool:
        res = list(tqdm(pool.imap(worker_partial, paths), total=len(paths)))
    res_df = pd.DataFrame.from_records(res)
    res_df["top"] = top_n
    res_df[params] = res_df[params].astype(int)
    return res_df


def format_cigar(cigar: str, query: str, target: str):
    """Render a CIGAR string as a marker line and gap-padded sequences.

    Marker characters: ``|`` for a match, ``X`` for a mismatch, ``+`` for an
    insertion (gaps are inserted into ``target``), ``-`` for a deletion (gaps
    are inserted into ``query``) and ``?`` for any other operation.

    Returns:
        ``(markers, gapped_query, gapped_target)``.
    """
    markers = []
    # Column index into the progressively gap-padded query and target.
    pos = 0
    for cnt, operation in get_cigar_op_groups(cigar):
        if operation == "M":
            # "M" does not distinguish match from mismatch, so compare residues.
            markers.extend("|" if query[pos + i] == target[pos + i] else "X" for i in range(cnt))
        elif operation == "=":
            markers.append(cnt * "|")
        elif operation == "X":
            markers.append(cnt * "X")
        elif operation == "I":
            target = target[:pos] + cnt * "-" + target[pos:]
            markers.append(cnt * "+")
        elif operation == "D":
            query = query[:pos] + cnt * "-" + query[pos:]
            markers.append(cnt * "-")
        else:
            markers.append(cnt * "?")
        pos += cnt
    return "".join(markers), query, target


def calculate_cigar_op_sum(cigar: str) -> int:
    """Return the total length of all operations in ``cigar``."""
    return sum(count for count, _ in get_cigar_op_groups(cigar))


def display_riot_alignment(row):
    """Print a human-readable view of one alignment result row.

    ``row`` is indexed by column name and must provide ``rev_comp``,
    ``t_start``, ``t_end``, ``q_start``, ``q_end``, ``seq_identity``,
    ``bit_score``, ``e_value``, ``target``, ``sequence``, ``cigar`` and
    ``target_id``, plus a ``name`` attribute used as the ID.
    """
    print(f"ID: {row.name}")
    print(f"rev_comp: {row['rev_comp']}")
    print()

    t_start = row["t_start"]
    t_end = row["t_end"]
    q_start = row["q_start"]
    q_end = row["q_end"]
    seq_identity = row["seq_identity"]
    bit_score = row["bit_score"]
    e_value = row["e_value"]
    # Only the aligned slices are displayed.
    target = row["target"][t_start:t_end]
    query = row["sequence"][q_start:q_end]
    cigar = row["cigar"]
    cigar_op_sum = calculate_cigar_op_sum(cigar)

    print(f"Riot: {row['target_id']}")
    print(f"{t_start=}, {t_end=}, {len(target)=}, {q_start=}, {q_end=}, {len(query)=}, {cigar=}, {cigar_op_sum=}")
    print(f"{seq_identity=}")
    print(f"{bit_score=}")
    print(f"{e_value=}")
    cigar_exploded, query, target = format_cigar(cigar, query, target)

    print("Target: ", target)
    print(" ", cigar_exploded)
    print("Query: ", query)
def test_e2e_nucleotides():
    """Run nucleotide numbering end to end for every scheme and for a two-domain input."""
    numbering = get_or_create_riot_nt()
    expected_chains = (("H", ChainType.HEAVY), ("L", ChainType.LIGHT))

    # Single-domain mode: every scheme must yield V/J calls and the right chain.
    for scheme in Scheme:
        for key, expected_chain in expected_chains:
            airr = numbering.run_on_sequence("", SEQUENCES_NT[key], scheme=scheme)
            assert airr.v_call
            assert airr.j_call
            assert ChainType.from_locus(Locus(airr.locus)) == expected_chain

    # Multi-domain mode: heavy and light sequences joined by a linker must come
    # back as two entries, heavy first.
    multi_domain_numbering = get_or_create_riot_nt(return_all_domains=True)
    joined_sequence = SEQUENCES_NT["H"] + "GGGGGGSGGGG" + SEQUENCES_NT["L"]
    airrs_list = multi_domain_numbering.run_on_sequence("", joined_sequence, scheme=Scheme.IMGT)
    for index, expected_chain in enumerate((ChainType.HEAVY, ChainType.LIGHT)):
        entry = airrs_list[index]
        assert entry.v_call
        assert entry.j_call
        assert ChainType.from_locus(Locus(entry.locus)) == expected_chain

    # The segmented entries must also be writable as CSV.
    with TemporaryDirectory() as temp_dir:
        output_path = Path(temp_dir) / "output.csv"
        write_airr_iter_to_csv(output_path, SegmentedAirrRearrangementEntryNT, airrs_list)


def test_e2e_amino_acids():
    """Run amino-acid numbering end to end for every scheme on a heavy and a light chain."""
    numbering = get_or_create_riot_aa()
    expected_chains = (("H", ChainType.HEAVY), ("L", ChainType.LIGHT))

    for scheme in Scheme:
        for key, expected_chain in expected_chains:
            airr = numbering.run_on_sequence("", SEQUENCES_AA[key], scheme=scheme)
            assert airr.v_call
            assert airr.j_call
            assert ChainType.from_locus(Locus(airr.locus)) == expected_chain
test_e2e_amino_acids() 74 | print("All tests passed!") 75 | -------------------------------------------------------------------------------- /riot_na/cli.py: -------------------------------------------------------------------------------- 1 | import time 2 | from pathlib import Path 3 | from typing import Any, Optional 4 | 5 | import click 6 | import psutil 7 | 8 | from riot_na.api.api_mp import run_on_file_mp 9 | from riot_na.api.riot_numbering import create_riot_aa, create_riot_nt 10 | from riot_na.common.io import write_airr_iter_to_csv 11 | from riot_na.config import GENE_DB_DIR 12 | from riot_na.data.model import ( 13 | AirrRearrangementEntryAA, 14 | AirrRearrangementEntryNT, 15 | InputType, 16 | Organism, 17 | Scheme, 18 | SegmentedAirrRearrangementEntryAA, 19 | SegmentedAirrRearrangementEntryNT, 20 | ) 21 | 22 | 23 | @click.command() 24 | @click.option("-f", "--input-file", type=Path, help="Path to input FASTA file.") 25 | @click.option("-s", "--sequence", type=str, help="Input sequence file.") 26 | @click.option("-o", "--output-file", type=Path, help="Path to output CSV file. If not specified, stdout is used.") 27 | @click.option( 28 | "--scheme", 29 | type=click.Choice(Scheme), # type: ignore 30 | default=Scheme.IMGT, 31 | help="Which numbering scheme should be used: imgt, kabat, chothia, martin. Default IMGT", 32 | ) 33 | @click.option( 34 | "--species", 35 | type=click.Choice(Organism), # type: ignore 36 | default=None, 37 | help="Which species germline sequences should be used. Default is all species.", 38 | ) 39 | @click.option( 40 | "--input-type", 41 | type=click.Choice(InputType), # type: ignore 42 | default=InputType.NT, 43 | help="What kind of sequences are provided on input. Default is nucleotide sequences.", 44 | ) 45 | @click.option( 46 | "-p", 47 | "--ncpu", 48 | type=int, 49 | default=psutil.cpu_count(logical=False), 50 | help="Number of parallel processes to use. 
Default is number of physical cores.", 51 | ) 52 | @click.option( 53 | "-e", 54 | "--extend_alignment", 55 | type=bool, 56 | default=False, 57 | help=( 58 | "Include unaligned beginning of the query sequence in numbering." 59 | "This option impacts only amino acid sequences." 60 | ), 61 | ) 62 | @click.option( 63 | "--multiple-domains", 64 | type=bool, 65 | default=False, 66 | help=("Return all domains of multiple domain proteins."), 67 | ) 68 | def run_riot( 69 | input_file: Optional[Path], 70 | sequence: Optional[str], 71 | output_file: Optional[Path], 72 | scheme: Scheme, 73 | species: Optional[Organism], 74 | input_type: InputType, 75 | ncpu: int, 76 | extend_alignment: bool, 77 | multiple_domains: bool, 78 | ): 79 | species_list = (species,) if species else None 80 | if input_file and input_file.exists(): 81 | if not output_file: 82 | output_file = Path() / "numbering_result.csv" 83 | start = time.perf_counter() 84 | run_on_file_mp( 85 | GENE_DB_DIR, 86 | input_file, 87 | output_file, 88 | input_format="fasta", 89 | scheme=scheme, 90 | allowed_species=species_list, 91 | n_processes=ncpu, 92 | input_type=input_type, 93 | return_all_domains=multiple_domains, 94 | ) 95 | end = time.perf_counter() 96 | elapsed_time = end - start 97 | print("Execution time:", time.strftime("%H:%M:%S", time.gmtime(elapsed_time))) 98 | elif sequence: 99 | result: Any 100 | record_type: Any 101 | match input_type: 102 | case InputType.NT: 103 | numbering_nt = create_riot_nt( 104 | allowed_species=species_list, db_dir=GENE_DB_DIR, return_all_domains=multiple_domains 105 | ) 106 | record_type = AirrRearrangementEntryNT if not multiple_domains else SegmentedAirrRearrangementEntryNT 107 | result = numbering_nt.run_on_sequence(header="-", query_sequence=sequence, scheme=scheme) 108 | case InputType.AA: 109 | numbering_aa = create_riot_aa( 110 | allowed_species=species_list, db_dir=GENE_DB_DIR, return_all_domains=multiple_domains 111 | ) 112 | record_type = AirrRearrangementEntryAA if not 
multiple_domains else SegmentedAirrRearrangementEntryAA 113 | result = numbering_aa.run_on_sequence( 114 | header="-", query_sequence=sequence, scheme=scheme, extend_alignment=extend_alignment 115 | ) 116 | 117 | if output_file: 118 | write_airr_iter_to_csv(output_file, record_type, result if not multiple_domains else [result]) 119 | else: 120 | if not multiple_domains: 121 | print(result.__dict__) 122 | else: 123 | for single_result in result: 124 | print(single_result.__dict__) 125 | 126 | else: 127 | print("Need to specify input sequence or FASTA file!") 128 | -------------------------------------------------------------------------------- /riot_na/schemes/region_offsets.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | from riot_na.data.model import ( 4 | ChainType, 5 | RegionOffsetsAA, 6 | RegionOffsetsNT, 7 | Scheme, 8 | SchemeAlignment, 9 | ShortRegion, 10 | ) 11 | from riot_na.data.scheme_regions import get_region 12 | 13 | 14 | def infer_region_offsets( 15 | scheme_alignment: SchemeAlignment, scheme: Scheme, chain_type: ChainType 16 | ) -> dict[ShortRegion, list[int]]: 17 | result: dict[ShortRegion, list[int]] = {} 18 | 19 | scheme_position = 0 20 | query_position = scheme_alignment.q_start or 0 21 | insertion_counter = 0 22 | alignment_str = scheme_alignment.alignment_str 23 | 24 | region_id = ShortRegion.FW1 25 | 26 | for op in alignment_str: 27 | if op == "M": 28 | insertion_counter = 0 29 | query_position = query_position + 1 30 | scheme_position = scheme_position + 1 31 | region_id = get_region(scheme_position, scheme, chain_type) 32 | 33 | residues = result.get(region_id, []) 34 | residues.append(query_position) 35 | result[region_id] = residues 36 | 37 | elif op in {"D", "N"}: 38 | insertion_counter = 0 39 | scheme_position = scheme_position + 1 40 | region_id = get_region(scheme_position, scheme, chain_type) 41 | elif op == "I": 42 | insertion_counter = insertion_counter + 1 
def get_first_aa(region_positions: Optional[list[int]]) -> int:
    """Return the first amino-acid position of a region, or -1 if it is missing or empty."""
    if not region_positions:
        return -1

    return region_positions[0]


def get_last_aa(region_positions: Optional[list[int]]) -> int:
    """Return the last amino-acid position of a region, or -1 if it is missing or empty."""
    if not region_positions:
        return -1

    return region_positions[-1]


def infer_aa_region_offsets(aa_regions: dict[ShortRegion, list[int]]) -> RegionOffsetsAA:
    """Build amino-acid start/end offsets for all seven regions.

    Args:
        aa_regions: Query positions per region; regions may be absent.

    Returns:
        A ``RegionOffsetsAA`` where each start/end is the first/last position
        of the region, or -1 when the region has no positions.
    """
    fw1 = aa_regions.get(ShortRegion.FW1)
    cdr1 = aa_regions.get(ShortRegion.CDR1)
    fw2 = aa_regions.get(ShortRegion.FW2)
    cdr2 = aa_regions.get(ShortRegion.CDR2)
    fw3 = aa_regions.get(ShortRegion.FW3)
    cdr3 = aa_regions.get(ShortRegion.CDR3)
    fw4 = aa_regions.get(ShortRegion.FW4)

    return RegionOffsetsAA(
        fwr1_start_aa=get_first_aa(fw1),
        fwr1_end_aa=get_last_aa(fw1),
        cdr1_start_aa=get_first_aa(cdr1),
        cdr1_end_aa=get_last_aa(cdr1),
        fwr2_start_aa=get_first_aa(fw2),
        fwr2_end_aa=get_last_aa(fw2),
        cdr2_start_aa=get_first_aa(cdr2),
        cdr2_end_aa=get_last_aa(cdr2),
        fwr3_start_aa=get_first_aa(fw3),
        fwr3_end_aa=get_last_aa(fw3),
        cdr3_start_aa=get_first_aa(cdr3),
        cdr3_end_aa=get_last_aa(cdr3),
        fwr4_start_aa=get_first_aa(fw4),
        fwr4_end_aa=get_last_aa(fw4),
    )


def get_first_nt(region: Optional[list[int]], offset: int) -> int:
    """Return the nucleotide position where a region starts, or -1 if it is missing or empty.

    Amino-acid position ``p`` starts at nucleotide ``3 * p - 2``; ``offset``
    is added on top of that.
    """
    # Treat an empty list like a missing region, matching get_first_aa,
    # instead of failing with IndexError.
    if not region:
        return -1

    return region[0] * 3 - 2 + offset


def get_last_nt(region: Optional[list[int]], offset: int) -> int:
    """Return the nucleotide position where a region ends, or -1 if it is missing or empty.

    Amino-acid position ``p`` ends at nucleotide ``3 * p``; ``offset`` is
    added on top of that.
    """
    # Treat an empty list like a missing region, matching get_last_aa.
    if not region:
        return -1

    return region[-1] * 3 + offset
dict[ShortRegion, list[int]], nt_alignment_start: int = 0, reading_frame: int = 0 100 | ) -> RegionOffsetsNT: 101 | # nt 1 2 3 4 5 6 7 102 | # pt 1 2 103 | # start | 104 | # end | 105 | offset = nt_alignment_start + reading_frame 106 | 107 | return RegionOffsetsNT( 108 | fwr1_start=get_first_nt(aa_regions.get(ShortRegion.FW1), offset), 109 | fwr1_end=get_last_nt(aa_regions.get(ShortRegion.FW1), offset), 110 | cdr1_start=get_first_nt(aa_regions.get(ShortRegion.CDR1), offset), 111 | cdr1_end=get_last_nt(aa_regions.get(ShortRegion.CDR1), offset), 112 | fwr2_start=get_first_nt(aa_regions.get(ShortRegion.FW2), offset), 113 | fwr2_end=get_last_nt(aa_regions.get(ShortRegion.FW2), offset), 114 | cdr2_start=get_first_nt(aa_regions.get(ShortRegion.CDR2), offset), 115 | cdr2_end=get_last_nt(aa_regions.get(ShortRegion.CDR2), offset), 116 | fwr3_start=get_first_nt(aa_regions.get(ShortRegion.FW3), offset), 117 | fwr3_end=get_last_nt(aa_regions.get(ShortRegion.FW3), offset), 118 | cdr3_start=get_first_nt(aa_regions.get(ShortRegion.CDR3), offset), 119 | cdr3_end=get_last_nt(aa_regions.get(ShortRegion.CDR3), offset), 120 | fwr4_start=get_first_nt(aa_regions.get(ShortRegion.FW4), offset), 121 | fwr4_end=get_last_nt(aa_regions.get(ShortRegion.FW4), offset), 122 | ) 123 | -------------------------------------------------------------------------------- /riot_na/data/scheme_regions.py: -------------------------------------------------------------------------------- 1 | from functools import cache 2 | from typing import Optional 3 | 4 | from riot_na.common.assert_never import assert_never 5 | from riot_na.data.model import ChainType, Scheme, ShortRegion 6 | from riot_na.data.scheme_definitions import ( 7 | CHOTHIA_REGIONS, 8 | IMGT_REGIONS, 9 | KABAT_REGIONS, 10 | MARTIN_REGIONS, 11 | ChainRegions, 12 | Regions, 13 | ) 14 | 15 | 16 | class UnknownPosition(Exception): 17 | pass 18 | 19 | 20 | @cache 21 | def get_regions_by_scheme_and_chain(scheme: Scheme, chain: ChainType) -> 
ChainRegions: 22 | """ 23 | Get the region of a given position in a specified scheme entry. 24 | """ 25 | match scheme: 26 | case Scheme.KABAT: 27 | match chain: 28 | case ChainType.HEAVY: 29 | return KABAT_REGIONS[chain] 30 | case ChainType.LIGHT: 31 | return KABAT_REGIONS[chain] 32 | assert_never(chain) 33 | case Scheme.IMGT: 34 | match chain: 35 | case ChainType.HEAVY: 36 | return IMGT_REGIONS[chain] 37 | case ChainType.LIGHT: 38 | return IMGT_REGIONS[chain] 39 | assert_never(chain) 40 | case Scheme.CHOTHIA: 41 | match chain: 42 | case ChainType.HEAVY: 43 | return CHOTHIA_REGIONS[chain] 44 | case ChainType.LIGHT: 45 | return CHOTHIA_REGIONS[chain] 46 | assert_never(chain) 47 | case Scheme.MARTIN: 48 | match chain: 49 | case ChainType.HEAVY: 50 | return MARTIN_REGIONS[chain] 51 | case ChainType.LIGHT: 52 | return MARTIN_REGIONS[chain] 53 | assert_never(chain) 54 | assert_never(scheme) 55 | 56 | 57 | def position_to_region(regions: ChainRegions, position: int) -> Optional[ShortRegion]: 58 | if regions[ShortRegion.FW1]["min"] <= position <= regions[ShortRegion.FW1]["max"]: 59 | return ShortRegion.FW1 60 | if regions[ShortRegion.CDR1]["min"] <= position <= regions[ShortRegion.CDR1]["max"]: 61 | return ShortRegion.CDR1 62 | if regions[ShortRegion.FW2]["min"] <= position <= regions[ShortRegion.FW2]["max"]: 63 | return ShortRegion.FW2 64 | if regions[ShortRegion.CDR2]["min"] <= position <= regions[ShortRegion.CDR2]["max"]: 65 | return ShortRegion.CDR2 66 | if regions[ShortRegion.FW3]["min"] <= position <= regions[ShortRegion.FW3]["max"]: 67 | return ShortRegion.FW3 68 | if regions[ShortRegion.CDR3]["min"] <= position <= regions[ShortRegion.CDR3]["max"]: 69 | return ShortRegion.CDR3 70 | if regions[ShortRegion.FW4]["min"] <= position <= regions[ShortRegion.FW4]["max"]: 71 | return ShortRegion.FW4 72 | 73 | return None 74 | 75 | 76 | def get_region(scheme_position: int, scheme: Scheme, chain_type: ChainType) -> ShortRegion: 77 | """ 78 | Based on scheme position, get the 
region 79 | """ 80 | region_mapping = get_regions_by_scheme_and_chain(scheme, chain_type) 81 | 82 | region = position_to_region(region_mapping, scheme_position) 83 | 84 | if region: 85 | return region 86 | 87 | raise UnknownPosition(f"Unknown position {scheme_position} for scheme {scheme} chain {chain_type}") 88 | 89 | 90 | def get_regions_definitions() -> dict[Scheme, Regions]: 91 | return { 92 | Scheme.IMGT: IMGT_REGIONS, 93 | Scheme.KABAT: KABAT_REGIONS, 94 | Scheme.CHOTHIA: CHOTHIA_REGIONS, 95 | Scheme.MARTIN: CHOTHIA_REGIONS, 96 | } 97 | 98 | 99 | @cache 100 | def get_cdr_ranges(scheme: Scheme, chain_type: ChainType) -> list[tuple[int, int]]: 101 | def _get_cdrs(regions: ChainRegions): 102 | return [ 103 | (regions[region]["min"], regions[region]["max"]) 104 | for region in [ShortRegion.CDR1, ShortRegion.CDR2, ShortRegion.CDR3] 105 | ] 106 | 107 | match (scheme, chain_type): 108 | case Scheme.IMGT, ChainType.HEAVY: 109 | return _get_cdrs(IMGT_REGIONS[chain_type]) 110 | case Scheme.IMGT, ChainType.LIGHT: 111 | return _get_cdrs(IMGT_REGIONS[chain_type]) 112 | case Scheme.KABAT, ChainType.HEAVY: 113 | return _get_cdrs(KABAT_REGIONS[chain_type]) 114 | case Scheme.KABAT, ChainType.LIGHT: 115 | return _get_cdrs(KABAT_REGIONS[chain_type]) 116 | case Scheme.CHOTHIA, ChainType.HEAVY: 117 | return _get_cdrs(CHOTHIA_REGIONS[chain_type]) 118 | case Scheme.CHOTHIA, ChainType.LIGHT: 119 | return _get_cdrs(CHOTHIA_REGIONS[chain_type]) 120 | case Scheme.MARTIN, ChainType.HEAVY: 121 | return _get_cdrs(MARTIN_REGIONS[chain_type]) 122 | case Scheme.MARTIN, ChainType.LIGHT: 123 | return _get_cdrs(MARTIN_REGIONS[chain_type]) 124 | 125 | raise ValueError(f"Unknown scheme {scheme} or chain {chain_type}") 126 | 127 | 128 | if __name__ == "__main__": 129 | print(list(get_cdr_ranges(Scheme.IMGT, ChainType.HEAVY))) 130 | -------------------------------------------------------------------------------- /tests/test_aa_alignments.py: 
-------------------------------------------------------------------------------- 1 | from riot_na.alignment.aa_gene_alignments import create_vjc_aligner_aa 2 | from riot_na.data.model import Locus, Organism 3 | 4 | 5 | def test_produce_aa_alignments(): 6 | # given 7 | aligner = create_vjc_aligner_aa(use_segment_aligner=True) 8 | 9 | query_sequence = "QVQLQQWGAGLLKPSETLSLTCAVFGGSFSGYYWSWIRQPPGKGLEWIGEINHRGNTNDNPSLKSRVTISVDTSKNQFALKLSSVTAADTAVYYCARERGYTYGNFDHWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK" 10 | 11 | # when 12 | alignments = aligner.produce_aa_alignments(query_sequence)[0] 13 | 14 | # then 15 | assert alignments.v is not None 16 | assert alignments.v.target_id == "IGHV4-34*01" 17 | assert alignments.v.alignment_score == 192.68417758360758 18 | assert alignments.v.seq_identity == 0.9484536082474226 19 | assert alignments.v.e_value == 2.2632803302266517e-141 20 | assert alignments.v.q_start == 0 21 | assert alignments.v.q_end == 97 22 | assert alignments.v.t_start == 0 23 | assert alignments.v.t_end == 97 24 | assert alignments.v.cigar == "97M" 25 | assert alignments.v.species == Organism.HOMO_SAPIENS 26 | assert alignments.v.locus == Locus.IGH 27 | assert ( 28 | alignments.v.q_seq 29 | == "QVQLQQWGAGLLKPSETLSLTCAVFGGSFSGYYWSWIRQPPGKGLEWIGEINHRGNTNDNPSLKSRVTISVDTSKNQFALKLSSVTAADTAVYYCARERGYTYGNFDHWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK" 30 | ) 31 | assert ( 32 | 
alignments.v.t_seq 33 | == "QVQLQQWGAGLLKPSETLSLTCAVYGGSFSGYYWSWIRQPPGKGLEWIGEINHSGSTNYNPSLKSRVTISVDTSKNQFSLKLSSVTAADTAVYYCAR" 34 | ) 35 | 36 | assert alignments.j is not None 37 | assert alignments.j.target_id == "IGHJ1*01" 38 | assert alignments.j.alignment_score == 32.01169295201648 39 | assert alignments.j.seq_identity == 0.9285714285714286 40 | assert alignments.j.e_value == 9.01920682236379e-18 41 | assert alignments.j.q_start == 8 42 | assert alignments.j.q_end == 22 43 | assert alignments.j.t_start == 3 44 | assert alignments.j.t_end == 17 45 | assert alignments.j.cigar == "14M" 46 | assert alignments.j.species == Organism.HOMO_SAPIENS 47 | assert alignments.j.locus == Locus.IGH 48 | assert ( 49 | alignments.j.q_seq 50 | == "ERGYTYGNFDHWGQGTLVTVSSASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK" 51 | ) 52 | assert alignments.j.t_seq == "AEYFQHWGQGTLVTVSS" 53 | 54 | assert alignments.c is not None 55 | assert alignments.c.target_id == "IGHG1" 56 | assert alignments.c.alignment_score == 674.7016314783808 57 | assert alignments.c.seq_identity == 1.0 58 | assert alignments.c.e_value == 0.0 59 | assert alignments.c.q_start == 0 60 | assert alignments.c.q_end == 328 61 | assert alignments.c.t_start == 0 62 | assert alignments.c.t_end == 328 63 | assert alignments.c.cigar == "328M" 64 | assert alignments.c.species == Organism.HOMO_SAPIENS 65 | assert alignments.c.locus == Locus.IGH 66 | assert ( 67 | alignments.c.q_seq 68 | == 
"ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPGK" 69 | ) 70 | assert ( 71 | alignments.c.t_seq 72 | == "ASTKGPSVFPLAPSSKSTSGGTAALGCLVKDYFPEPVTVSWNSGALTSGVHTFPAVLQSSGLYSLSSVVTVPSSSLGTQTYICNVNHKPSNTKVDKKVEPKSCDKTHTCPPCPAPELLGGPSVFLFPPKPKDTLMISRTPEVTCVVVDVSHEDPEVKFNWYVDGVEVHNAKTKPREEQYNSTYRVVSVLTVLHQDWLNGKEYKCKVSNKALPAPIEKTISKAKGQPREPQVYTLPPSRDELTKNQVSLTCLVKGFYPSDIAVEWESNGQPENNYKTTPPVLDSDGSFFLYSKLTVDKSRWQQGNVFSCSVMHEALHNHYTQKSLSLSPELQLEESCAEAQDGELDGLWTTITIFITLFLLSVCYSATVTFFKVKWIFSSVVDLKQTIIPDYRNMIGQGA" 73 | ) 74 | 75 | 76 | def test_produce_aa_alignments_therapeutic(): 77 | # given 78 | aligner = create_vjc_aligner_aa(use_segment_aligner=True) 79 | 80 | query_sequence = "EVQLVESGGGSVQPGGSLRLSCTASGFTISRSYWICWVRQAPGKGLEWVGCIYGDNDITPLYANWAKGRFTISRDTSKNTVYLQMNSLRAEDTATYYCARLGYADYAYDLWGQGTTVTVSS" 81 | 82 | # when 83 | alignments = aligner.produce_aa_alignments(query_sequence)[0] 84 | 85 | # then 86 | assert alignments.v is not None 87 | assert alignments.v.target_id == "IGHV3-66*01" 88 | 89 | 90 | if __name__ == "__main__": 91 | test_produce_aa_alignments() 92 | -------------------------------------------------------------------------------- /notebooks/final_benchmark/time_measurments.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Execution time comparison" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 18, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from pathlib import Path\n", 18 | "import time" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 23, 24 | 
"metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "def sec_to_hr_time(elapsed_time: float) -> str:\n", 28 | " return time.strftime(\"%H:%M:%S\", time.gmtime(elapsed_time))\n", 29 | "\n", 30 | "\n", 31 | "RESULTS_PATH = Path().absolute().parent.parent / \"results\"\n", 32 | "\n", 33 | "NT_SEQ_NUM = 392857\n", 34 | "AA_SEQ_NUM = 366412\n", 35 | "\n", 36 | "riot_nt_df = pd.read_csv(RESULTS_PATH / \"final_human_ngs_nt_elapsed.csv\", index_col=0)\n", 37 | "riot_aa_df = pd.read_csv(RESULTS_PATH / \"final_human_ngs_aa_elapsed.csv\", index_col=0)\n", 38 | "igblast_nt_df = pd.read_csv(\"/home/bartosz/Documents/igblast/results/final_human_ngs_nt_elapsed.csv\")\n", 39 | "anarci_aa_df = pd.read_csv(\"/home/bartosz/Documents/ANARCI/results/therapeutics_deduplicated_human/elapsed_time.csv\")" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## Nucleotides - RIOT vs IGBLAST" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 32, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | "
Human NGS - Nucleotides, sample size: 392857
 elapsed_time (HH:MM:SS)
riot00:06:28
igblast00:27:58
\n" 78 | ], 79 | "text/plain": [ 80 | "" 81 | ] 82 | }, 83 | "execution_count": 32, 84 | "metadata": {}, 85 | "output_type": "execute_result" 86 | } 87 | ], 88 | "source": [ 89 | "res = pd.DataFrame(\n", 90 | " data=[sec_to_hr_time(riot_nt_df[\"elapsed_time\"].mean()), sec_to_hr_time(igblast_nt_df[\"elapsed_time\"].mean())],\n", 91 | " columns=[\"elapsed_time (HH:MM:SS)\"],\n", 92 | " index=[\"riot\", \"igblast\"],\n", 93 | ").style.set_caption(f\"Human NGS - Nucleotides, sample size: {NT_SEQ_NUM}\")\n", 94 | "res" 95 | ] 96 | }, 97 | { 98 | "cell_type": "markdown", 99 | "metadata": {}, 100 | "source": [ 101 | "## Amino acids - RIOT vs ANARCI" 102 | ] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "execution_count": 34, 107 | "metadata": {}, 108 | "outputs": [ 109 | { 110 | "data": { 111 | "text/html": [ 112 | "\n", 114 | "\n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | "
Human NGS - Amino Acids, sample size: 366412
 elapsed_time (HH:MM:SS)
riot00:04:50
anarci00:29:59
\n" 133 | ], 134 | "text/plain": [ 135 | "" 136 | ] 137 | }, 138 | "execution_count": 34, 139 | "metadata": {}, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "res = pd.DataFrame(\n", 145 | " data=[sec_to_hr_time(riot_aa_df[\"elapsed_time\"].mean()), sec_to_hr_time(anarci_aa_df[\"elapsed_time\"].mean())],\n", 146 | " columns=[\"elapsed_time (HH:MM:SS)\"],\n", 147 | " index=[\"riot\", \"anarci\"],\n", 148 | ").style.set_caption(f\"Human NGS - Amino Acids, sample size: {AA_SEQ_NUM}\")\n", 149 | "res" 150 | ] 151 | } 152 | ], 153 | "metadata": { 154 | "kernelspec": { 155 | "display_name": "riot-ZhbTWDtr-py3.10", 156 | "language": "python", 157 | "name": "python3" 158 | }, 159 | "language_info": { 160 | "codemirror_mode": { 161 | "name": "ipython", 162 | "version": 3 163 | }, 164 | "file_extension": ".py", 165 | "mimetype": "text/x-python", 166 | "name": "python", 167 | "nbconvert_exporter": "python", 168 | "pygments_lexer": "ipython3", 169 | "version": "3.10.7" 170 | } 171 | }, 172 | "nbformat": 4, 173 | "nbformat_minor": 2 174 | } 175 | -------------------------------------------------------------------------------- /riot_na/api/api_mp.py: -------------------------------------------------------------------------------- 1 | import itertools 2 | import multiprocessing as mp 3 | from pathlib import Path 4 | from typing import Any, Optional 5 | 6 | from Bio import SeqIO 7 | from psutil import cpu_count 8 | from tqdm import tqdm 9 | 10 | from riot_na.api.riot_numbering import create_riot_aa, create_riot_nt 11 | from riot_na.common.io import count_fasta_records, write_airr_iter_to_csv 12 | from riot_na.config import GENE_DB_DIR 13 | from riot_na.data.model import ( 14 | AirrRearrangementEntryAA, 15 | AirrRearrangementEntryNT, 16 | InputType, 17 | Organism, 18 | Scheme, 19 | SegmentedAirrRearrangementEntryAA, 20 | SegmentedAirrRearrangementEntryNT, 21 | ) 22 | 23 | 24 | class _WorkerNT: 25 | def __init__( 26 | self, 27 | allowed_species: 
Optional[tuple[Organism, ...]] = None, 28 | scheme: Scheme = Scheme.IMGT, 29 | db_dir: Path = GENE_DB_DIR, 30 | return_all_domains: bool = False, 31 | ) -> None: 32 | self.numbering = create_riot_nt( 33 | allowed_species=allowed_species, db_dir=db_dir, return_all_domains=return_all_domains 34 | ) 35 | self.scheme = scheme 36 | 37 | def __call__(self, fasta_record: SeqIO.SeqRecord) -> AirrRearrangementEntryNT | list[AirrRearrangementEntryNT]: 38 | try: 39 | res = self.numbering.run_on_sequence( 40 | fasta_record.description, 41 | str(fasta_record.seq), 42 | self.scheme, 43 | ) 44 | except Exception as exc: 45 | print(fasta_record.description) 46 | print(self.scheme) 47 | print(f"seq: {fasta_record.seq}") 48 | raise exc 49 | 50 | return res 51 | 52 | 53 | class _WorkerAA: 54 | def __init__( 55 | self, 56 | allowed_species: Optional[tuple[Organism, ...]] = None, 57 | scheme: Scheme = Scheme.IMGT, 58 | db_dir: Path = GENE_DB_DIR, 59 | return_all_domains: bool = False, 60 | extend_alignment: bool = False, 61 | ) -> None: 62 | self.numbering = create_riot_aa( 63 | allowed_species=allowed_species, db_dir=db_dir, return_all_domains=return_all_domains 64 | ) 65 | self.scheme = scheme 66 | self.extend_alignment = extend_alignment 67 | 68 | def __call__(self, fasta_record: SeqIO.SeqRecord) -> AirrRearrangementEntryAA | list[AirrRearrangementEntryAA]: 69 | try: 70 | res = self.numbering.run_on_sequence( 71 | fasta_record.description, 72 | str(fasta_record.seq), 73 | self.scheme, 74 | extend_alignment=self.extend_alignment, 75 | ) 76 | except Exception as exc: 77 | print(fasta_record.description) 78 | print(self.scheme) 79 | print(f"seq: {fasta_record.seq}") 80 | raise exc 81 | 82 | return res 83 | 84 | 85 | def _worker_initializer( 86 | allowed_species: Optional[tuple[Organism, ...]] = None, 87 | scheme: Scheme = Scheme.IMGT, 88 | db_dir: Path = GENE_DB_DIR, 89 | input_type: InputType = InputType.NT, 90 | return_all_domains: bool = False, 91 | extend_alignment: bool = False, 
92 | ): 93 | global worker # pylint: disable=global-statement,global-variable-undefined 94 | if input_type == InputType.NT: 95 | worker = _WorkerNT(allowed_species, scheme, db_dir, return_all_domains) # type: ignore 96 | else: 97 | worker = _WorkerAA(allowed_species, scheme, db_dir, return_all_domains, extend_alignment) # type: ignore 98 | 99 | 100 | def _worker_call(*args, **kwds): 101 | return worker(*args, **kwds) # type: ignore 102 | 103 | 104 | def run_on_file_mp( # pylint: disable=too-many-arguments 105 | db_dir: Path, 106 | input_fasta_path: Path, 107 | result_path: Path, 108 | n_processes: int = cpu_count(logical=False), 109 | input_format: str = "fasta", 110 | scheme: Scheme = Scheme.IMGT, 111 | allowed_species: Optional[tuple[Organism, ...]] = None, 112 | input_type: InputType = InputType.NT, 113 | limit: Optional[int] = None, 114 | return_all_domains: bool = False, 115 | extend_alignment: bool = False, 116 | ): 117 | with mp.Pool( 118 | processes=n_processes, 119 | initializer=_worker_initializer, 120 | initargs=(allowed_species, scheme, db_dir, input_type, return_all_domains, extend_alignment), 121 | ) as pool: 122 | result_iter = tqdm( 123 | pool.imap(_worker_call, itertools.islice(SeqIO.parse(input_fasta_path, input_format), limit)), 124 | total=count_fasta_records(input_fasta_path, input_format=input_format), 125 | ) 126 | record_type: Any 127 | match input_type: 128 | case InputType.NT: 129 | record_type = AirrRearrangementEntryNT if not return_all_domains else SegmentedAirrRearrangementEntryNT 130 | case InputType.AA: 131 | record_type = AirrRearrangementEntryAA if not return_all_domains else SegmentedAirrRearrangementEntryAA 132 | 133 | write_airr_iter_to_csv(result_path, record_type, result_iter) 134 | 135 | 136 | if __name__ == "__main__": 137 | # Example usage 138 | 139 | # .write.text("/home/pawel.dudzic/workspace/analyzer/projects/automation/experiments/exploration/new_therapeutics/paper/new_mess/data/13_08_2025/molecules_segments.fasta", 
lineSep="\n") 140 | INPUT_PATH = "/home/pawel.dudzic/workspace/analyzer/projects/automation/experiments/exploration/new_therapeutics/paper/new_mess/data/13_08_2025/molecules_segments.fasta" 141 | OUTPUT_PATH = "/home/pawel.dudzic/workspace/analyzer/projects/automation/experiments/exploration/new_therapeutics/paper/new_mess/data/13_08_2025/molecules_segments_numbered.csv" 142 | run_on_file_mp( 143 | db_dir=GENE_DB_DIR, 144 | input_fasta_path=Path(INPUT_PATH), 145 | result_path=Path(OUTPUT_PATH), 146 | n_processes=8, 147 | input_format="fasta", 148 | scheme=Scheme.IMGT, 149 | allowed_species=(Organism.HOMO_SAPIENS,), 150 | input_type=InputType.AA, 151 | ) 152 | -------------------------------------------------------------------------------- /notebooks/final_benchmark/nt_gene_assignment_ngs.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gene | allele assignment precision - HUMAN NGS" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from pathlib import Path\n", 18 | "from utils import calculate_gene_allele_assignment_precision" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | "DATA_PATH = Path().absolute().parent.parent / \"data\"\n", 28 | "RESULTS_PATH = Path().absolute().parent.parent / \"results\"" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": {}, 35 | "outputs": [], 36 | "source": [ 37 | "ground_truth_df = pd.read_csv(DATA_PATH / \"ngs_stratified\" / \"ngs_sample_human_ground_truth.csv\", index_col=0)\n", 38 | "igblast_df = pd.read_csv(\"/home/bartosz/Documents/igblast/results/ngs_sample_clean_dj_penalty_-1_human.tsv\", index_col=0, sep=\"\\t\")\n", 39 | "riot_df = 
pd.read_csv(RESULTS_PATH / \"ngs_human_nt/ngs_sample_human_imgt_human.csv\", index_col=0)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "markdown", 44 | "metadata": {}, 45 | "source": [ 46 | "## V genes" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 4, 52 | "metadata": {}, 53 | "outputs": [ 54 | { 55 | "data": { 56 | "text/html": [ 57 | "\n", 59 | "\n", 60 | " \n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | "
Nucleotides V gene - Human NGS
 geneallele
riot93.13%93.11%
igblast93.15%93.01%
\n" 81 | ], 82 | "text/plain": [ 83 | "" 84 | ] 85 | }, 86 | "execution_count": 4, 87 | "metadata": {}, 88 | "output_type": "execute_result" 89 | } 90 | ], 91 | "source": [ 92 | "gene = \"v\"\n", 93 | "pd.DataFrame.from_records(\n", 94 | " [\n", 95 | " calculate_gene_allele_assignment_precision(ground_truth_df, riot_df, gene),\n", 96 | " calculate_gene_allele_assignment_precision(ground_truth_df, igblast_df, gene),\n", 97 | " ],\n", 98 | " index=[\"riot\", \"igblast\"],\n", 99 | ").style.format(\"{:.2f}%\").set_caption(f\"Nucleotides {gene.upper()} gene - Human NGS\")" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "## J genes" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "metadata": {}, 113 | "outputs": [ 114 | { 115 | "data": { 116 | "text/html": [ 117 | "\n", 119 | "\n", 120 | " \n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | "
Nucleotides J gene - Human NGS
 geneallele
riot92.11%92.11%
igblast91.12%91.09%
\n" 141 | ], 142 | "text/plain": [ 143 | "" 144 | ] 145 | }, 146 | "execution_count": 5, 147 | "metadata": {}, 148 | "output_type": "execute_result" 149 | } 150 | ], 151 | "source": [ 152 | "gene = \"j\"\n", 153 | "pd.DataFrame.from_records(\n", 154 | " [\n", 155 | " calculate_gene_allele_assignment_precision(ground_truth_df, riot_df, gene),\n", 156 | " calculate_gene_allele_assignment_precision(ground_truth_df, igblast_df, gene),\n", 157 | " ],\n", 158 | " index=[\"riot\", \"igblast\"],\n", 159 | ").style.format(\"{:.2f}%\").set_caption(f\"Nucleotides {gene.upper()} gene - Human NGS\")" 160 | ] 161 | } 162 | ], 163 | "metadata": { 164 | "kernelspec": { 165 | "display_name": "riot-ZhbTWDtr-py3.10", 166 | "language": "python", 167 | "name": "python3" 168 | }, 169 | "language_info": { 170 | "codemirror_mode": { 171 | "name": "ipython", 172 | "version": 3 173 | }, 174 | "file_extension": ".py", 175 | "mimetype": "text/x-python", 176 | "name": "python", 177 | "nbconvert_exporter": "python", 178 | "pygments_lexer": "ipython3", 179 | "version": "3.10.7" 180 | } 181 | }, 182 | "nbformat": 4, 183 | "nbformat_minor": 2 184 | } 185 | -------------------------------------------------------------------------------- /notebooks/final_benchmark/aa_gene_assignment_therapeutics.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Gene | allele assignment precision - HUMAN therapeutics " 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 13, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import pandas as pd\n", 17 | "from pathlib import Path\n", 18 | "from typing import Literal\n", 19 | "from utils import calculate_gene_allele_assignment_precision" 20 | ] 21 | }, 22 | { 23 | "cell_type": "code", 24 | "execution_count": 14, 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "DATA_PATH = 
Path().absolute().parent.parent / \"data\"\n", 29 | "RESULTS_PATH = Path().absolute().parent.parent / \"results\"" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 15, 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "ground_truth_df = pd.read_csv(DATA_PATH / \"therapeutics\" / \"therapeutics_ground_truth_human_new.csv\", index_col=0)\n", 39 | "anarci_df = pd.read_csv(\"/home/bartosz/Documents/ANARCI/results/therapeutics_deduplicated/results.csv\", index_col=0)\n", 40 | "riot_df = pd.read_csv(RESULTS_PATH / \"therapeutics_human_aa/therapeutics_human_imgt_human.csv\", index_col=0)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "markdown", 45 | "metadata": {}, 46 | "source": [ 47 | "## V genes" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 19, 53 | "metadata": {}, 54 | "outputs": [ 55 | { 56 | "data": { 57 | "text/html": [ 58 | "\n", 60 | "\n", 61 | " \n", 62 | " \n", 63 | " \n", 64 | " \n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | "
Nucleotides V gene - Human NGS
 geneallele
riot96.94%96.86%
anarci65.42%62.17%
\n" 82 | ], 83 | "text/plain": [ 84 | "" 85 | ] 86 | }, 87 | "execution_count": 19, 88 | "metadata": {}, 89 | "output_type": "execute_result" 90 | } 91 | ], 92 | "source": [ 93 | "gene = \"v\"\n", 94 | "pd.DataFrame.from_records(\n", 95 | " [\n", 96 | " calculate_gene_allele_assignment_precision(ground_truth_df, riot_df, gene),\n", 97 | " calculate_gene_allele_assignment_precision(ground_truth_df, anarci_df, gene),\n", 98 | " ],\n", 99 | " index=[\"riot\", \"anarci\"],\n", 100 | ").style.format(\"{:.2f}%\").set_caption(f\"Nucleotides {gene.upper()} gene - Human NGS\")" 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": {}, 106 | "source": [ 107 | "## J genes" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 23, 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "data": { 117 | "text/html": [ 118 | "\n", 120 | "\n", 121 | " \n", 122 | " \n", 123 | " \n", 124 | " \n", 125 | " \n", 126 | " \n", 127 | " \n", 128 | " \n", 129 | " \n", 130 | " \n", 131 | " \n", 132 | " \n", 133 | " \n", 134 | " \n", 135 | " \n", 136 | " \n", 137 | " \n", 138 | " \n", 139 | " \n", 140 | " \n", 141 | "
Nucleotides J gene - Human NGS
 geneallele
riot97.72%97.65%
anarci78.90%78.56%
\n" 142 | ], 143 | "text/plain": [ 144 | "" 145 | ] 146 | }, 147 | "execution_count": 23, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "gene = \"j\"\n", 154 | "pd.DataFrame.from_records(\n", 155 | " [\n", 156 | " calculate_gene_allele_assignment_precision(ground_truth_df, riot_df, gene),\n", 157 | " calculate_gene_allele_assignment_precision(ground_truth_df, anarci_df, gene),\n", 158 | " ],\n", 159 | " index=[\"riot\", \"anarci\"],\n", 160 | ").style.format(\"{:.2f}%\").set_caption(f\"Nucleotides {gene.upper()} gene - Human NGS\")" 161 | ] 162 | } 163 | ], 164 | "metadata": { 165 | "kernelspec": { 166 | "display_name": "riot-na-le-VoPNA-py3.10", 167 | "language": "python", 168 | "name": "python3" 169 | }, 170 | "language_info": { 171 | "codemirror_mode": { 172 | "name": "ipython", 173 | "version": 3 174 | }, 175 | "file_extension": ".py", 176 | "mimetype": "text/x-python", 177 | "name": "python", 178 | "nbconvert_exporter": "python", 179 | "pygments_lexer": "ipython3", 180 | "version": "3.10.12" 181 | } 182 | }, 183 | "nbformat": 4, 184 | "nbformat_minor": 2 185 | } 186 | -------------------------------------------------------------------------------- /riot_na/alignment/nt_gene_alignments.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from pathlib import Path 3 | from typing import Optional 4 | 5 | from riot_na.alignment.alignment_utils import offset_alignments 6 | from riot_na.alignment.gene_aligner import ( 7 | GeneAligner, 8 | create_aligner, 9 | create_v_gene_aligner, 10 | ) 11 | from riot_na.alignment.segment_gene_aligner import SegmentGeneAligner 12 | from riot_na.alignment.segment_gene_aligner import ( 13 | create_v_gene_aligner as create_segment_v_gene_aligner, 14 | ) 15 | from riot_na.config import GENE_DB_DIR 16 | from riot_na.data.model import ( 17 | AlignmentSegment, 18 | AlignmentsNT, 19 | GermlineGene, 20 | Locus, 21 | Organism, 22 | ) 
logger = logging.getLogger(__name__)


class VDJCAlignerNT:
    """Chain V, J, C and D nucleotide gene aligners over a single query.

    The V aligner is run on the whole query. For every V hit, the part of the
    query downstream of the V alignment is aligned against J genes, the part
    downstream of J against C genes and, when the V hit's locus is IGH, the
    part between V and J against D genes. J and C aligners are selected by the
    species and locus of the V hit; D aligners by species only.
    """

    def __init__(
        self,
        v_aligner: SegmentGeneAligner | GeneAligner,
        d_aligners: dict[Organism, GeneAligner],
        j_aligners: dict[Organism, dict[Locus, GeneAligner]],
        c_aligners: dict[Organism, dict[Locus, GeneAligner]],
    ) -> None:
        self.v_aligner = v_aligner
        self.d_aligners = d_aligners
        self.j_aligners = j_aligners
        self.c_aligners = c_aligners  # at the time of writing, there is only one c germline database

    def produce_nt_alignments(self, query: str) -> list[AlignmentsNT]:
        """Return one ``AlignmentsNT`` per V alignment segment found in ``query``.

        A segment whose downstream alignment raises ``ValueError`` is logged
        and left out of the result; the remaining segments are still returned.
        """
        results = []

        for v_alignment in self.v_aligner.align(query, both_strains=True):
            try:
                results.append(self._produce_vdjc_nt_alignment(v_alignment))
            except ValueError as e:
                logger.error(
                    "Error aligning %s: %s",
                    v_alignment.best_alignment.q_seq if v_alignment.best_alignment else "unknown sequence",
                    e,
                )
        return results

    def _produce_vdjc_nt_alignment(self, v_alignment_segment: AlignmentSegment) -> AlignmentsNT:
        """Extend a single V alignment segment with J, C and D alignments.

        Returns early with whatever has been filled in so far when there is no
        V alignment, nothing left of the query after V, or no J alignment.
        All stored alignments are offset back into the coordinates of the
        V alignment's query sequence.
        """
        alignments_nt = AlignmentsNT()
        alignments_nt.segment_start = v_alignment_segment.segment_start
        alignments_nt.segment_end = v_alignment_segment.segment_end

        v_alignment = v_alignment_segment.best_alignment
        if v_alignment is None:
            return alignments_nt

        query = v_alignment.q_seq
        alignments_nt.v = v_alignment

        species = v_alignment.species
        locus = v_alignment.locus

        # J genes are searched only downstream of the V alignment
        query_j = query[v_alignment.q_end :]
        if not query_j:
            # V reaches the end of the query: nothing to align, same as the C and D guards below
            return alignments_nt

        j_aligner = self.j_aligners[species][locus]
        j_alignment = j_aligner.align(query_j, both_strains=False)[0].best_alignment
        if j_alignment is None:
            return alignments_nt

        # translate J coordinates from the masked query back to the full query
        j_alignment = offset_alignments(v_alignment.q_end, j_alignment)
        assert j_alignment is not None
        alignments_nt.j = j_alignment

        # C genes are searched only downstream of the J alignment
        query_c = query[j_alignment.q_end :]
        if query_c:
            c_aligner = self.c_aligners[species][locus]
            c_alignment = c_aligner.align(query_c, both_strains=False)[0].best_alignment
            if c_alignment is not None:
                alignments_nt.c = offset_alignments(j_alignment.q_end, c_alignment)

        if locus == Locus.IGH:
            # D genes are searched only between the V and J alignments
            query_d = query[v_alignment.q_end : j_alignment.q_start]
            if query_d:
                d_aligner = self.d_aligners[species]
                d_alignment = d_aligner.align(query_d, both_strains=False)[0].best_alignment
                if d_alignment is not None:
                    alignments_nt.d = offset_alignments(v_alignment.q_end, d_alignment)

        return alignments_nt


def create_vdjc_aligner_nt(
    allowed_species: Optional[tuple[Organism, ...]] = None,
    db_dir: Path = GENE_DB_DIR,
    use_segment_aligner: bool = False,
) -> VDJCAlignerNT:
    """Build a ``VDJCAlignerNT`` with aligners for every allowed species.

    Args:
        allowed_species: Organisms to build aligners for. When empty or
            ``None``, human, mouse and alpaca are used.
        db_dir: Gene database directory passed to every aligner factory.
        use_segment_aligner: Build the V aligner with
            ``create_segment_v_gene_aligner`` instead of ``create_v_gene_aligner``.
    """
    if not allowed_species:
        allowed_species = (Organism.HOMO_SAPIENS, Organism.MUS_MUSCULUS, Organism.VICUGNA_PACOS)

    v_aligner: SegmentGeneAligner | GeneAligner
    if use_segment_aligner:
        v_aligner = create_segment_v_gene_aligner(allowed_species=allowed_species, db_dir=db_dir)
    else:
        v_aligner = create_v_gene_aligner(allowed_species=allowed_species, db_dir=db_dir)

    d_aligners: dict[Organism, GeneAligner] = {}
    for organism in allowed_species:
        d_aligners[organism] = create_aligner(
            organism=organism, germline_gene=GermlineGene.D, locus=Locus.IGH, db_dir=db_dir
        )

    j_aligners: dict[Organism, dict[Locus, GeneAligner]] = {}
    for organism in allowed_species:
        organism_aligners = j_aligners.setdefault(organism, {})
        for locus in Locus:
            # only IGH aligners are created for VICUGNA_PACOS
            if organism == Organism.VICUGNA_PACOS and locus != Locus.IGH:
                continue
            organism_aligners[locus] = create_aligner(
                organism=organism, germline_gene=GermlineGene.J, locus=locus, db_dir=db_dir
            )

    c_aligners: dict[Organism, dict[Locus, GeneAligner]] = {}
    for organism in allowed_species:
        organism_aligners = c_aligners.setdefault(organism, {})

        # every organism other than human, alpaca and custom is aligned against the human C genes
        if organism in (Organism.HOMO_SAPIENS, Organism.VICUGNA_PACOS, Organism.CUSTOM):
            c_organism = organism
        else:
            c_organism = Organism.HOMO_SAPIENS

        for locus in Locus:
            # only IGH aligners are created for VICUGNA_PACOS
            if organism == Organism.VICUGNA_PACOS and locus != Locus.IGH:
                continue
            organism_aligners[locus] = create_aligner(
                organism=c_organism, germline_gene=GermlineGene.C, locus=locus, db_dir=db_dir
            )

    return VDJCAlignerNT(v_aligner, d_aligners, j_aligners, c_aligners)
CHOTHIA_INDEL_POSITIONS_HEAVY: Final[list[int]] = [31, 52, 100] 20 | CHOTHIA_INDEL_POSITIONS_LIGHT: Final[list[int]] = [30, 52, 95] 21 | MARTIN_INDEL_POSITIONS_HEAVY: Final[list[int]] = [31, 52, 100] 22 | MARTIN_INDEL_POSITIONS_LIGHT: Final[list[int]] = [30, 52, 95] 23 | 24 | 25 | # Region position ranges 26 | class MinMaxRange(TypedDict): 27 | min: int 28 | max: int 29 | 30 | 31 | ChainRegions = dict[ShortRegion, MinMaxRange] 32 | Regions = dict[ChainType, ChainRegions] 33 | 34 | IMGT_REGIONS: Final[Regions] = { 35 | ChainType.HEAVY: { 36 | ShortRegion.FW1: {"min": 1, "max": 26}, 37 | ShortRegion.CDR1: {"min": 27, "max": 38}, 38 | ShortRegion.FW2: {"min": 39, "max": 55}, 39 | ShortRegion.CDR2: {"min": 56, "max": 65}, 40 | ShortRegion.FW3: {"min": 66, "max": 104}, 41 | ShortRegion.CDR3: {"min": 105, "max": 117}, 42 | ShortRegion.FW4: {"min": 118, "max": 128}, 43 | }, 44 | ChainType.LIGHT: { 45 | ShortRegion.FW1: {"min": 1, "max": 26}, 46 | ShortRegion.CDR1: {"min": 27, "max": 38}, 47 | ShortRegion.FW2: {"min": 39, "max": 55}, 48 | ShortRegion.CDR2: {"min": 56, "max": 65}, 49 | ShortRegion.FW3: {"min": 66, "max": 104}, 50 | ShortRegion.CDR3: {"min": 105, "max": 117}, 51 | ShortRegion.FW4: {"min": 118, "max": 128}, 52 | }, 53 | } 54 | 55 | KABAT_REGIONS: Final[Regions] = { 56 | ChainType.HEAVY: { 57 | ShortRegion.FW1: {"min": 1, "max": 30}, 58 | ShortRegion.CDR1: {"min": 31, "max": 35}, 59 | ShortRegion.FW2: {"min": 36, "max": 49}, 60 | ShortRegion.CDR2: {"min": 50, "max": 65}, 61 | ShortRegion.FW3: {"min": 66, "max": 94}, 62 | ShortRegion.CDR3: {"min": 95, "max": 102}, 63 | ShortRegion.FW4: {"min": 103, "max": 113}, 64 | }, 65 | ChainType.LIGHT: { 66 | ShortRegion.FW1: {"min": 1, "max": 23}, 67 | ShortRegion.CDR1: {"min": 24, "max": 34}, 68 | ShortRegion.FW2: {"min": 35, "max": 49}, 69 | ShortRegion.CDR2: {"min": 50, "max": 56}, 70 | ShortRegion.FW3: {"min": 57, "max": 88}, 71 | ShortRegion.CDR3: {"min": 89, "max": 97}, 72 | ShortRegion.FW4: {"min": 98, "max": 107}, 
73 | }, 74 | } 75 | 76 | 77 | CHOTHIA_REGIONS: Final[Regions] = { 78 | ChainType.HEAVY: { 79 | ShortRegion.FW1: {"min": 1, "max": 25}, 80 | ShortRegion.CDR1: {"min": 26, "max": 32}, 81 | ShortRegion.FW2: {"min": 33, "max": 51}, 82 | ShortRegion.CDR2: {"min": 52, "max": 56}, 83 | ShortRegion.FW3: {"min": 57, "max": 95}, 84 | ShortRegion.CDR3: {"min": 96, "max": 101}, 85 | ShortRegion.FW4: {"min": 102, "max": 113}, 86 | }, 87 | ChainType.LIGHT: { 88 | ShortRegion.FW1: {"min": 1, "max": 25}, 89 | ShortRegion.CDR1: {"min": 26, "max": 32}, 90 | ShortRegion.FW2: {"min": 33, "max": 49}, 91 | ShortRegion.CDR2: {"min": 50, "max": 52}, 92 | ShortRegion.FW3: {"min": 53, "max": 90}, 93 | ShortRegion.CDR3: {"min": 91, "max": 96}, 94 | ShortRegion.FW4: {"min": 97, "max": 107}, 95 | }, 96 | } 97 | 98 | MARTIN_REGIONS: Final[Regions] = { 99 | ChainType.HEAVY: { 100 | ShortRegion.FW1: {"min": 1, "max": 25}, 101 | ShortRegion.CDR1: {"min": 26, "max": 32}, 102 | ShortRegion.FW2: {"min": 33, "max": 51}, 103 | ShortRegion.CDR2: {"min": 52, "max": 56}, 104 | ShortRegion.FW3: {"min": 57, "max": 95}, 105 | ShortRegion.CDR3: {"min": 96, "max": 101}, 106 | ShortRegion.FW4: {"min": 102, "max": 113}, 107 | }, 108 | ChainType.LIGHT: { 109 | ShortRegion.FW1: {"min": 1, "max": 25}, 110 | ShortRegion.CDR1: {"min": 26, "max": 32}, 111 | ShortRegion.FW2: {"min": 33, "max": 49}, 112 | ShortRegion.CDR2: {"min": 50, "max": 52}, 113 | ShortRegion.FW3: {"min": 53, "max": 90}, 114 | ShortRegion.CDR3: {"min": 91, "max": 96}, 115 | ShortRegion.FW4: {"min": 97, "max": 107}, 116 | }, 117 | } 118 | 119 | 120 | @cache 121 | def get_legal_positions(chain_type: ChainType, scheme: Scheme) -> int: 122 | match scheme, chain_type: 123 | case Scheme.IMGT, ChainType.HEAVY: 124 | return IMGT_POSITIONS 125 | case Scheme.IMGT, ChainType.LIGHT: 126 | return IMGT_POSITIONS 127 | case Scheme.KABAT, ChainType.HEAVY: 128 | return KABAT_POSITIONS_HEAVY 129 | case Scheme.KABAT, ChainType.LIGHT: 130 | return 
KABAT_POSITIONS_LIGHT 131 | case Scheme.CHOTHIA, ChainType.HEAVY: 132 | return CHOTHIA_POSITIONS_HEAVY 133 | case Scheme.CHOTHIA, ChainType.LIGHT: 134 | return CHOTHIA_POSITIONS_LIGHT 135 | case Scheme.MARTIN, ChainType.HEAVY: 136 | return MARTIN_POSITIONS_HEAVY 137 | case Scheme.MARTIN, ChainType.LIGHT: 138 | return MARTIN_POSITIONS_LIGHT 139 | case _: 140 | raise ValueError(f"Unknown scheme {scheme} or chain type {chain_type}") 141 | 142 | 143 | @cache 144 | def get_indels_positions(scheme: Scheme, chain_type: ChainType) -> list[int]: 145 | # this is scheme specific 146 | match (scheme, chain_type): 147 | case Scheme.IMGT, _: 148 | return IMGT_INDEL_POSITIONS 149 | case Scheme.KABAT, ChainType.HEAVY: 150 | return KABAT_INDEL_POSITIONS_HEAVY 151 | case Scheme.KABAT, ChainType.LIGHT: 152 | return KABAT_INDEL_POSITIONS_LIGHT 153 | case Scheme.CHOTHIA, ChainType.HEAVY: 154 | return CHOTHIA_INDEL_POSITIONS_HEAVY 155 | case Scheme.CHOTHIA, ChainType.LIGHT: 156 | return CHOTHIA_INDEL_POSITIONS_LIGHT 157 | case Scheme.MARTIN, ChainType.HEAVY: 158 | return MARTIN_INDEL_POSITIONS_HEAVY 159 | case Scheme.MARTIN, ChainType.LIGHT: 160 | return MARTIN_INDEL_POSITIONS_LIGHT 161 | 162 | raise ValueError(f"Unknown scheme {scheme}") 163 | -------------------------------------------------------------------------------- /riot_na/schemes/smooth_alignment.py: -------------------------------------------------------------------------------- 1 | from functools import partial 2 | from math import ceil, floor 3 | from typing import Callable, Final, Literal, cast 4 | 5 | from riot_na.data.model import AlignmentString, ChainType, Scheme 6 | from riot_na.data.scheme_definitions import get_indels_positions 7 | from riot_na.data.scheme_regions import get_cdr_ranges 8 | from riot_na.schemes.collapse_alignment import collapse_ins_del 9 | 10 | Op = Literal["M", "D", "N", "I", "S"] 11 | M: Final[Op] = "M" 12 | D: Final[Op] = "D" 13 | N: Final[Op] = "N" 14 | I: Final[Op] = "I" 15 | S: Final[Op] = 
"S" 16 | 17 | 18 | def _reorder_cdr_deletions(relative_indel_position: int, matches: int, remaining_deletions: int) -> str: 19 | # deletions should be put to the left of specified position 20 | # indel_position = matches_before + deletions => matches_before = indel_position - deletions 21 | matches_before = relative_indel_position - remaining_deletions 22 | if matches_before >= 0: 23 | assert matches_before <= matches 24 | return f"{matches_before * M}{remaining_deletions * D}{(matches - matches_before) * M}" 25 | 26 | # if there are too many deletions, just put deletions first, then matches 27 | return f"{remaining_deletions * D}{matches * M}" 28 | 29 | 30 | def _reorder_cdr_deletions_imgt(relative_indel_position: int, matches: int, remaining_deletions: int) -> str: 31 | # deletions should be put in the middle of cdr, starting from indel_position + 1 32 | # for cdr1 and cdr2, and indel_position for cdr3. We can detect cdr number from 33 | # alignment string length, cdr1 and cdr2 have even number of positions, while cdr3 odd. 
def _reorder_cdr_deletions_imgt(relative_indel_position: int, matches: int, remaining_deletions: int) -> str:
    """Lay out a CDR with its deletions centred around the IMGT indel position.

    Returns ``matches`` matches and ``remaining_deletions`` deletions as a
    single string with the deletions in one contiguous run.
    """
    # deletions should be put in the middle of cdr, starting from indel_position + 1
    # for cdr1 and cdr2, and indel_position for cdr3. We can detect cdr number from
    # alignment string length, cdr1 and cdr2 have even number of positions, while cdr3 odd.
    legal_positions = matches + remaining_deletions
    round_fn = ceil if legal_positions % 2 else floor
    matches_before = relative_indel_position - round_fn(remaining_deletions / 2)
    # never start the run before the beginning of the CDR
    matches_before = max(matches_before, 0)
    assert matches >= matches_before
    return f"{matches_before * M}{remaining_deletions * D}{(matches - matches_before) * M}"


def _reorder_cdr(
    alignment_str: AlignmentString,
    relative_indel_position: int,
    reorder_buffer_deletions: Callable[[int, int, int], str],
) -> AlignmentString:
    """Rewrite one CDR alignment so its indels sit at the scheme's indel position.

    Insertions and deletions cancel each other pairwise into matches. Surplus
    insertions are placed right after ``relative_indel_position`` matches;
    surplus deletions are laid out by ``reorder_buffer_deletions``, which is
    called with (relative_indel_position, matches, remaining_deletions).

    ``alignment_str`` must consist of M, D and I operations only.
    """
    matches = alignment_str.count(M)
    dels = alignment_str.count(D)
    ins = alignment_str.count(I)

    assert matches + dels + ins == len(alignment_str)

    # every insertion / deletion pair collapses into a match
    matches = matches + min(ins, dels)

    if ins > dels:
        remaining_insertions = ins - dels
        # insertions should be put after specified position or in the middle
        assert relative_indel_position <= matches
        result = f"{relative_indel_position * M}{remaining_insertions * I}{(matches - relative_indel_position) * M}"
    elif dels > ins:
        remaining_deletions = dels - ins
        result = reorder_buffer_deletions(relative_indel_position, matches, remaining_deletions)
    else:
        result = matches * M

    return AlignmentString(result)


reorder_buffer_other_schemes = cast(
    Callable[[AlignmentString, int], AlignmentString],
    partial(_reorder_cdr, reorder_buffer_deletions=_reorder_cdr_deletions),
)
reorder_buffer_imgt = cast(
    Callable[[AlignmentString, int], AlignmentString],
    partial(_reorder_cdr, reorder_buffer_deletions=_reorder_cdr_deletions_imgt),
)


def smooth_cdr_junctions(
    alignment_str: AlignmentString, chain_type: ChainType, scheme: Scheme = Scheme.IMGT
) -> AlignmentString:
    """
    Reorder CDR insertions and deletions to match the numbering scheme standard.

    Each numbering scheme (IMGT, KABAT, CHOTHIA, MARTIN) specifies legal positions
    in CDR regions, where insertions or deletions should be put. This function takes
    each CDR and reorders alignment string to match those specifications.

    Operations outside the CDRs are copied unchanged. When the alignment ends
    inside a CDR, the CDR is padded with deletions up to its full length before
    it is reordered.

    IMGT standard: https://www.imgt.org/IMGTScientificChart/Numbering/IMGTIGVLsuperfamily.html
    KABAT, CHOTHIA, MARTIN: http://www.bioinf.org.uk/abs/info.html
    """

    reorder_buffer_fn: Callable[[AlignmentString, int], AlignmentString] = (
        reorder_buffer_imgt if scheme == Scheme.IMGT else reorder_buffer_other_schemes
    )
    scheme_position = 0
    # pieces are collected in lists and joined once instead of growing strings per operation
    smoothed: list[str] = []
    buffer: list[str] = []
    regions_it = iter(get_cdr_ranges(scheme, chain_type))
    indel_positions_it = iter(get_indels_positions(scheme, chain_type))

    region_start, region_end = next(regions_it, (None, None))
    indel_position = next(indel_positions_it, None)

    for op in alignment_str:
        # only M, D and N consume a scheme position
        if op in (M, D, N):
            scheme_position = scheme_position + 1

        # Append the rest of fwr4
        if not (region_start and region_end and indel_position):
            smoothed.append(op)
            continue

        # Take only cdr + insertions right before cdr start
        if (region_start - 1 == scheme_position and op == I) or region_start <= scheme_position <= region_end:
            buffer.append(op)
            continue

        if buffer:
            cdr = "".join(buffer)
            # For cases when alignment starts on framework, do nothing.
            if set(cdr) == {N}:
                smoothed.append(cdr)
            else:
                # +1 because of 1-based indexing
                relative_indel_position = indel_position - region_start + 1
                smoothed.append(
                    reorder_buffer_fn(
                        AlignmentString(cdr.replace(N, D)),
                        relative_indel_position,
                    )
                )

            # this CDR is done, move on to the next one
            buffer.clear()
            region_start, region_end = next(regions_it, (None, None))
            indel_position = next(indel_positions_it, None)

        smoothed.append(op)

    # For cases when alignment ends on cdr
    if buffer:
        cdr = "".join(buffer)
        assert N not in cdr, "No framework aligned"
        assert region_start and region_end and indel_position

        region_length = region_end - region_start + 1
        collapsed_buffer = collapse_ins_del(cdr)
        # pad the truncated CDR with deletions so it spans the whole region
        missing_deletions_count = region_length - (collapsed_buffer.count(M) + collapsed_buffer.count(D))
        filled_buffer = AlignmentString(f"{collapsed_buffer}{missing_deletions_count * D}")
        relative_indel_position = indel_position - region_start + 1
        smoothed.append(reorder_buffer_fn(filled_buffer, relative_indel_position))

    return AlignmentString("".join(smoothed))
AGCTCGTCTGCCCCGACACTCTTCCCCCTCGCCTCCTGTGAGAGCCCCGTGTCCGACGAGAGCCCAGTGGCCTTGGGCTGCCTAGCCCGGGACTTCCTGCCTGGCTCCATCACCTTCTCCTGGAGCTACCCGAACGGCATCGCGGTCAGTAGCCAGAGCATCAAGACCTTCCCGTCCGTCCTGCGGGAGGGCAAGTATGTGGCCACCTCCCAGGTGCTCCTGCCCTCCCAGAGCGTCCTCCAGGGGTCAGAGCTGATTTGCAAAGTCCAGCACTCCAAGGGGAACTCGGACATGGTTGTGCCCCTCCCAATTTTAGATCTGCCCCCCAGCGTGACACTCTTCATGCCCCCCCGAGATGGCTTCTCTGGCACTTCCAAACGCACGTCCAAGCTCATCTGTCAGGCCACAGACTTCAGCCCCAGGGAGATCTCCGTGTCCTGGTTTCGTGAGGGCAAGCGGCTGGTGTCTGGCTTCATTACGGAAGATGTGGAAGCCTCAAAGTCCAATCCAGGGACCTTCAGTGTCATCAGCATGCTGACCATCACCGACGGCGACTGGTTCAGCCAGGCTGTGTACACCTGCCAGGTGGAGCACAGAGGGATGGTCATCGAGAAGAACGTGTCTTCCCAGTGCAACCCCCCTTCCCCCGGCATCGAGGTCTTCGCCATTCCCCCCTCCTTCTCCGACATCTTCCTCAACAAGTCAGCCAAGCTCACCTGCCTGGTCACAGGCCTGGTCACCTACGACAGCCTGAGAATTTCCTGGACCCGCCAGGGTGAAAAGGCTGTGGATTCCCAGATCATTGACTCCACGATCCTCCCCAACGGCACCTTCAGCGCCACGTGTGTGGCGTCAGTCTGCGTGGAGGACTGGGAGTCAGGAGACAGGTTCACGTGCACGGTGACCCACCTGGATCTGCCCTCACCCCTGAAGCGGAGCATCTTCAAGCCCGAAGTGCACAAGCACATGCCTTCCGTCTACGTGCTGCCGCCGGCCCGGGAGCAGCTGAGCCTGCGGGAGTCAGCCTCCATCACCTGCCTGGTGAAGGGCTTCTCCCCTCCGGACGTGTTTGTGCAGTGGCTGAAGAAGGGGGAGCAGGAGCCCCTGTCCCCTGACAACTACGTGACCAGTGCCCCAGTGCCCGAGCCCAACAGCCCGGGCTACTACTTTGTCCACAGCGTCCTGACGGTGAGCGAGAAGGACTGGAGTGCCGGGGCGACCTACACCTGCGTCGTGGGCCATGAGGCCCTGCCCCACTTGGTGACCGAGAGGACCGTGGACAAGTCCACCGGTAAACCCACCCTGTACAACGTGTCCCTGGTCATGTCCGACACGGCCAGCACCTGCTAC 3 | >ighgama2b IGH VICUGNA_PACOS 4 | 
GAACCCAAGACACCAAAACCACAACCACAACCACAACCACAACCCCAACCCAATCCTACAACAGAATCCAAGTGTCCCAAATGTCCAGCCCCTGAGCTCCTGGGAGGGCCCTCAGTCTTCATCTTCCCCCCGAAACCCAAGGACGTCCTCTCCATTTCTGGGAGGCCCGAGGTCACGTGCGTTGTGGTAGACGTGGGCCAGGAAGACCCCGAGGTCAGTTTCAACTGGTACATTGATGGCGCTGAGGTGCGAACGGCCAACACGAGGCCAAAAGAGGAACAGTTCAACAGCACGTACCGCGTGGTCAGCGTCCTGCCCATCCAGCACCAGGACTGGCTGACGGGGAAGGAATTCAAGTGCAAGGTCAACAACAAAGCTCTCCCGGCCCCCATCGAGAAGACCATCTCCAAGGCCAAAGGGCAGACCCGGGAGCCGCAGGTGTACACCCTGGCCCCACACCGGGAAGAGCTGGCCAAGGACACCGTGAGCGTAACATGCCTGGTCAAAGGCTTCTACCCACCTGATATCAACGTTGAGTGGCAGAGGAACCGGCAGCCGGAGCCAGAGGGCACCTACGCCACCACGCCACCCCAGCTGGACAACGACGGGACCTACTTCCTATACAGCAAGCTCTCGGTGGGAAAGAACACGTGGCAGCGGGGAGAAACCTTCACCTGTGTGGTGATGCACGAGGCCCTGCACAACCACTACACCCAGAAATCCATCACCCAGTCTTCGGGTAAA 5 | >ighgama1a IGH VICUGNA_PACOS 6 | TCCACCAAGGCCCCATCGGTCTATCCTCTGACTGCTAGATGCGGGGACACGCCTGGCTCCACAGTGGCCTTCGGCTGCCTAGTCTGGGGCTATATCCCTGAGCCGGTGACGGTGACGTGGAACTCGGGCGCCCTGTCCAGCGGCGTCCACACCTTCCCATCAGTCTTCATGTCCTCGGGGCTCTACACCCTCAGCAGCTTGGTGACCATGCCCGCCAGCAGCTCGACCGGCAAGACCTTCATCTGCAACGTAGCCCACCCGGCCAGCAGCACCAAGGTGGACAAGCGTGTGGAACTCAAGACACCCCAACCTCAATCCCAACCAGAATGCCGGTGTCCCAAATGTCCAGCCCCTGAGCTCCTGGGAGGGCCCTCAGTCTTCATCTTCCCCCCGAAACCCAAGGACGTCCTCTCCATTTCTGGGAGGCCCGAGGTCACGTGCGTTGTGGTAGACGTGGGCCAGGAAGACCCCGAGGTTAGTTTCAACTGGTACATTGATGGCGCTGAGGTGCGAACGGCCAACACGAAGCCAAAAGAGGAACAGTTCAACAGCACGTACCGCGTGGTCAGCGTCCTGCCCATCCGGCACCAGGACTGGCTGACGGGGAAGGAATTCAAGTGCAAGGTCAACAACAAAGCTCTCCCAGCCCCCATCGAGAGGACCATCTCCAAGGCCAAAGGGCAGACCCGGGAGCCGCAGGTGTACGCCCTGGCCCCACACCGGGAAGAGCTGGCCAAGGACACCGTGAGCGTAACATGCCTGGTAAAAGACTTCTACCCAGTTGACATCAACATTGAGTGGCAGAGGAACGGGCAGCCAGAGTCAGAGGGCACCTACGCCACCACGCCGCCACAGCTGGACAACGACGGGACCTACTTCCTCTACAGCAAGCTCTCGGTGGGAAAGAACACGTGGCAGCGGGGAGAAACCTTCACCTGTGTGGTGATGCACGAGGCCCTGCCCAACCACTACACCCAGAAATCTATCACCCAGTCTTCGGGTAAA 7 | >ighgama1b IGH VICUGNA_PACOS 8 | 
TCCACCAAGGCCCCATCGGTCTATCCTCTGACTGCTAGATGCGGGGACACGCCTGGCTCCACAGTGGCCTTCGGCTGCCTGGTCTGGGGCTACATCCCTGAGCCGGTGACGGTGACTTGGAACTCAGGCGCCCTGTCCAGCGGCGTCCACACCTTCCCATCAGTCTTCATGTCCTCGGGGCTCTACTCCCTCAGCAGCTTGGTGACACTGCCCACAAGCAGCTCGACCGGCAAGACCTTCATCTGCAACGTAGCCCACCCGGCCAGCAGCACCAAGGTGGACAAGCGTGTGGAACCACATGGAGGATGCACGTGTCCCCAATGTCCAGCCCCTGAGCTCCCAGGAGGGCCCTCTGTCTTTGTCTTCCCCCCGAAACCCAAGGACGTCCTCTCCATTTCTGGGAGGCCCGAGGTCACGTGCGTTGTAGTGGACGTCGGAAAGGAAGACCCCGAGGTCAATTTCAACTGGTATATTGATGGCGTTGAGGTGCGAACGGCCAATACGAAGCCAAAAGAGGAACAGTTCAACAGCACGTACCGCGTGGTCAGCGTCCTGCCCATCCAGCACCAGGACTGGCTGACGGGGAAGGAATTCAAGTGCAAGGTCAACAACAAAGCTCTCCCAGCCCCTATCGAGAGGACCATCTCCAAGGCCAAAGGGCAGACCCGGGAGCCGCAGGTGTACACCCTGGCCCCACACCGGGAAGAGCTGGCCAAGGACACCGTGAGCGTAACATGCCTGGTCAAAGGCTTCTACCCAGCTGACATCAACGTTGAGTGGCAGAGGAACGGTCAGCCGGAGTCAGAGGGCACCTACGCCAACACGCCGCCACAGCTGGACAACGACGGGACCTACTTCCTCTACAGCAAGCTCTCGGTGGGAAAGAACACGTGGCAGCGGGGAGAAACCTTAACCTGTGTGGTGATGCATGAGGCCCTGCACAACCACTACACCCAGAAATCCATCTCCCAGTCTCCGGGTAAA 9 | >ighgama2c IGH VICUGNA_PACOS 10 | GCGCACCACAGCGAAGACCCCAGCTCCAAGTGTCCCAAATGCCCAGGCCCTGAGCTCCTTGGAGGGCCCACGGTCTTCATCTTCCCTCCGAAACCCAAGGACGTCCTCTCCATCACCCGAAAACCTGAGGTCACGTGCGTTGTGGTGGACGTGGGTAAGGAAGACCCTGAGATCGAGTTCAGCTGGTCCGTGGATGACACAGAGGTACACACGGCTGAGACAAAGCCAAAGGAGGAACAGTTCAACAGCACGTACCGCGTGGTCAGCGTCCTGCCCATCCAGCACCAGGACTGGCTGACGGGGAAGGAATTCAAGTGCAAGGTCAACAACAAAGCTCTCCCAGCCCCCATCGAGAGGACCATCTCCAAGGCCAAAGGGCAGACCCGGGAGCCGCAGGTGTACACCCTGGCCCCACACCGGGAAGAGCTGGCCAAGGACACCGTGAGCGTAACCTGCCTGGTCAAAGGCTTCTTCCCAGCTGACATCAACGTTGAGTGGCAGAGGAATGGGCAGCCAGAGTCAGAGGGCACCTACGCCACCACGCCGCCACAGCTGGACAACGACGGGACCTACTTCCTCTACAGCAAACTCTCCGTGGGAAAGAACACGTGGCAGCAGGGAGAAGTCTTCACCTGTGTGGTGATGCACGAGGCTCTACACAATCACTCCACCCAGAAATCCATCTCCCAGTCTCCGGGTAAA 11 | >ighepsilon IGH VICUGNA_PACOS 12 | 
GCCTCCACCCAGAAACCAACCGTCTTCCCCTTGACCTGTTGCAAAAACACCACCGATGTCACCGCTGTGGCGCTAGGCTGCCTGGTCACCGGCTATTTCCCGGAGCCAGTGACCGTGACCTGGGACACAGGGTCCCTGAACAGCAGCACCAGGACCTTCCCCGCCATCCAAAACCTGGAATCCAGCCTCTACACCACCAGCAGCCAGGTGACCATCTTGGGCAAGTGGTCCAAGCAGAAATTCACCTGCAGTGTGGCTCACCCTGACTCCAACATCACCATCACCAAGGTCGTCCCTGGGTGCTTCAAGGACTTCCCTGAGCCCTCCGTGAAGCTCTTCCACTCCTCCTGCAACCCCGACGGCGACACCCACACCACCATCCAGCTTCTGTGCCTCATCTCTGGCTACACGCCAGGCAGAATCCAGGTCGCCTGGCTGGAGGACGGGCAGGCGGTCACAGACAGGTTCCCACAGACCGCCAATGACAGACCAGAAGGCAAGCTGGCCTCCACCCACAGCCAGCTCAACATCACGCAGGAGGAGTGGCTGTCCCAGAAAACCTACACCTGCCAGGTCACCTATAACGGCTTCACCTATGAAGACCATGCCCGCAAGTGCACAGAGTCCGACCCCCGCGGTGTGAGCGCCTACCTGATCCCGCCTACTCCCCTCGACCTGTACGTCCACAAGTCGCCCAAGATCACCTGCCTGGTGGTGGACCTGGCCAGAAAGGAAGGCATGAATCTGACCTGGTTTAGGGAGAACAGGGGTCCTGCACAGCCAGACTCACTCGTCATCAAGACCCAGTTCAACAAGACAGTCACTGCCACATCCACCCTGTTGGTGGACGTCCAGGACTGGATTGAGGGCGAGACCTACTACTGCAAAGTGACCCACCCAGACCTGCCCAGGTCAATCCTGCGCTCCATCTCCAAGGCCCCCGGCAAGCGTTTAGCCCCCGAGGTCTACGTGCTCTCGCCACGCAAGGAAGAGCGGGCAGCCAAGGACAAGCTCACCCTCACCTGCCTGGCCCAGAACTTCTTCCCTGAGGACATCTCCGTGCAGTGGCTGAGGAACAATGCCCTGATTCAGACGGACCAGCACAGCACCACGAAGCCCCACAAGGCCAACGGCCCTTCCCCCGCCTTCTTCGTCTACAGCCGCCTGGTGGTCAGCCGGGCCGACTGGGAGCAGAAGAACAAGTTCACCTGCCGAGTGGTCCACGAGGCACTGCCCGGCTCCAGGACCCTCGAGAAATCCGTGTCCAGTGACCTCGGTAAA 13 | >ighalpha IGH VICUGNA_PACOS 14 | 
TCAGAGCCTGCAACCAGCCCCAGCGTCTTCCCGCTGGGCCCCAGCTATGACAAGGCATCCAGGCAGGTGGTCCTTGCCTGCCTGGTCCACGGCTTCTTCCCACCGGCGCCCCTGAAGGTGACATGGGGCCTCAGTGGCCAGAACGTGTCCGTCATGGACTTCCCCACCGTGCAGCCTGCCTCCGGGGTCCTGTACACCATGAGCAGCCAGCTGACCACGCCAGTTGAGCAGTGCCCAGACAGCGAGATCGTGACATGCCAAGTGCAGCACCTCTCCAGCTCCAGCCAGACAGTGAACGTGCCCTGCAGAGCTCCCACACCCCAGCCCCTGTGCTGCAAGCCCAGCCTGGCCCTGCACCCGCCGGCTCTGGAGGACCTGCTCCTGGGCTCCAACGCCAGCCTCACGTGCACACTGAGTGGCCTCAGAAACCCCGAGGGCGCCCAATTCACCTGGACTCCCTCAGGTGGAAAGGTCGCTGTCCAGCAGTCACCCAAGCATGACCCTTGTGGCTGCTTCAGCGTGTCCAGTGTCCTGCCGGGCTGTGCAGAGCAGTGGAACAGCAAAACGACCTTCTCCTGCAGCGCCACCCACCCAGAATCCGAGAACACGCTAACTGCCACCATCACCAAATCCTTAGAGGACCCCATCCGGCCCCAGGTCCACCTGCTACCGCCGCCGTCGGAGGAGCTGGCCCTCAATGAGATGGTGACGCTGACGTGCGTTGTGCGCGGCTTCAGCCCCAAGGACGTGCTGGTTCGATGGCTGCATGGGAACCAGGAGCTGCCTCGCGAGAAGTACCTGACCTGGAGGCCCCTGCCTGAGCCCGAACAGAGCATCACCACCTACGCCGTGACCAGCCTGCTGCGCGTGGAGGCCGAGGCCTGGAAGCAGGGAGACAACTACTCCTGCATGGTGGGCCACGAGGCCCTGCCCCTGGCCTTCACCCAGAAGACCATCGACCGCCTGTCGGGTAAACCCACCCACGTCAACGTGTCTGTTGTCATGGCGGAGGCAGAAGGCGTCTGCTAC 15 | -------------------------------------------------------------------------------- /riot_na/databases/scheme_mappings/alpaca/kabat/scheme_mapping.csv: -------------------------------------------------------------------------------- 1 | gene_id,scheme_cigar 2 | VH3-S36,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 3 | VH3-S46,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 4 | VH1-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 5 | VHH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 6 | VH2-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 7 | VH3-S33,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 8 | 
VH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 9 | VH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 10 | VH3-S40,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 11 | VH2-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 12 | VHH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 13 | VHH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 14 | VHH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 15 | VH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 16 | VH3-S37,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 17 | VH3-S42,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 18 | VH3-S18,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 19 | VH2-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 20 | VHH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 21 | VH3-S29,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 22 | VH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 23 | VH1-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 24 | VHH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 25 | VHH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 26 | 
VH3-S17,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 27 | VHH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 28 | VHH3-S15,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 29 | VH3-S38,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 30 | VH3-S26,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 31 | VH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 32 | VH3-S25,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 33 | VH3-S45,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 34 | VH3-S41,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 35 | VHH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 36 | VHH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 37 | VH2-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 38 | VH3-S34,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 39 | VHH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 40 | VHH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 41 | VH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 42 | VH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 43 | VH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 44 | 
VH3-S32,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 45 | VH3-S30,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 46 | vh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 47 | vh2-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 48 | vhh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 49 | VH3-S28,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 50 | VH3-S43,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 51 | VHH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 52 | VH2-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 53 | VH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 54 | VH1-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 55 | VH2-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 56 | VH1-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 57 | VHH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 58 | VH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 59 | VH3-S39,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 60 | VH3-S27,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 61 | VH3-S20,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 62 | 
VH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 63 | VH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 64 | VH2-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 65 | VH3-S35,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 66 | VH3-S31,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 67 | VH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 68 | VH2-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 69 | VH3-S44,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 70 | vhh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 71 | vh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 72 | vh3-2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMM 73 | vh1-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 74 | VH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 75 | ighJ-1,IIMMMMMMMMMMMMM 76 | ighJ-2,IIIIMMMMMMMMMMMMM 77 | ighJ-3,IIIMMMMMMMMMMMMM 78 | ighJ-4,IIMMMMMMMMMMMMM 79 | ighJ-5,IIIMMMMMMMMMMMM 80 | ighJ-6,IIMMMMMMMMMMMMM 81 | ighJ-7,IIIIMMMMMMMMMMMMM 82 | -------------------------------------------------------------------------------- /riot_na/databases/scheme_mappings/alpaca/martin/scheme_mapping.csv: -------------------------------------------------------------------------------- 1 | gene_id,scheme_cigar 2 | VH3-S36,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 3 | 
VH3-S46,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 4 | VH1-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 5 | VHH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 6 | VH2-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 7 | VH3-S33,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 8 | VH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 9 | VH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 10 | VH3-S40,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 11 | VH2-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 12 | VHH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 13 | VHH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 14 | VHH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 15 | VH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 16 | VH3-S37,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 17 | VH3-S42,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 18 | VH3-S18,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 19 | VH2-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 20 | VHH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 21 | 
VH3-S29,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 22 | VH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 23 | VH1-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 24 | VHH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 25 | VHH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 26 | VH3-S17,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 27 | VHH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 28 | VHH3-S15,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 29 | VH3-S38,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 30 | VH3-S26,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 31 | VH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 32 | VH3-S25,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 33 | VH3-S45,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 34 | VH3-S41,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 35 | VHH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 36 | VHH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 37 | VH2-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 38 | VH3-S34,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 39 | 
VHH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 40 | VHH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 41 | VH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 42 | VH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 43 | VH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 44 | VH3-S32,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 45 | VH3-S30,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 46 | vh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 47 | vh2-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 48 | vhh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 49 | VH3-S28,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 50 | VH3-S43,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 51 | VHH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 52 | VH2-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 53 | VH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 54 | VH1-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 55 | VH2-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 56 | VH1-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 57 | 
VHH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 58 | VH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 59 | VH3-S39,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 60 | VH3-S27,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 61 | VH3-S20,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 62 | VH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 63 | VH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 64 | VH2-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 65 | VH3-S35,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 66 | VH3-S31,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 67 | VH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 68 | VH2-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 69 | VH3-S44,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 70 | vhh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 71 | vh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 72 | vh3-2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMM 73 | vh1-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMMMM 74 | VH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMMMMMMMMMM 75 | 
ighJ-1,IIMMMMMMMMMMMMM 76 | ighJ-2,IIIIMMMMMMMMMMMMM 77 | ighJ-3,IIIMMMMMMMMMMMMM 78 | ighJ-4,IIMMMMMMMMMMMMM 79 | ighJ-5,IIIMMMMMMMMMMMM 80 | ighJ-6,IIMMMMMMMMMMMMM 81 | ighJ-7,IIIIMMMMMMMMMMMMM 82 | -------------------------------------------------------------------------------- /riot_na/databases/scheme_mappings/alpaca/chothia/scheme_mapping.csv: -------------------------------------------------------------------------------- 1 | gene_id,scheme_cigar 2 | VH3-S36,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 3 | VH3-S46,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 4 | VH1-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 5 | VHH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 6 | VH2-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 7 | VH3-S33,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 8 | VH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 9 | VH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 10 | VH3-S40,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 11 | VH2-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 12 | VHH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 13 | VHH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 14 | VHH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 15 | VH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 16 | 
VH3-S37,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 17 | VH3-S42,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 18 | VH3-S18,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 19 | VH2-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 20 | VHH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 21 | VH3-S29,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 22 | VH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 23 | VH1-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 24 | VHH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 25 | VHH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 26 | VH3-S17,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 27 | VHH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 28 | VHH3-S15,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 29 | VH3-S38,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 30 | VH3-S26,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 31 | VH3-S7,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 32 | VH3-S25,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 33 | VH3-S45,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 34 | 
VH3-S41,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 35 | VHH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 36 | VHH3-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 37 | VH2-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 38 | VH3-S34,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 39 | VHH3-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 40 | VHH3-S9,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 41 | VH3-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 42 | VH3-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 43 | VH3-S8,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 44 | VH3-S32,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 45 | VH3-S30,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 46 | vh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 47 | vh2-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 48 | vhh3-S1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 49 | VH3-S28,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 50 | VH3-S43,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 51 | VHH3-S16,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 52 | 
VH2-S6,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 53 | VH3-S13,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 54 | VH1-S2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 55 | VH2-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 56 | VH1-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 57 | VHH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 58 | VH3-S12,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 59 | VH3-S39,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 60 | VH3-S27,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 61 | VH3-S20,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 62 | VH3-S14,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 63 | VH3-S10,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMDMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 64 | VH2-S4,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 65 | VH3-S35,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 66 | VH3-S31,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 67 | VH3-S5,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 68 | VH2-S3,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 69 | VH3-S44,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 70 | 
vhh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 71 | vh3-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 72 | vh3-2,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMM 73 | vh1-1,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMMMM 74 | VH3-S11,MMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIMMMMMMMMMMMMMMMMMMMMMMMMMMMMMMIIIMMMMMMMMMM 75 | 76 | ighJ-1,IIMMMMMMMMMMMMM 77 | ighJ-2,IIIIMMMMMMMMMMMMM 78 | ighJ-3,IIIMMMMMMMMMMMMM 79 | ighJ-4,IIMMMMMMMMMMMMM 80 | ighJ-5,IIIMMMMMMMMMMMM 81 | ighJ-6,IIMMMMMMMMMMMMM 82 | ighJ-7,IIIIMMMMMMMMMMMMM 83 | -------------------------------------------------------------------------------- /riot_prefiltering/model.rs: -------------------------------------------------------------------------------- 1 | use pyo3::prelude::*; 2 | use serde::Serialize; 3 | use std::cmp::Ordering; 4 | 5 | pub type Kmer = Vec; 6 | pub type ByteSequence = Vec; 7 | pub type Coverage = i32; 8 | pub type RevComp = bool; 9 | pub type GeneId = String; 10 | 11 | pub struct KmerGeneIndexEntry { 12 | pub gene_id: GeneId, 13 | pub position: usize, 14 | } 15 | #[derive(PartialEq, Eq, Debug)] 16 | pub struct KmerMatch { 17 | pub target_position: usize, 18 | pub query_position: usize, 19 | } 20 | 21 | impl PartialOrd for KmerMatch { 22 | fn partial_cmp(&self, other: &Self) -> Option { 23 | Some(self.cmp(other)) 24 | } 25 | } 26 | 27 | impl Ord for KmerMatch { 28 | fn cmp(&self, other: &Self) -> Ordering { 29 | if self.target_position == other.target_position { 30 | return self.query_position.cmp(&other.query_position); 31 | } 32 | self.target_position.cmp(&other.target_position) 33 | } 34 | } 35 | 36 | #[pyclass] 37 | #[derive(PartialEq, Eq, Debug, Clone, Serialize)] 38 | pub struct GeneSegment { 39 | #[pyo3(get)] 40 | pub start_target: usize, 41 | #[pyo3(get)] 42 
| pub end_target: usize, 43 | #[pyo3(get)] 44 | pub start_query: usize, 45 | #[pyo3(get)] 46 | pub end_query: usize, 47 | #[pyo3(get)] 48 | pub coverage: Coverage, 49 | #[pyo3(get)] 50 | pub match_count: usize, 51 | #[pyo3(get)] 52 | pub gene_id: GeneId, 53 | } 54 | 55 | #[pymethods] 56 | impl GeneSegment { 57 | fn __str__(&self) -> PyResult { 58 | Ok(format!( 59 | "GeneSegment(target={}..{}, query={}..{}, coverage={}, matches={}, gene_id={})", 60 | self.start_target, self.end_target, self.start_query, self.end_query, self.coverage, self.match_count, self.gene_id 61 | )) 62 | } 63 | fn __repr__(&self) -> PyResult { 64 | Ok(format!( 65 | "GeneSegment(target={}..{}, query={}..{}, coverage={}, matches={}, gene_id={})", 66 | self.start_target, self.end_target, self.start_query, self.end_query, self.coverage, self.match_count, self.gene_id 67 | )) 68 | } 69 | } 70 | 71 | 72 | #[pyclass] 73 | #[derive(PartialEq, Eq, Debug, Clone, Serialize)] 74 | pub struct GeneMatch { 75 | #[pyo3(get)] 76 | pub gene_id: GeneId, 77 | #[pyo3(get)] 78 | pub rev_comp: RevComp, 79 | #[pyo3(get)] 80 | pub coverage: Coverage, 81 | } 82 | 83 | impl PartialOrd for GeneMatch { 84 | fn partial_cmp(&self, other: &Self) -> Option { 85 | Some(self.cmp(other)) 86 | } 87 | } 88 | 89 | impl Ord for GeneMatch { 90 | fn cmp(&self, other: &Self) -> Ordering { 91 | match other.coverage.cmp(&self.coverage) { 92 | std::cmp::Ordering::Equal => self.gene_id.cmp(&other.gene_id), 93 | x => x, 94 | } 95 | } 96 | } 97 | 98 | #[pymethods] 99 | impl GeneMatch { 100 | fn __str__(&self) -> PyResult { 101 | Ok(format!( 102 | "GeneMatch(gene_id={}, rev_comp={}, coverage={})", 103 | self.gene_id, self.rev_comp, self.coverage 104 | )) 105 | } 106 | fn __repr__(&self) -> PyResult { 107 | Ok(format!( 108 | "GeneMatch(gene_id={}, rev_comp={}, coverage={})", 109 | self.gene_id, self.rev_comp, self.coverage 110 | )) 111 | } 112 | } 113 | 114 | #[pyclass] 115 | pub struct PrefilteringResult { 116 | #[pyo3(get)] 117 | pub query: 
String, 118 | #[pyo3(get)] 119 | pub rev_comp_query: String, 120 | #[pyo3(get)] 121 | pub top_matches: Vec, 122 | } 123 | 124 | #[pymethods] 125 | impl PrefilteringResult { 126 | fn __str__(&self) -> PyResult { 127 | Ok(format!( 128 | "PrefilteringResult(query={}, rev_comp_query={}, top_matches={:?})", 129 | self.query, self.rev_comp_query, self.top_matches 130 | )) 131 | } 132 | fn __repr__(&self) -> PyResult { 133 | Ok(format!( 134 | "PrefilteringResult(query={}, rev_comp_query={}, top_matches={:?})", 135 | self.query, self.rev_comp_query, self.top_matches 136 | )) 137 | } 138 | } 139 | 140 | // New segment-centric data structures 141 | 142 | #[pyclass] 143 | #[derive(PartialEq, Debug, Clone, Serialize)] 144 | pub struct SegmentMatch { 145 | #[pyo3(get)] 146 | pub query_start: usize, 147 | #[pyo3(get)] 148 | pub query_end: usize, 149 | #[pyo3(get)] 150 | pub coverage: Coverage, 151 | #[pyo3(get)] 152 | pub match_count: usize, 153 | #[pyo3(get)] 154 | pub matching_genes: Vec, 155 | #[pyo3(get)] 156 | pub min_target_start: usize, 157 | #[pyo3(get)] 158 | pub max_target_start: usize, 159 | #[pyo3(get)] 160 | pub segment_start: usize, 161 | } 162 | 163 | impl SegmentMatch { 164 | pub(crate) fn recalculate_bounds_and_coverage(&mut self) { 165 | if self.matching_genes.is_empty() { 166 | self.query_start = 0; 167 | self.query_end = 0; 168 | self.coverage = 0; 169 | self.min_target_start = 0; 170 | self.max_target_start = 0; 171 | self.segment_start = 0; 172 | self.match_count = 0; 173 | return; 174 | } 175 | 176 | // Since GeneMatch no longer has coordinates, we only recalculate coverage 177 | // The segment bounds should already be set correctly when the segment is created 178 | let max_cov = self.matching_genes.iter().map(|gene| gene.coverage).max().unwrap_or(0); 179 | self.coverage = max_cov; 180 | self.match_count = self.matching_genes.len(); 181 | } 182 | 183 | } 184 | 185 | impl PartialOrd for SegmentMatch { 186 | fn partial_cmp(&self, other: &Self) -> Option { 
187 | Some(self.cmp(other)) 188 | } 189 | } 190 | 191 | impl Ord for SegmentMatch { 192 | fn cmp(&self, other: &Self) -> Ordering { 193 | // Sort by query position first (for natural domain order) 194 | self.coverage.cmp(&other.coverage) 195 | } 196 | } 197 | 198 | impl Eq for SegmentMatch {} 199 | 200 | #[pymethods] 201 | impl SegmentMatch { 202 | fn __str__(&self) -> PyResult { 203 | Ok(format!( 204 | "SegmentMatch(query={}..{}, length={}, coverage={}, matches={}, genes={})", 205 | self.query_start, self.query_end, self.query_length()?, self.coverage, self.match_count, self.matching_genes.len() 206 | )) 207 | } 208 | fn __repr__(&self) -> PyResult { 209 | Ok(format!( 210 | "SegmentMatch(query={}..{}, length={}, coverage={}, matches={}, genes={:?})", 211 | self.query_start, self.query_end, self.query_length()?, self.coverage, self.match_count, self.matching_genes 212 | )) 213 | } 214 | 215 | /// Get the length of the segment in query bases 216 | #[pyo3(name = "query_length")] 217 | fn query_length(&self) -> PyResult { 218 | Ok(self.query_end - self.query_start) 219 | } 220 | } 221 | 222 | #[pyclass] 223 | pub struct PrefilteringSegmentResult { 224 | #[pyo3(get)] 225 | pub query: String, 226 | #[pyo3(get)] 227 | pub rev_comp_query: String, 228 | #[pyo3(get)] 229 | pub segments: Vec, 230 | } 231 | 232 | #[pymethods] 233 | impl PrefilteringSegmentResult { 234 | fn __str__(&self) -> PyResult { 235 | Ok(format!( 236 | "PrefilteringSegmentResult(query={}, rev_comp_query={}, segments={})", 237 | self.query, self.rev_comp_query, self.segments.len() 238 | )) 239 | } 240 | fn __repr__(&self) -> PyResult { 241 | Ok(format!( 242 | "PrefilteringSegmentResult(query={}, segments={:?})", 243 | self.query, self.segments 244 | )) 245 | } 246 | 247 | /// Get the number of distinct domains detected 248 | #[pyo3(name = "domain_count")] 249 | fn domain_count(&self) -> PyResult { 250 | Ok(self.segments.len()) 251 | } 252 | } 253 | 
--------------------------------------------------------------------------------