├── autocnv ├── __init__.py ├── tests │ ├── __init__.py │ └── test_annotation.py ├── utils.py ├── __main__.py ├── database.py ├── settings.py └── annotate.py ├── requirements.txt ├── docs ├── 涉及数据库.xlsx └── 分析流程图V1.1.tiff ├── database-prepare ├── exon_prep.py └── database-create-flowchart.md ├── README.md └── .gitignore /autocnv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /autocnv/tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas~=1.2.5 2 | pyfaidx 3 | pysam~=0.16.0.1 4 | gtfparse~=1.2.1 -------------------------------------------------------------------------------- /docs/涉及数据库.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhonghua-wang/autocnv/HEAD/docs/涉及数据库.xlsx -------------------------------------------------------------------------------- /docs/分析流程图V1.1.tiff: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zhonghua-wang/autocnv/HEAD/docs/分析流程图V1.1.tiff -------------------------------------------------------------------------------- /autocnv/utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | from pysam import VariantRecord 3 | from json import JSONEncoder 4 | from autopvs1.strength import Strength 5 | from autocnv import settings 6 | 7 | class ACITEncoder(JSONEncoder): 8 | def default(self, o): 9 | if isinstance(o, VariantRecord): 10 | return [o.contig, o.pos, o.ref, o.alts] 11 | if isinstance(o, Strength): 12 | return o.name 13 | return JSONEncoder.default(self, o) 14 | -------------------------------------------------------------------------------- /database-prepare/exon_prep.py: -------------------------------------------------------------------------------- 1 | #%% 2 | from autocnv import settings 3 | from autocnv.database import DataBase 4 | from gtfparse import read_gtf 5 | import pandas as pd 6 | refGene_file = '/Users/zhonghua/data/gene-toolkit/raw-data/hg19.refGene.gtf.gz' 7 | #%% 8 | refGene_df = read_gtf(refGene_file) 9 | refGene_df = refGene_df[refGene_df['feature'] == 'exon'] 10 | #%% 11 | gene_db = pd.read_csv(settings.GENE_DATABASE, sep='\t') 12 | #%% 13 | 14 | merge_df = gene_db.merge(refGene_df, left_on='transcript', right_on='transcript_id')[[ 15 | '#chrom', 'start_y', 'end_y', 'gene_id_x', 'symbol', 'exon_number', 'transcript' 16 | ]] 17 | merge_df.columns = [x.strip('_x').strip('_y') for x in merge_df.columns] 18 | 19 | merge_df.to_csv(settings.GENE_EXON_DATABASE.strip('.gz'), sep='\t', index=False) 20 | #%% 21 | 22 | 23 | -------------------------------------------------------------------------------- /autocnv/__main__.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from autocnv.annotate import AnnotateHelper 3 | 4 | CNVKIT_COL_MAP = { 5 | 'Chr': 'chr', 'Start': 'start', 'End': 'end', 'Detect': 'type' 6 | } 7 | CNV_MAP = { 8 | 'Del': 'del', 'Dup': 'dup' 9 | } 10 | 11 | if __name__ == '__main__': 12 | ap = argparse.ArgumentParser() 13 | ap.add_argument( 14 | '--in', dest='input', required=True, 15 | help='input file (TSV format), columns: chr, start, end, type, error are required' 16 | ) 17 | ap.add_argument( 18 | '--out', dest='output', required=True, 19 | help='annotated result file path' 20 | ) 21 | ap.add_argument('--cnvkit', dest='cnvkit', help='use CNVkit result as input', action='store_true', default=False) 22 | args = ap.parse_args() 23 | anno = AnnotateHelper() 24 | if args.cnvkit: 25 | anno.annotation_file(args.input, args.output, col_map=CNVKIT_COL_MAP, cnv_map=CNV_MAP) 26 | else: 27 | anno.annotation_file(args.input, args.output) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # AutoCNV: an **Auto**matic **C**opy **N**umber **V**ariant Interpretation Tool 2 | 3 | ## Installation 4 | 5 | ### Pre-requirements 6 | 7 | - `pysam`: install `pysam` via conda: 8 | 9 | ```shell 10 | conda config --add channels r 11 | conda config --add channels bioconda 12 | conda install pysam 13 | ``` 14 | For detail install instruction of `pysam`, please refer to [the official document](https://pysam.readthedocs.io/en/latest/installation.html). 15 | 16 | - install [`autopvs1`](https://github.com/JiguangPeng/autopvs1) 17 | - install requirments `pip install -r requirements.txt` 18 | 19 | ### Usage 20 | please refer to the test cases. 21 | 22 | ### Citation 23 | 24 | ``` 25 | @article{Fan2021, 26 | author = {Fan, Chunna and Wang, Zhonghua and Sun, Yan and Sun, Jun and Liu, Xi and Kang, Licheng and Xu, Yingshuo and Yang, Manqiu and Dai, Wentao and Song, Lijie and Wei, Xiaoming and Xiang, Jiale and Huang, Hui and Zhou, Meizhen and Zeng, Fanwei and Huang, Lin and Xu, Zhengfeng and Peng, Zhiyu}, 27 | doi = {10.1186/s12864-021-08011-4}, 28 | isbn = {1286402108011}, 29 | issn = {14712164}, 30 | journal = {BMC Genomics}, 31 | keywords = {AutoCNV,CNV classification,CNV interpretation,Scoring}, 32 | number = {1}, 33 | pages = {1--12}, 34 | pmid = {34615484}, 35 | publisher = {BMC Genomics}, 36 | title = {{AutoCNV: a semiautomatic CNV interpretation system based on the 2019 ACMG/ClinGen Technical Standards for CNVs}}, 37 | volume = {22}, 38 | year = {2021} 39 | } 40 | ``` -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | env/ 12 | build/ 13 | develop-eggs/ 14 | dist/ 15 | downloads/ 16 | eggs/ 17 | .eggs/ 18 | lib/ 19 | lib64/ 20 | parts/ 21 | sdist/ 22 | var/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | 27 | # PyInstaller 28 | # Usually these files are written by a python script from a template 29 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 30 | *.manifest 31 | *.spec 32 | 33 | # Installer logs 34 | pip-log.txt 35 | pip-delete-this-directory.txt 36 | 37 | # Unit tests / coverage reports 38 | htmlcov/ 39 | .tox/ 40 | .coverage 41 | .coverage.* 42 | .cache 43 | nosetests.xml 44 | coverage.xml 45 | *,cover 46 | .hypothesis/ 47 | 48 | # Translations 49 | *.mo 50 | *.pot 51 | 52 | # Django stuff: 53 | *.log 54 | local_settings.py 55 | 56 | # Flask stuff: 57 | instance/ 58 | .webassets-cache 59 | 60 | # Scrapy stuff: 61 | .scrapy 62 | 63 | # Sphinx documentation 64 | docs/_build/ 65 | 66 | # PyBuilder 67 | target/ 68 | 69 | # IPython Notebook 70 | .ipynb_checkpoints 71 | 72 | # pyenv 73 | .python-version 74 | 75 | # celery beat schedule file 76 | celerybeat-schedule 77 | 78 | # dotenv 79 | .env 80 | 81 | # virtualenv 82 | venv/ 83 | ENV/ 84 | 85 | # Spyder project settings 86 | .spyderproject 87 | 88 | # Rope project settings 89 | .idea/ 90 | data/ 91 | .code/ 92 | .DS_Store 93 | test_data/ 94 | .vscode/ 95 | raw_data/ 96 | docs/ 97 | example/ 98 | test_debug.py -------------------------------------------------------------------------------- /autocnv/database.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | from collections import namedtuple 3 | import pysam 4 | 5 | 6 | class DataBase: 7 | def __init__(self, path): 8 | self._path = path 9 | self._tbx = pysam.Tabixfile(path) 10 | self._fields = namedtuple('Record', self._tbx.header[-1].strip('#').split('\t')) 11 | 12 | def fetch(self, chrom, start, end): 13 | """ 14 | 查找并生成给定基因组位置的记录 15 | :param chrom: 染色体编号 16 | :param start: 起始位置 17 | :param end: 终止位置 18 | :return: 记录生成器 19 | """ 20 | try: 21 | for record in self._tbx.fetch(chrom, start, end): 22 | chrom, start, end, *fields = record.split('\t') 23 | start, end = int(start), int(end) 24 | # 所得记录按照表头组装为namedtuple方便使用 25 | yield self._fields(chrom, start, end, *fields) 26 | except ValueError: 27 | yield from () 28 | 29 | def overlap(self, chrom, start, end): 30 | """ 31 | 查找并生成给定基因组位置的记录,同时计算两者之间的重叠程度 32 | :param chrom: 染色体编号 33 | :param start: 起始位置 34 | :param end: 终止位置 35 | :return: 记录与重叠程度生成器 36 | """ 37 | length = end - start 38 | for record in self.fetch(chrom, start, end): 39 | _, overlap_start, overlap_end, _ = sorted((start, end, record[1], record[2])) 40 | overlap = overlap_end - overlap_start 41 | yield record, overlap / length, overlap / (record[2] - record[1]) 42 | 43 | def overlap_groups(self, chrom, start, end, key=None): 44 | """ 45 | 查找并生成给定基因组位置的记录,同时计算两者之间的重叠程度,返回按照key方法分组的字典 46 | :param chrom: 染色体编号 47 | :param start: 起始位置 48 | :param end: 终止位置 49 | :param key: 分组依据 50 | :return: 分组后的记录字典 51 | """ 52 | groups = defaultdict(list) 53 | for record in self.overlap(chrom, start, end): 54 | groups[record if key is None else key(record)].append(record) 55 | return dict(groups) 56 | -------------------------------------------------------------------------------- /autocnv/settings.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DEFAULT_SCORE = { 4 | 'del': { 5 | '1A': 0, '1B': -0.6, 6 | '2A': 1, '2B': 0, 7 | '2C': -1, '2C-1': 0.9, '2C-2': 0, 8 | '2D': -1, '2D-1': 0, '2D-2': 0.9, '2D-3': 0.3, '2D-4': 0.9, 9 | '2E': 0, '2F': -1, '2G': 0, '2H': 0.15, '2I': 0, '2J': 0, '2K': 0.45, 10 | '3A': 0, '3B': 0.45, '3C': 0.9, 11 | '4O': -1, 12 | 'PVS1': 0.9, 'PVS1_S': 0.45, 'PVS1_M': 0.3, 'PVS1_P': 0.15, 'PVS1_U': 0 13 | }, 14 | 'dup': { 15 | '1A': 0, '1B': -0.6, 16 | '2A': 1, '2B': 0, 17 | '2C': -1, '2C-1': 0.9, '2C-2': 0, 18 | '2D': -1, '2D-1': 0, '2D-2': 0.9, '2D-3': 0.3, '2D-4': 0.9, 19 | '2E': 0, '2F': -1, '2G': 0, '2H': 0, '2I': 0, '2J': 0, '2K': 0.45, '2L': 0, 20 | '3A': 0, '3B': 0.45, '3C': 0.9, 21 | '4O': -1, 22 | 'PVS1': 0.9, 'PVS1_S': 0.45, 'PVS1_M': 0.3, 'PVS1_P': 0.15, 'PVS1_U': 0 23 | } 24 | } 25 | 26 | 27 | 28 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) 29 | 30 | CYTO_BAND_FILE = os.path.join(BASE_DIR, 'data', 'cyto-band.bed.gz') 31 | 32 | GENE_EXON_DATABASE = os.path.join(BASE_DIR, 'data', 'exon.sorted.bed.gz') 33 | 34 | GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'gene.sorted.bed.gz') 35 | 36 | OMIM_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'omim-gene.sorted.bed.gz') 37 | 38 | FUNC_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'func-region.sorted.gz') 39 | 40 | HI_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-gene.sorted.bed.gz') 41 | 42 | HI_EXON_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-exon.sorted.bed.gz') 43 | 44 | HI_CDS_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-cds.sorted.bed.gz') 45 | 46 | CLINVAR_PATHOGENIC_DATABASE = os.path.join( 47 | BASE_DIR, 'data', 'clinvar-pathogenic.sorted.vcf.gz' 48 | ) 49 | 50 | UHI_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'uhi-gene.sorted.bed.gz') 51 | 52 | HI_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-region.sorted.bed.gz') 53 | 54 | UHI_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'uhi-region.sorted.bed.gz') 55 | 56 | DECIPHER_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'decipher-gene.sorted.bed.gz') 57 | 58 | TS_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'ts-gene.sorted.bed.gz') 59 | 60 | TS_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'ts-region.sorted.bed.gz') 61 | 62 | UTS_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'uts-gene.sorted.bed.gz') 63 | 64 | UTS_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'uts-region.sorted.bed.gz') 65 | 66 | DGV_GAIN_DATABASE = os.path.join(BASE_DIR, 'data', 'dgv-gain.sorted.bed.gz') 67 | 68 | DGV_LOSS_DATABASE = os.path.join(BASE_DIR, 'data', 'dgv-loss.sorted.bed.gz') 69 | 70 | GNOMAD_DEL_DATABASE = os.path.join(BASE_DIR, 'data', 'gnomad-del.sorted.bed.gz') 71 | 72 | GNOMAD_DUP_DATABASE = os.path.join(BASE_DIR, 'data', 'gnomad-dup.sorted.bed.gz') 73 | 74 | CNV_SYNDROME_DEL_DATABASE = os.path.join(BASE_DIR, 'data', 'cnv-syndrome-del.bed.gz') 75 | CNV_SYNDROME_DUP_DATABASE = os.path.join(BASE_DIR, 'data', 'cnv-syndrome-dup.bed.gz') 76 | 77 | 78 | try: 79 | from autocnv.local_settings import * 80 | except ImportError: 81 | pass 82 | -------------------------------------------------------------------------------- /autocnv/tests/test_annotation.py: -------------------------------------------------------------------------------- 1 | from autocnv.annotate import AnnotateHelper 2 | 3 | annotate = AnnotateHelper() 4 | 5 | 6 | def test_annotation1(): 7 | # 包含TBX5基因全部(基因 zoom out 3x) 8 | annotation = annotate.annotate('chr12', 114737220, 114900761, 'del') 9 | assert '2A' in annotation['rules'] 10 | 11 | 12 | def test_annotation2(): 13 | # TBX5基因5’端,覆盖CDS序列 14 | annotation = annotate.annotate('chr12', 114800000, 114900761, 'del') 15 | assert '2C-1' in annotation['rules'] 16 | 17 | 18 | def test_annotation3(): 19 | # TBX5基因5’端,不覆盖CDS序列 20 | annotation = annotate.annotate('chr12', 114842000, 114900761, 'del') 21 | assert '2C-2' in annotation['rules'] 22 | 23 | 24 | def test_annotation4(): 25 | # TBX5基因3'端,仅覆盖UTR区 26 | annotation = annotate.annotate('chr12', 114737220, 114793000, 'del') 27 | assert '2D-1' in annotation['rules'] 28 | 29 | 30 | def test_annotation5(): 31 | # TBX5基因3'端,仅覆盖末位外显子(该外显子含低频致病变异) 32 | annotation = annotate.annotate('chr12', 114737220, 114800000, 'del') 33 | assert '2D-2' in annotation['rules'] 34 | 35 | 36 | def test_annotation6(): 37 | # TBX5基因3'端,覆盖多个exon 38 | annotation = annotate.annotate('chr12', 114737220, 114835000, 'del') 39 | assert '2D-4' in annotation['rules'] 40 | 41 | 42 | def test_annotation7(): 43 | # 包含TBX5中间3个exon 44 | annotation = annotate.annotate('chr12', 114800000, 114835000, 'del') 45 | assert '2E' in annotation['rules'] 46 | 47 | 48 | def test_annotation8(): 49 | # TBX5基因5’端,不覆盖CDS序列;TBX3基因3'端,仅覆盖末位外显子(该外显子含低频致病变异) 50 | annotation = annotate.annotate('chr12', 114842000, 115111000, 'del') 51 | assert '2C-2' in annotation['rules'] 52 | assert '2D-2' in annotation['rules'] 53 | assert '4O' not in annotation['rules'] 54 | 55 | 56 | def test_annotation9(): 57 | # TBX5基因5’端,不覆盖CDS序列;TBX3基因3'端,覆盖多个外显子 58 | annotation = annotate.annotate('chr12', 114842000, 115114000, 'del') 59 | assert '2C-2' in annotation['rules'] 60 | assert '2D-4' in annotation['rules'] 61 | 62 | 63 | def test_annotation10(): 64 | # TBX5基因5’端,覆盖CDS序列;TBX3基因3'端,仅覆盖末位外显子(该外显子含低频致病变异) 65 | annotation = annotate.annotate('chr12', 114800000, 115111000, 'del') 66 | assert '2C-1' in annotation['rules'] 67 | assert '2D-2' in annotation['rules'] 68 | 69 | 70 | def test_annotation11(): 71 | # TBX5基因5’端,覆盖CDS序列;TBX3基因3'端,覆盖多个外显子 72 | annotation = annotate.annotate('chr12', 114800000, 115114000, 'del') 73 | assert '2C-1' in annotation['rules'] 74 | assert '2D-4' in annotation['rules'] 75 | 76 | 77 | def test_annotation12(): 78 | # 包含MECP2基因全部(基因范围 zoom out 1.5x) 79 | annotation = annotate.annotate('chrX', 153268282, 153382170, 'dup') 80 | assert '2A' in annotation['rules'] 81 | assert '2H' in annotation['rules'] 82 | 83 | 84 | def test_annotation13(): 85 | # MECP2基因内CNV 86 | annotation = annotate.annotate('chrX', 153300000, 153363100, 'dup') 87 | assert '2I' in annotation['rules'] 88 | 89 | 90 | def test_4O(): 91 | annotation = annotate.annotate('chr1', 196757278, 196796716, 'del') 92 | assert '4O' in annotation['rules'] 93 | annotation = annotate.annotate('chr15', 22750305, 23226254, 'del') 94 | assert '4O' not in annotation['rules'] 95 | annotation = annotate.annotate('chr1', 25584597, 25767647, 'del') 96 | assert '4O' not in annotation['rules'] 97 | annotation = annotate.annotate('chr1', 148974342, 149441884, 'dup') 98 | assert '4O' in annotation['rules'] 99 | annotation = annotate.annotate('chr15', 22750305, 23226254, 'dup') 100 | assert '4O' not in annotation['rules'] 101 | annotation = annotate.annotate('chr1', 425, 69091, 'dup') 102 | assert '4O' not in annotation['rules'] 103 | 104 | def test_syndrome(): 105 | annotation = annotate.annotate('chr23', 6420555, 8153336, 'del') 106 | 107 | 108 | def test_random(): 109 | annotation = annotate.annotate('chr11', 45904399, 46480747, 'del') 110 | assert '2H' in annotation['rules'] 111 | -------------------------------------------------------------------------------- /database-prepare/database-create-flowchart.md: -------------------------------------------------------------------------------- 1 | # ACIT数据库生成流程 2 | 3 | ## 基因 (Gene) 4 | 5 | ```mermaid 6 | graph TD 7 | 8 | classDef input fill:#e9ffef 9 | 10 | classDef output fill:#ffebe9 11 | 12 | classDef variable fill:#feffe9 13 | 14 | subgraph prepare gene 15 | refgene("refgene") 16 | class refgene input 17 | 18 | filter_length["filter max CDS length"] 19 | refgene --> filter_length 20 | 21 | geneinfo("geneinfo") 22 | class geneinfo input 23 | 24 | merge["merge by name2 & Symbol"] 25 | filter_length --> merge 26 | geneinfo --> merge 27 | 28 | filter_protein["filter type_of_gene == protein-coding"] 29 | merge --> filter_protein 30 | 31 | gene("gene") 32 | class gene output 33 | filter_protein --> gene 34 | end 35 | 36 | subgraph prepare omim gene 37 | omim_gene_list("omim gene list") 38 | class omim_gene_list input 39 | 40 | filter_omim["filter gene linked with disease by omim"] 41 | gene --> filter_omim 42 | omim_gene_list --> filter_omim 43 | 44 | omim_gene("omim_gene") 45 | class omim_gene output 46 | filter_omim --> omim_gene 47 | end 48 | ``` 49 | 50 | ## 单倍体敏感基因(HI Gene) 51 | 52 | ```mermaid 53 | graph TD 54 | 55 | classDef input fill:#e9ffef 56 | 57 | classDef output fill:#ffebe9 58 | 59 | classDef variable fill:#feffe9 60 | 61 | subgraph prepare hi gene 62 | curation_gene("curation_gene") 63 | class curation_gene input 64 | 65 | gene("gene") 66 | class gene variable 67 | 68 | filter_hi["filter Haploinsufficiency Score == 3"] 69 | curation_gene --> filter_hi 70 | 71 | merge["merge by Gene Symbol & name2"] 72 | filter_hi --> merge 73 | gene --> merge 74 | 75 | hi_gene("hi_gene") 76 | class hi_gene output 77 | merge --> hi_gene 78 | 79 | 80 | filter_uhi["filter Haploinsufficiency Score == 40"] 81 | curation_gene --> filter_uhi 82 | 83 | uhi_gene("uhi_gene") 84 | class uhi_gene output 85 | gene --> uhi_gene 86 | filter_uhi --> uhi_gene 87 | end 88 | 89 | subgraph prepare hi exon 90 | exon("extrace last exon") 91 | hi_gene --> exon 92 | 93 | hi_exon("hi_exon") 94 | class hi_exon output 95 | exon --> hi_exon 96 | end 97 | 98 | subgraph prepare clinvar pathogenic variants 99 | all_variants("all clinical pathogenic variants") 100 | class all_variants input 101 | 102 | exon_variants["filter variant in last exon"] 103 | all_variants --> exon_variants 104 | hi_exon --> exon_variants 105 | 106 | pathogenic_exon_variants("variants") 107 | class pathogenic_exon_variants output 108 | exon_variants --> pathogenic_exon_variants 109 | end 110 | 111 | subgraph prepare hi cds 112 | cds["extract CDS"] 113 | hi_gene --> cds 114 | 115 | hi_cds("hi_cds") 116 | class hi_cds output 117 | cds --> hi_cds 118 | end 119 | 120 | ``` 121 | 122 | ## 多倍体敏感基因 (TS Gene) 123 | 124 | ```mermaid 125 | graph TD 126 | 127 | classDef input fill:#e9ffef 128 | 129 | classDef output fill:#ffebe9 130 | 131 | classDef variable fill:#feffe9 132 | 133 | subgraph prepare ts gene 134 | curation_gene("curation_gene") 135 | class curation_gene input 136 | 137 | gene("gene") 138 | class gene variable 139 | 140 | filter_ts["filter Triplosensitivity Score == 3"] 141 | curation_gene --> filter_ts 142 | 143 | merge_ts["merge by Gene Symbol & name2"] 144 | filter_ts --> merge_ts 145 | gene --> merge_ts 146 | 147 | ts_gene("ts_gene") 148 | class ts_gene output 149 | merge_ts --> ts_gene 150 | 151 | filter_uts["filter Triplosensitivity Score == 40"] 152 | curation_gene --> filter_uts 153 | 154 | merge_uts["merge by Gene Symbol & name2"] 155 | gene --> merge_uts 156 | filter_uts --> merge_uts 157 | 158 | uts_gene("uts_gene") 159 | class uts_gene output 160 | merge_uts --> uts_gene 161 | end 162 | ``` 163 | 164 | ## 单倍体敏感区域 (HI region) 165 | 166 | ```mermaid 167 | graph TD 168 | 169 | classDef input fill:#e9ffef 170 | 171 | classDef output fill:#ffebe9 172 | 173 | classDef variable fill:#feffe9 174 | 175 | subgraph prepare hi region 176 | curation("curation_region") 177 | class curation input 178 | 179 | filter_hi["filter Haploisufficiency Score == 3"] 180 | curation --> filter_hi 181 | 182 | hi_region("hi_region") 183 | class hi_region output 184 | filter_hi --> hi_region 185 | 186 | filter_uhi["filter Haploisufficiency Score == 40"] 187 | curation --> filter_uhi 188 | 189 | gene("gene") 190 | class gene variable 191 | 192 | fetch_gene["fetch overlap gene"] 193 | gene --> fetch_gene 194 | filter_uhi --> fetch_gene 195 | 196 | uhi_region("uhi_region") 197 | class uhi_region output 198 | fetch_gene --> uhi_region 199 | end 200 | ``` 201 | 202 | ## 多倍体敏感区域 (TS region) 203 | 204 | ```mermaid 205 | graph TD 206 | 207 | classDef input fill:#e9ffef 208 | 209 | classDef output fill:#ffebe9 210 | 211 | classDef variable fill:#feffe9 212 | 213 | subgraph prepare ts region 214 | curation("curation_region") 215 | class curation input 216 | 217 | filter_ts["filter Triplosensitivity Score == 3"] 218 | curation --> filter_ts 219 | 220 | omim_gene("omim_gene") 221 | class omim_gene variable 222 | 223 | fetch_omim_gene["fetch overlap omim gene"] 224 | filter_ts --> fetch_omim_gene 225 | omim_gene --> fetch_omim_gene 226 | 227 | ts_region("ts_region") 228 | class ts_region output 229 | fetch_omim_gene --> ts_region 230 | 231 | filter_uts["filter Triplosensitivity Score == 40"] 232 | curation --> filter_uts 233 | 234 | gene("gene") 235 | class gene variable 236 | 237 | fetch_gene["fetch overlap gene"] 238 | filter_uts --> fetch_gene 239 | gene --> fetch_gene 240 | 241 | uts_region("uts_region") 242 | class uts_region output 243 | fetch_gene --> uts_region 244 | end 245 | ``` 246 | 247 | ## 预测基因 (decipher) 248 | 249 | ```mermaid 250 | graph TD 251 | 252 | classDef input fill:#e9ffef 253 | 254 | classDef output fill:#ffebe9 255 | 256 | classDef variable fill:#feffe9 257 | 258 | subgraph prepare decipher 259 | predictions("decipher") 260 | class predictions input 261 | 262 | gene("gene") 263 | class gene variable 264 | 265 | merge["merge by sybol & name2"] 266 | predictions --> merge 267 | gene --> merge 268 | 269 | gnomad("gnomad") 270 | class gnomad input 271 | 272 | join_pli["join pLI by name2"] 273 | gnomad --> join_pli 274 | merge --> join_pli 275 | 276 | join_lof["join oe_lof_upper by name2"] 277 | gnomad --> join_lof 278 | join_pli --> join_lof 279 | 280 | filter["filter pLI >= 0.9 & hi_index < 10% & oe_lof_upper < 0.35"] 281 | join_lof --> filter 282 | 283 | decipher("decipher") 284 | class decipher output 285 | filter --> decipher 286 | end 287 | ``` 288 | 289 | ## control 290 | 291 | ```mermaid 292 | graph TD 293 | 294 | classDef input fill:#e9ffef 295 | 296 | classDef output fill:#ffebe9 297 | 298 | classDef variable fill:#feffe9 299 | 300 | subgraph prepare gnomad 301 | 302 | gnomad("gnomad") 303 | class gnomad input 304 | 305 | filter_qc["filter FILTER == PASS & svtype in (DEL, DUP)"] 306 | gnomad --> filter_qc 307 | 308 | subgraph af filters 309 | filter_af["filter N_BI_GENOS >= 1000"] 310 | 311 | filter_afr["filter AFR_N_BI_GENOS >= 1000"] 312 | filter_af -. or .-> filter_afr 313 | 314 | filter_amr["filter AMR_N_BI_GENOS >= 1000"] 315 | filter_afr -. or .-> filter_amr 316 | 317 | filter_eas["filter EAS_N_BI_GENOS >= 1000"] 318 | filter_amr -. or .-> filter_eas 319 | 320 | filter_eur["filter EUR_N_BI_GENOS >= 1000"] 321 | filter_eas -. or .-> filter_eur 322 | end 323 | filter_qc --> filter_af 324 | 325 | fetch_gene_gnomad["fetch ovalap gene"] 326 | filter_eur --> fetch_gene_gnomad 327 | 328 | filter_del["filter svtype == DEL"] 329 | fetch_gene_gnomad --> filter_del 330 | 331 | gnomad_del("gnomad_del") 332 | class gnomad_del output 333 | filter_del --> gnomad_del 334 | 335 | filter_dup["filter svtype == DUP"] 336 | fetch_gene_gnomad --> filter_dup 337 | 338 | gnomad_dup("gnomad_dup") 339 | class gnomad_dup output 340 | filter_dup --> gnomad_dup 341 | end 342 | 343 | subgraph prepare dgv 344 | dgv("dgv") 345 | class dgv input 346 | 347 | filter_dgv["filter freq >= 1% & sample >= 1000"] 348 | dgv --> filter_dgv 349 | 350 | fetch_gene_dgv["fetch overlap gene"] 351 | filter_dgv --> fetch_gene_dgv 352 | 353 | filter_gain["filter type == Gain"] 354 | fetch_gene_dgv --> filter_gain 355 | 356 | dgv_gain("dgv_gain") 357 | class dgv_gain output 358 | filter_gain --> dgv_gain 359 | 360 | filter_loss["filter type == Loss"] 361 | fetch_gene_dgv --> filter_loss 362 | 363 | dgv_loss("dgv_loss") 364 | class dgv_loss output 365 | filter_loss --> dgv_loss 366 | end 367 | ``` 368 | 369 | -------------------------------------------------------------------------------- /autocnv/annotate.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from autocnv.database import DataBase 4 | from pysam import VariantFile 5 | from autocnv import settings 6 | from autopvs1.cnv import CNVRecord, PVS1CNV 7 | from autopvs1.utils import get_transcript 8 | from autopvs1.read_data import transcripts 9 | from autopvs1.strength import Strength 10 | from collections import defaultdict 11 | from itertools import chain 12 | import operator 13 | import pandas as pd 14 | 15 | SEP = '\n' 16 | DEFAULT_EMPTY_VALUE = '-' 17 | NAME_MAP = {'chr': 'chromosome', 'type': 'func'} 18 | 19 | PVS1 = { 20 | Strength.VeryStrong: 'PVS1', Strength.Strong: 'PVS1_S', Strength.Moderate: 'PVS1_M', 21 | Strength.Supporting: 'PVS1_P', Strength.Unmet: 'PVS1_U' 22 | } 23 | 24 | # 计分分组配置,同一组证据仅计算最大分值 25 | SCORE_GROUP = { 26 | 'del': { 27 | rule: 'G1' for rule in ('2A', '2B', '2C-1', '2C-2', '2D-1', '2D-2', '2D-3', '2D-4', '2E') 28 | }, 29 | 'dup': {} 30 | } 31 | 32 | # 致病性判断分级配置 33 | PATHOGENICITY_LEVELS = [ 34 | (operator.ge, 0.99, 'P'), (operator.ge, 0.9, 'LP'), (operator.gt, -0.9, 'VUS'), 35 | (operator.gt, -0.99, 'LB'), (operator.le, -0.99, 'B') 36 | ] 37 | 38 | 39 | class AnnotateHelper: 40 | def __init__(self): 41 | self._gene_database = DataBase(settings.GENE_DATABASE) 42 | self._omim_gene_database = DataBase(settings.OMIM_GENE_DATABASE) 43 | self._func_region_database = DataBase(settings.FUNC_REGION_DATABASE) 44 | self._hi_gene_database = DataBase(settings.HI_GENE_DATABASE) 45 | self._hi_exon_database = DataBase(settings.HI_EXON_DATABASE) 46 | self._hi_cds_database = DataBase(settings.HI_CDS_DATABASE) 47 | self._clinvar_pathogenic_database = VariantFile( 48 | settings.CLINVAR_PATHOGENIC_DATABASE) 49 | self._uhi_gene_database = DataBase(settings.UHI_GENE_DATABASE) 50 | self._hi_region_database = DataBase(settings.HI_REGION_DATABASE) 51 | self._uhi_region_database = DataBase(settings.UHI_REGION_DATABASE) 52 | self._decipher_gene_database = DataBase( 53 | settings.DECIPHER_GENE_DATABASE) 54 | self._ts_gene_database = DataBase(settings.TS_GENE_DATABASE) 55 | self._ts_region_database = DataBase(settings.TS_REGION_DATABASE) 56 | self._uts_gene_database = DataBase(settings.UTS_GENE_DATABASE) 57 | self._uts_region_database = DataBase(settings.UTS_REGION_DATABASE) 58 | self._dgv_gain_database = DataBase(settings.DGV_GAIN_DATABASE) 59 | self._dgv_loss_database = DataBase(settings.DGV_LOSS_DATABASE) 60 | self._gnomad_del_database = DataBase(settings.GNOMAD_DEL_DATABASE) 61 | self._gnomad_dup_database = DataBase(settings.GNOMAD_DUP_DATABASE) 62 | self._cnv_syndrome_del_database = DataBase(settings.CNV_SYNDROME_DEL_DATABASE) 63 | self._cnv_syndrome_dup_database = DataBase(settings.CNV_SYNDROME_DUP_DATABASE) 64 | self._cytoband_database = DataBase(settings.CYTO_BAND_FILE) 65 | self._exon_database = DataBase(settings.GENE_EXON_DATABASE) 66 | 67 | self.serializer = self._serializer # func compatible 68 | 69 | @staticmethod 70 | def _chrom_num(chrom): 71 | return re.sub('chr', '', str(chrom), flags=re.I) 72 | 73 | @staticmethod 74 | def _norm_chrom(ch): 75 | """ 76 | normalize chromosome name, eg. 2 -> chr2, 23 -> chrX 77 | :param ch: input chromosome name 78 | :return: normalized name 79 | >>> norm_chrom(2) 80 | 'chr2' 81 | >>> norm_chrom('Chr23') 82 | 'chrX' 83 | """ 84 | ch = AnnotateHelper._chrom_num(ch) 85 | if ch == '23': 86 | return 'chrX' 87 | if ch == '24': 88 | return 'chrY' 89 | return f'chr{ch}' 90 | 91 | @staticmethod 92 | def _annotate_loss(**annotation): 93 | """ 94 | 计算拷贝数减少的CNV的证据项 95 | :param annotation: 已注释的CNV 96 | :return: 注释后的CNV 97 | """ 98 | loss = dict() 99 | 100 | # Section 1 101 | 102 | if len(annotation['outer_overlap_genes']) + len(annotation['overlap_func_regions']) > 0: 103 | loss['1A'] = True 104 | else: 105 | loss['1B'] = True 106 | 107 | # Section 2 108 | 109 | # hi区域 110 | for region, overlap, coverage in annotation['overlap_hi_regions']: 111 | if coverage == 1: # 完全覆盖区域 112 | loss['2A'] = True 113 | elif len(set(gene.symbol for gene, *_ in annotation['overlap_hi_genes'])) == 0: 114 | # 未覆盖hi基因 115 | loss['2B'] = True 116 | 117 | # hi基因 118 | for gene, overlap, coverage in annotation['overlap_hi_genes']: 119 | if coverage == 1: # 完全覆盖基因 120 | loss['2A'] = True 121 | elif overlap < 1: # 是否位于基因内部 122 | if any( 123 | exon.last_exon == 'True' 124 | for exon, *_ in annotation['overlap_hi_exons'][gene.gene_id] 125 | ): # 是否覆盖末位外显子 126 | if len(annotation['overlap_hi_exons'][gene.gene_id]) >= 2: 127 | # 覆盖超过两个外显子 128 | loss['2D-4'] = True 129 | elif gene.gene_id in annotation['overlap_hi_cds'] \ 130 | and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0: # 是否覆盖CDS 131 | if len(annotation['variants']) > 0: # 末位外显子是否有致病变异 132 | loss['2D-2'] = True 133 | else: # 末尾外显子无致病变异 134 | loss['2D-3'] = True 135 | else: 136 | # 不覆盖CDS区 137 | loss['2D-1'] = True 138 | # 未覆盖末位外显子 139 | elif gene.gene_id in annotation['overlap_hi_cds'] \ 140 | and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0: # 是否覆盖5'端CDS 141 | loss['2C-1'] = True 142 | else: # 未覆盖5'端CDS 143 | loss['2C-2'] = True 144 | # 位于基因内部 145 | else: 146 | cnv = CNVRecord( 147 | annotation['chromosome'], annotation['inner_start'], 148 | annotation['inner_end'], annotation['func'] 149 | ) 150 | tx = get_transcript(gene.transcript, transcripts) 151 | pvs1 = PVS1CNV(cnv, None, tx) 152 | loss['2E'] = True 153 | # loss[PVS1[pvs1.verify_DEL()[0]]] = True 154 | try: # HOTFIX: pvs1 error 155 | loss['pvs1'] = PVS1[pvs1.verify_DEL()[0]] 156 | except: 157 | print(cnv) 158 | 159 | # 包含预测HI基因 160 | if len(annotation['overlap_hi_genes']) + len(annotation['overlap_hi_regions']) == 0 \ 161 | and len(annotation['overlap_decipher_genes']) > 0: 162 | loss['2H'] = True 163 | 164 | # 落入uhi基因 165 | for gene, overlap, coverage in annotation['overlap_uhi_genes']: 166 | if overlap == 1: 167 | loss['2F'] = True 168 | 169 | # 落入uhi区域 170 | genes = set(gene.symbol for gene, * 171 | _ in annotation['outer_overlap_genes']) 172 | for region, overlap, coverage in annotation['overlap_uhi_regions']: 173 | if len(genes - set(region.genes.split(','))) > 0: 174 | loss['2G'] = True 175 | else: 176 | loss['2F'] = True 177 | 178 | # Section 3 179 | 180 | # 覆盖基因个数 181 | gene_count = len(annotation['outer_overlap_genes']) 182 | if gene_count >= 35: 183 | loss['3C'] = True 184 | elif gene_count >= 25: 185 | loss['3B'] = True 186 | elif gene_count >= 0: 187 | loss['3A'] = True 188 | 189 | # Section 4 190 | 191 | # DGV金标和Gnomad 192 | genes = set(gene.symbol for gene, * 193 | _ in annotation['outer_overlap_genes']) 194 | l, m = 0, 0 195 | for record, overlap, coverage in chain( 196 | annotation['dgv_loss_records'], annotation['gnomad_del_records'] 197 | ): 198 | if overlap == 1 and any( 199 | float(v) >= 0.01 for f, v in record._asdict().items() if f.startswith('af') 200 | ): # 完全覆盖待解读CNV且频率大于1% 201 | loss['4O'] = True 202 | break 203 | elif overlap >= 0.5 and len(genes - set(record.genes.split(','))) == 0: 204 | # 与待解读CNV重叠超过50%且覆盖全部蛋白编码基因 205 | if any(float(v) < 0.01 for f, v in record._asdict().items() if f.startswith('af')): 206 | # 频率小于1% 207 | m += 1 208 | else: 209 | # 频率大于1% 210 | l += 1 211 | else: 212 | if l > 0 and m == 0: # 存在频率大于1%且不存在小于1%的CNV 213 | loss['4O'] = True 214 | 215 | annotation['rules'] = loss 216 | return annotation 217 | 218 | @staticmethod 219 | def _annotate_gain(**annotation): 220 | """ 221 | 计算拷贝数减少的CNV的证据项 222 | :param annotation: 已注释的CNV 223 | :return: 注释后的CNV 224 | """ 225 | gain = dict() 226 | 227 | # Section 1 228 | 229 | if len(annotation['outer_overlap_genes']) + len(annotation['overlap_func_regions']) > 0: 230 | gain['1A'] = True 231 | else: 232 | gain['1B'] = True 233 | 234 | # Section 2 235 | 236 | # 完全覆盖ts区域 237 | for region, overlap, coverage in annotation['overlap_ts_regions']: 238 | if coverage == 1: # 是否覆盖整改区域 239 | gain['2A'] = True 240 | elif len(set(gene.symbol for gene, *_ in annotation['overlap_ts_genes'])) == 0: 241 | # 未覆盖ts基因 242 | gain['2B'] = True 243 | 244 | for gene, overlap, coverage in annotation['overlap_ts_genes']: 245 | # 覆盖整个基因 246 | if coverage == 1: 247 | gain['2A'] = True 248 | 249 | # 落入uts基因 250 | for gene, overlap, coverage in annotation['overlap_uts_genes']: 251 | if overlap == 1: 252 | gain['2D'] = True 253 | 254 | # 落入uts区域 255 | for region, overlap, coverage in annotation['overlap_uts_regions']: 256 | genes = set(gene.symbol for gene, * 257 | _ in annotation['inner_overlap_genes']) 258 | region_genes = set(region.genes.split(',')) 259 | if overlap == coverage == 1: # 与良性区域完全一致 260 | gain['2C'] = True 261 | elif len(genes - region_genes) > 0: # 编码蛋白基因比良性区域多 262 | gain['2G'] = True 263 | # 破坏蛋白编码基因 264 | elif any(c < 1 for *_, c in annotation['inner_overlap_genes']): 265 | gain['2E'] = True 266 | elif overlap == 1: # 被良性区域完全覆盖 267 | gain['2D'] = True 268 | else: 269 | gain['2F'] = True 270 | 271 | # hi基因 272 | hi_genes = set() 273 | for gene, overlap, coverage in annotation['overlap_hi_genes']: 274 | hi_genes.add(gene.symbol) 275 | if coverage == 1: # 完全覆盖 276 | gain['2H'] = True 277 | elif overlap == 1: # 两端均位于基因内 278 | cnv = CNVRecord( 279 | annotation['chromosome'], annotation['inner_start'], 280 | annotation['inner_end'], annotation['func'] 281 | ) 282 | tx = get_transcript(gene.transcript, transcripts) 283 | pvs1 = PVS1CNV(cnv, None, tx) 284 | gain['2I'] = True 285 | # gain[PVS1[pvs1.verify_DUP()[0]]] = True 286 | gain['pvs1'] = PVS1[pvs1.verify_DUP()[0]] 287 | 288 | # 非hi基因 289 | for gene, overlap, coverage in annotation['inner_overlap_genes']: 290 | if gene.symbol not in hi_genes and coverage != 1: 291 | gain['2L'] = True 292 | annotation['break_point_genes'].append(gene.symbol) 293 | 294 | # Section 3 295 | 296 | # 覆盖基因个数 297 | gene_count = len(annotation['inner_overlap_genes']) 298 | if gene_count >= 50: 299 | gain['3C'] = True 300 | elif gene_count >= 35: 301 | gain['3B'] = True 302 | elif gene_count >= 0: 303 | gain['3A'] = True 304 | 305 | # Section 4 306 | 307 | # DGV金标和Gnomad 308 | genes = set(gene.symbol for gene, * 309 | _ in annotation['outer_overlap_genes']) 310 | l, m = 0, 0 311 | for record, overlap, coverage in chain( 312 | annotation['dgv_gain_records'], annotation['gnomad_dup_records'] 313 | ): 314 | if overlap == 1 and any( 315 | float(v) >= 0.01 for f, v in record._asdict().items() if f.startswith('af') 316 | ): # 完全覆盖待解读CNV且频率大于1% 317 | gain['4O'] = True 318 | break 319 | elif overlap >= 0.5 and len(genes - set(record.genes.split(','))) == 0: 320 | # 与待解读CNV重叠超过50%且覆盖全部蛋白编码基因 321 | if any(float(v) < 0.01 for f, v in record._asdict().items() if 322 | f.startswith('af')): 323 | # 频率小于1% 324 | m += 1 325 | else: 326 | # 频率大于1% 327 | l += 1 328 | else: 329 | if l > 0 and m == 0: # 存在频率大于1%且不存在小于1%的CNV 330 | gain['4O'] = True 331 | 332 | annotation['rules'] = gain 333 | return annotation 334 | 335 | @staticmethod 336 | def merge_score(func, **rules): 337 | """ 338 | 整合所有证据项得分 339 | :param func: 变异类型 340 | :param rules: 证据项 341 | :return: 生成各证据项得分 342 | """ 343 | groups = defaultdict(list) 344 | for rule, score in rules.items(): 345 | try: # 需要分组计分的证据项先收集起来 346 | groups[SCORE_GROUP[func][rule]].append(score) 347 | except KeyError: # 无需分组计分的证据项直接计分 348 | yield score 349 | for _, scores in groups.items(): # 分组计分的证据项只计算最大分值 350 | yield max(scores) 351 | 352 | @staticmethod 353 | def judge(func, **rules): 354 | """ 355 | 判断给定的证据项组合最终的致病性 356 | :param func: 变异类型 357 | :param rules: 勾选的证据项 358 | :return: 证据项、得分和致病性 359 | """ 360 | # 获取所有证据项得分 361 | # rules = { 362 | # rule: settings.DEFAULT_SCORE[func][rule] for rule, check in rules.items() if check 363 | # } 364 | rules_value = {} 365 | for rule, check in rules.items(): 366 | if check in PVS1.values(): 367 | rules_value['pvs1'] = settings.DEFAULT_SCORE[func][check] 368 | elif check: 369 | rules_value[rule] = settings.DEFAULT_SCORE[func][rule] 370 | # 整合所有证据项得分 371 | score = sum(AnnotateHelper.merge_score(func, **rules_value)) 372 | # 判断致病性 373 | for op, cutoff, level in PATHOGENICITY_LEVELS[:-1]: 374 | if op(score, cutoff): 375 | pathogenicity = level 376 | break 377 | else: 378 | pathogenicity = PATHOGENICITY_LEVELS[-1][2] 379 | return rules_value, score, pathogenicity 380 | 381 | def annotate(self, chromosome, start, end, func, error=0, **kwargs): 382 | """ 383 | 对给定CNV进行注释 384 | :param chromosome: 染色体编号 385 | :param start: 起始位置 386 | :param end: 终止位置 387 | :param func: 变异类型 388 | :param error: 误差值 389 | :return: 注释结果 390 | """ 391 | chromosome = self._norm_chrom(chromosome) 392 | annotation = dict( 393 | chromosome=chromosome, start=start, end=end, 394 | length=end - start, error=error, 395 | outer_start=start - error, outer_end=end + error, 396 | inner_start=start + error, inner_end=end - error, 397 | func=func, break_point_genes=list() 398 | ) 399 | 400 | annotation['inner_overlap_genes'] = list(self._gene_database.overlap( 401 | chromosome, annotation['inner_start'], annotation['inner_end'], 402 | )) 403 | 404 | annotation['outer_overlap_genes'] = list(self._gene_database.overlap( 405 | chromosome, annotation['outer_start'], annotation['outer_end'], 406 | )) 407 | 408 | annotation['overlap_omim_genes'] = list(self._omim_gene_database.overlap( 409 | chromosome, annotation['inner_start'], annotation['inner_end'] 410 | )) 411 | 412 | annotation['overlap_func_regions'] = list(self._func_region_database.overlap( 413 | chromosome, annotation['outer_start'], annotation['outer_end'] 414 | )) 415 | 416 | annotation['overlap_hi_genes'] = list(self._hi_gene_database.overlap( 417 | chromosome, annotation['inner_start'], annotation['inner_end'] 418 | )) 419 | 420 | annotation['overlap_hi_exons'] = self._hi_exon_database.overlap_groups( 421 | chromosome, annotation['inner_start'], annotation['inner_end'], 422 | lambda record: record[0].gene_id 423 | ) 424 | 425 | annotation['overlap_hi_cds'] = self._hi_cds_database.overlap_groups( 426 | chromosome, annotation['inner_start'], annotation['inner_end'], 427 | lambda record: record[0].gene_id 428 | ) 429 | 430 | try: 431 | annotation['variants'] = list(self._clinvar_pathogenic_database.fetch( 432 | chromosome, annotation['inner_start'], annotation['inner_end']) 433 | ) 434 | except ValueError: 435 | annotation['variants'] = [] 436 | 437 | annotation['overlap_hi_regions'] = list(self._hi_region_database.overlap( 438 | chromosome, annotation['inner_start'], annotation['inner_end'] 439 | )) 440 | 441 | annotation['overlap_decipher_genes'] = list(self._decipher_gene_database.overlap( 442 | chromosome, annotation['inner_start'], annotation['inner_end'] 443 | )) 444 | 445 | annotation['overlap_uhi_genes'] = list(self._uhi_gene_database.overlap( 446 | chromosome, annotation['outer_start'], annotation['outer_end'] 447 | )) 448 | 449 | annotation['overlap_uhi_regions'] = list(self._uhi_region_database.overlap( 450 | chromosome, annotation['outer_start'], annotation['outer_end'] 451 | )) 452 | 453 | annotation['overlap_ts_genes'] = list(self._ts_gene_database.overlap( 454 | chromosome, annotation['inner_start'], annotation['inner_end'] 455 | )) 456 | 457 | annotation['overlap_ts_regions'] = list(self._ts_region_database.overlap( 458 | chromosome, annotation['inner_start'], annotation['inner_end'] 459 | )) 460 | 461 | annotation['overlap_uts_genes'] = list(self._uts_gene_database.overlap( 462 | chromosome, annotation['outer_start'], annotation['outer_end'] 463 | )) 464 | 465 | annotation['overlap_uts_regions'] = list(self._uts_region_database.overlap( 466 | chromosome, annotation['outer_start'], annotation['outer_end'] 467 | )) 468 | 469 | annotation['dgv_gain_records'] = list(self._dgv_gain_database.overlap( 470 | chromosome, annotation['outer_start'], annotation['outer_end'] 471 | )) 472 | 473 | annotation['dgv_loss_records'] = list(self._dgv_loss_database.overlap( 474 | chromosome, annotation['outer_start'], annotation['outer_end'] 475 | )) 476 | 477 | annotation['gnomad_del_records'] = list(self._gnomad_del_database.overlap( 478 | chromosome, annotation['outer_start'], annotation['outer_end'] 479 | )) 480 | 481 | annotation['gnomad_dup_records'] = list(self._gnomad_dup_database.overlap( 482 | chromosome, annotation['outer_start'], annotation['outer_end'] 483 | )) 484 | 485 | annotation['cnv_syndrome_loss'] = list(self._cnv_syndrome_del_database.overlap( 486 | chromosome, annotation['outer_start'], annotation['outer_end'] 487 | )) 488 | annotation['cnv_syndrome_gain'] = list(self._cnv_syndrome_dup_database.overlap( 489 | chromosome, annotation['outer_start'], annotation['outer_end'] 490 | )) 491 | 492 | annotation['cyto_band'] = list(self._cytoband_database.overlap( 493 | chromosome, annotation['outer_start'], annotation['outer_end'] 494 | )) 495 | 496 | annotation['exon'] = list( 497 | self._exon_database.overlap( 498 | chromosome, annotation['outer_start'], annotation['outer_end'] 499 | ) 500 | ) 501 | 502 | if func == 'del': 503 | annotation = self._annotate_loss(**annotation) 504 | elif func == 'dup': 505 | annotation = self._annotate_gain(**annotation) 506 | else: 507 | raise ValueError('Unknown func `{}`'.format(func)) 508 | 509 | annotation['rules'], annotation['score'], annotation['pathogenicity'] = self.judge( 510 | func, **annotation['rules'] 511 | ) 512 | # PVS1 513 | if func == 'del' and '2E' in annotation['rules'].keys(): 514 | annotation['rules']['2E'] = annotation['rules'].get('pvs1') 515 | elif func == 'dup' and '2I' in annotation['rules'].keys(): 516 | annotation['rules']['2I'] = annotation['rules'].get('pvs1') 517 | annotation['pvs1'] = annotation['rules'].pop('pvs1', DEFAULT_EMPTY_VALUE) 518 | 519 | return annotation 520 | 521 | def _serializer(self, anno_result): 522 | seri = {} 523 | cyto_band_li = [x[0].name for x in anno_result['cyto_band']] 524 | if len(cyto_band_li) == 0: 525 | cyto_str = DEFAULT_EMPTY_VALUE 526 | elif len(cyto_band_li) == 1: 527 | cyto_str = f'{AnnotateHelper._chrom_num(anno_result["chromosome"])}{cyto_band_li[0]}' 528 | else: 529 | cyto_str = f'{AnnotateHelper._chrom_num(anno_result["chromosome"])}{cyto_band_li[0]}{cyto_band_li[-1]}' 530 | 531 | seri['cyto_band'] = cyto_str 532 | 533 | seri['inner_gene'] = ','.join( 534 | x[0].symbol for x in anno_result['inner_overlap_genes']) 535 | seri['inner_omim_gene'] = ','.join( 536 | x[0].symbol for x in anno_result['overlap_omim_genes']) 537 | if len(anno_result['exon']) == 0: 538 | seri['exon'] = DEFAULT_EMPTY_VALUE 539 | elif len(anno_result) == 1: 540 | seri['exon'] = f"{anno_result['exon'][0][0].symbol}_EX{anno_result['exon'][0][0].exon_number}" 541 | else: 542 | seri['exon'] = f"{anno_result['exon'][0][0].symbol}_EX{anno_result['exon'][0][0].exon_number}" \ 543 | f"-{anno_result['exon'][-1][0].symbol}_EX{anno_result['exon'][-1][0].exon_number}" 544 | 545 | seri['HI_gene'] = ','.join( 546 | f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_hi_genes']) 547 | seri['HI_region'] = SEP.join( 548 | f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_hi_regions']) 549 | seri['TS_gene'] = ','.join( 550 | f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_ts_genes']) 551 | seri['TS_region'] = ','.join( 552 | f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_ts_regions']) 553 | seri['Pred_HI_gene'] = ','.join( 554 | f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_decipher_genes']) 555 | seri['auto_evidence'] = ','.join(sorted(anno_result['rules'])) 556 | seri['auto_evidence_score'] = ','.join( 557 | f'{k}:{anno_result["rules"][k]}' for k in sorted(anno_result['rules'])) 558 | seri['benign_hi_gene'] = ','.join( 559 | f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uhi_genes']) 560 | seri['benign_hi_region'] = ','.join( 561 | f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uhi_regions']) 562 | seri['benign_ts_gene'] = ','.join( 563 | f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uts_genes']) 564 | seri['benign_ts_region'] = ','.join( 565 | f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uts_regions']) 566 | seri['dgv_loss_records'] = ','.join( 567 | f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in anno_result['dgv_loss_records'] 568 | ) 569 | seri['dgv_gain_records'] = ','.join( 570 | f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in anno_result['dgv_gain_records'] 571 | ) 572 | seri['gnomad_loss_records'] = ','.join( 573 | f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in 574 | anno_result['gnomad_del_records'] 575 | ) 576 | seri['gnomad_gain_records'] = ','.join( 577 | f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in 578 | anno_result['gnomad_dup_records'] 579 | ) 580 | seri['cnv_syndrome_gain'] = ','.join( 581 | f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['cnv_syndrome_gain'] 582 | ) 583 | seri['cnv_syndrome_loss'] = ','.join( 584 | f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['cnv_syndrome_loss'] 585 | ) 586 | seri['auto_score'] = anno_result['score'] 587 | seri['auto_pathogenicity'] = anno_result['pathogenicity'] 588 | seri['pvs1'] = anno_result['pvs1'] 589 | return seri 590 | 591 | def _seri_anno(self, seri: pd.Series) -> pd.Series: 592 | anno_result = self.annotate(**seri.rename(NAME_MAP).to_dict()) 593 | return seri.append( 594 | pd.Series(self._serializer(anno_result)).replace('', '-').fillna(DEFAULT_EMPTY_VALUE)) 595 | 596 | def annotation_file(self, file_path, result_path, col_map=None, cnv_map=None): 597 | """ 598 | annotate specified file, required columns: chr, start, end, type, error 599 | :param file_path: input file (TSV) 600 | :param result_path: result file path (TSV) 601 | :return: - 602 | """ 603 | 604 | if file_path.endswith('xlsx'): 605 | input_df = pd.read_excel(file_path) 606 | else: 607 | input_df = pd.read_csv(file_path, sep='\t') 608 | if col_map is not None: 609 | input_df.rename(columns=col_map, inplace=True) 610 | if cnv_map is not None: 611 | input_df['type'] = input_df['type'].map(lambda x: cnv_map.get(x, x)) 612 | if 'error' not in input_df.columns: 613 | input_df['error'] = 0 614 | input_df['chr'] = input_df['chr'].map(self._norm_chrom) 615 | try: 616 | from tqdm import tqdm 617 | tqdm.pandas() 618 | input_df = input_df.progress_apply(self._seri_anno, axis=1) 619 | except ImportError: 620 | input_df = input_df.apply(self._seri_anno, axis=1) 621 | if result_path.endswith('xlsx'): 622 | input_df.to_excel(result_path, index=False) 623 | else: 624 | input_df.to_csv(result_path, sep='\t', index=False) 625 | --------------------------------------------------------------------------------