├── autocnv
    ├── __init__.py
    ├── tests
    │   ├── __init__.py
    │   └── test_annotation.py
    ├── utils.py
    ├── __main__.py
    ├── database.py
    ├── settings.py
    └── annotate.py
├── requirements.txt
├── docs
    ├── 涉及数据库.xlsx
    └── 分析流程图V1.1.tiff
├── database-prepare
    ├── exon_prep.py
    └── database-create-flowchart.md
├── README.md
└── .gitignore


/autocnv/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/autocnv/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | pandas~=1.2.5
2 | pyfaidx
3 | pysam~=0.16.0.1
4 | gtfparse~=1.2.1


--------------------------------------------------------------------------------
/docs/涉及数据库.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhonghua-wang/autocnv/HEAD/docs/涉及数据库.xlsx


--------------------------------------------------------------------------------
/docs/分析流程图V1.1.tiff:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zhonghua-wang/autocnv/HEAD/docs/分析流程图V1.1.tiff


--------------------------------------------------------------------------------
/autocnv/utils.py:
--------------------------------------------------------------------------------
 1 | # -*- coding:utf-8 -*-
 2 | from pysam import VariantRecord
 3 | from json import JSONEncoder
 4 | from autopvs1.strength import Strength
 5 | from autocnv import settings
 6 | 
 7 | class ACITEncoder(JSONEncoder):
 8 |     def default(self, o):
 9 |         if isinstance(o, VariantRecord):
10 |             return [o.contig, o.pos, o.ref, o.alts]
11 |         if isinstance(o, Strength):
12 |             return o.name
13 |         return JSONEncoder.default(self, o)
14 | 


--------------------------------------------------------------------------------
/database-prepare/exon_prep.py:
--------------------------------------------------------------------------------
 1 | #%%
 2 | from autocnv import settings
 3 | from autocnv.database import DataBase
 4 | from gtfparse import read_gtf
 5 | import pandas as pd
 6 | refGene_file = '/Users/zhonghua/data/gene-toolkit/raw-data/hg19.refGene.gtf.gz'
 7 | #%%
 8 | refGene_df = read_gtf(refGene_file)
 9 | refGene_df = refGene_df[refGene_df['feature'] == 'exon']
10 | #%%
11 | gene_db = pd.read_csv(settings.GENE_DATABASE, sep='\t')
12 | #%%
13 | 
14 | merge_df = gene_db.merge(refGene_df, left_on='transcript', right_on='transcript_id')[[
15 |     '#chrom', 'start_y', 'end_y', 'gene_id_x', 'symbol', 'exon_number', 'transcript'
16 | ]]
17 | merge_df.columns = [x.strip('_x').strip('_y') for x in merge_df.columns]
18 | 
19 | merge_df.to_csv(settings.GENE_EXON_DATABASE.strip('.gz'), sep='\t', index=False)
20 | #%%
21 | 
22 | 
23 | 


--------------------------------------------------------------------------------
/autocnv/__main__.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from autocnv.annotate import AnnotateHelper
 3 | 
 4 | CNVKIT_COL_MAP = {
 5 |     'Chr': 'chr', 'Start': 'start', 'End': 'end', 'Detect': 'type'
 6 | }
 7 | CNV_MAP = {
 8 |     'Del': 'del', 'Dup': 'dup'
 9 | }
10 | 
11 | if __name__ == '__main__':
12 |     ap = argparse.ArgumentParser()
13 |     ap.add_argument(
14 |         '--in', dest='input', required=True,
15 |         help='input file (TSV format), columns: chr, start, end, type, error are required'
16 |     )
17 |     ap.add_argument(
18 |         '--out', dest='output', required=True,
19 |         help='annotated result file path'
20 |     )
21 |     ap.add_argument('--cnvkit', dest='cnvkit', help='use CNVkit result as input', action='store_true', default=False)
22 |     args = ap.parse_args()
23 |     anno = AnnotateHelper()
24 |     if args.cnvkit:
25 |         anno.annotation_file(args.input, args.output, col_map=CNVKIT_COL_MAP, cnv_map=CNV_MAP)
26 |     else:
27 |         anno.annotation_file(args.input, args.output)


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # AutoCNV: an **Auto**matic **C**opy **N**umber **V**ariant Interpretation Tool
 2 | 
 3 | ## Installation
 4 | 
 5 | ### Pre-requirements
 6 | 
 7 | - `pysam`: install `pysam` via conda:
 8 | 
 9 |   ```shell
10 |   conda config --add channels r
11 |   conda config --add channels bioconda
12 |   conda install pysam
13 |   ```
14 |   For detailed installation instructions for `pysam`, please refer to [the official documentation](https://pysam.readthedocs.io/en/latest/installation.html).
15 | 
16 | - install [`autopvs1`](https://github.com/JiguangPeng/autopvs1)
17 | - install the requirements: `pip install -r requirements.txt`
18 | 
19 | ### Usage
20 | Please refer to the test cases in `autocnv/tests/test_annotation.py`.
21 | 
22 | ### Citation
23 | 
24 | ```
25 | @article{Fan2021,
26 |     author = {Fan, Chunna and Wang, Zhonghua and Sun, Yan and Sun, Jun and Liu, Xi and Kang, Licheng and Xu, Yingshuo and Yang, Manqiu and Dai, Wentao and Song, Lijie and Wei, Xiaoming and Xiang, Jiale and Huang, Hui and Zhou, Meizhen and Zeng, Fanwei and Huang, Lin and Xu, Zhengfeng and Peng, Zhiyu},
27 |     doi = {10.1186/s12864-021-08011-4},
28 |     isbn = {1286402108011},
29 |     issn = {14712164},
30 |     journal = {BMC Genomics},
31 |     keywords = {AutoCNV,CNV classification,CNV interpretation,Scoring},
32 |     number = {1},
33 |     pages = {1--12},
34 |     pmid = {34615484},
35 |     publisher = {BMC Genomics},
36 |     title = {{AutoCNV: a semiautomatic CNV interpretation system based on the 2019 ACMG/ClinGen Technical Standards for CNVs}},
37 |     volume = {22},
38 |     year = {2021}
39 | }
40 | ```


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Byte-compiled / optimized / DLL files
 2 | __pycache__/
 3 | *.py[cod]
 4 | *$py.class
 5 | 
 6 | # C extensions
 7 | *.so
 8 | 
 9 | # Distribution / packaging
10 | .Python
11 | env/
12 | build/
13 | develop-eggs/
14 | dist/
15 | downloads/
16 | eggs/
17 | .eggs/
18 | lib/
19 | lib64/
20 | parts/
21 | sdist/
22 | var/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | 
27 | # PyInstaller
28 | #  Usually these files are written by a python script from a template
29 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
30 | *.manifest
31 | *.spec
32 | 
33 | # Installer logs
34 | pip-log.txt
35 | pip-delete-this-directory.txt
36 | 
37 | # Unit tests / coverage reports
38 | htmlcov/
39 | .tox/
40 | .coverage
41 | .coverage.*
42 | .cache
43 | nosetests.xml
44 | coverage.xml
45 | *,cover
46 | .hypothesis/
47 | 
48 | # Translations
49 | *.mo
50 | *.pot
51 | 
52 | # Django stuff:
53 | *.log
54 | local_settings.py
55 | 
56 | # Flask stuff:
57 | instance/
58 | .webassets-cache
59 | 
60 | # Scrapy stuff:
61 | .scrapy
62 | 
63 | # Sphinx documentation
64 | docs/_build/
65 | 
66 | # PyBuilder
67 | target/
68 | 
69 | # IPython Notebook
70 | .ipynb_checkpoints
71 | 
72 | # pyenv
73 | .python-version
74 | 
75 | # celery beat schedule file
76 | celerybeat-schedule
77 | 
78 | # dotenv
79 | .env
80 | 
81 | # virtualenv
82 | venv/
83 | ENV/
84 | 
85 | # Spyder project settings
86 | .spyderproject
87 | 
88 | # Rope project settings
89 | .idea/
90 | data/
91 | .code/
92 | .DS_Store
93 | test_data/
94 | .vscode/
95 | raw_data/
96 | docs/
97 | example/
98 | test_debug.py


--------------------------------------------------------------------------------
/autocnv/database.py:
--------------------------------------------------------------------------------
 1 | from collections import defaultdict
 2 | from collections import namedtuple
 3 | import pysam
 4 | 
 5 | 
 6 | class DataBase:
 7 |     def __init__(self, path):
 8 |         self._path = path
 9 |         self._tbx = pysam.Tabixfile(path)
10 |         self._fields = namedtuple('Record', self._tbx.header[-1].strip('#').split('\t'))
11 | 
12 |     def fetch(self, chrom, start, end):
13 |         """
14 |         查找并生成给定基因组位置的记录
15 |         :param chrom: 染色体编号
16 |         :param start: 起始位置
17 |         :param end: 终止位置
18 |         :return: 记录生成器
19 |         """
20 |         try:
21 |             for record in self._tbx.fetch(chrom, start, end):
22 |                 chrom, start, end, *fields = record.split('\t')
23 |                 start, end = int(start), int(end)
24 |                 # 所得记录按照表头组装为namedtuple方便使用
25 |                 yield self._fields(chrom, start, end, *fields)
26 |         except ValueError:
27 |             yield from ()
28 | 
29 |     def overlap(self, chrom, start, end):
30 |         """
31 |         查找并生成给定基因组位置的记录，同时计算两者之间的重叠程度
32 |         :param chrom: 染色体编号
33 |         :param start: 起始位置
34 |         :param end: 终止位置
35 |         :return: 记录与重叠程度生成器
36 |         """
37 |         length = end - start
38 |         for record in self.fetch(chrom, start, end):
39 |             _, overlap_start, overlap_end, _ = sorted((start, end, record[1], record[2]))
40 |             overlap = overlap_end - overlap_start
41 |             yield record, overlap / length, overlap / (record[2] - record[1])
42 | 
43 |     def overlap_groups(self, chrom, start, end, key=None):
44 |         """
45 |         查找并生成给定基因组位置的记录，同时计算两者之间的重叠程度，返回按照key方法分组的字典
46 |         :param chrom: 染色体编号
47 |         :param start: 起始位置
48 |         :param end: 终止位置
49 |         :param key: 分组依据
50 |         :return: 分组后的记录字典
51 |         """
52 |         groups = defaultdict(list)
53 |         for record in self.overlap(chrom, start, end):
54 |             groups[record if key is None else key(record)].append(record)
55 |         return dict(groups)
56 | 


--------------------------------------------------------------------------------
/autocnv/settings.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | DEFAULT_SCORE = {
 4 |     'del': {
 5 |         '1A': 0, '1B': -0.6,
 6 |         '2A': 1, '2B': 0,
 7 |         '2C': -1, '2C-1': 0.9, '2C-2': 0,
 8 |         '2D': -1, '2D-1': 0, '2D-2': 0.9, '2D-3': 0.3, '2D-4': 0.9,
 9 |         '2E': 0, '2F': -1, '2G': 0, '2H': 0.15, '2I': 0, '2J': 0, '2K': 0.45,
10 |         '3A': 0, '3B': 0.45, '3C': 0.9,
11 |         '4O': -1,
12 |         'PVS1': 0.9, 'PVS1_S': 0.45, 'PVS1_M': 0.3, 'PVS1_P': 0.15, 'PVS1_U': 0
13 |     },
14 |     'dup': {
15 |         '1A': 0, '1B': -0.6,
16 |         '2A': 1, '2B': 0,
17 |         '2C': -1, '2C-1': 0.9, '2C-2': 0,
18 |         '2D': -1, '2D-1': 0, '2D-2': 0.9, '2D-3': 0.3, '2D-4': 0.9,
19 |         '2E': 0, '2F': -1, '2G': 0, '2H': 0, '2I': 0, '2J': 0, '2K': 0.45, '2L': 0,
20 |         '3A': 0, '3B': 0.45, '3C': 0.9,
21 |         '4O': -1,
22 |         'PVS1': 0.9, 'PVS1_S': 0.45, 'PVS1_M': 0.3, 'PVS1_P': 0.15, 'PVS1_U': 0
23 |     }
24 | }
25 | 
26 | 
27 | 
28 | BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
29 | 
30 | CYTO_BAND_FILE = os.path.join(BASE_DIR, 'data', 'cyto-band.bed.gz')
31 | 
32 | GENE_EXON_DATABASE = os.path.join(BASE_DIR, 'data', 'exon.sorted.bed.gz')
33 | 
34 | GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'gene.sorted.bed.gz')
35 | 
36 | OMIM_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'omim-gene.sorted.bed.gz')
37 | 
38 | FUNC_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'func-region.sorted.gz')
39 | 
40 | HI_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-gene.sorted.bed.gz')
41 | 
42 | HI_EXON_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-exon.sorted.bed.gz')
43 | 
44 | HI_CDS_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-cds.sorted.bed.gz')
45 | 
46 | CLINVAR_PATHOGENIC_DATABASE = os.path.join(
47 |     BASE_DIR, 'data', 'clinvar-pathogenic.sorted.vcf.gz'
48 | )
49 | 
50 | UHI_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'uhi-gene.sorted.bed.gz')
51 | 
52 | HI_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'hi-region.sorted.bed.gz')
53 | 
54 | UHI_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'uhi-region.sorted.bed.gz')
55 | 
56 | DECIPHER_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'decipher-gene.sorted.bed.gz')
57 | 
58 | TS_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'ts-gene.sorted.bed.gz')
59 | 
60 | TS_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'ts-region.sorted.bed.gz')
61 | 
62 | UTS_GENE_DATABASE = os.path.join(BASE_DIR, 'data', 'uts-gene.sorted.bed.gz')
63 | 
64 | UTS_REGION_DATABASE = os.path.join(BASE_DIR, 'data', 'uts-region.sorted.bed.gz')
65 | 
66 | DGV_GAIN_DATABASE = os.path.join(BASE_DIR, 'data', 'dgv-gain.sorted.bed.gz')
67 | 
68 | DGV_LOSS_DATABASE = os.path.join(BASE_DIR, 'data', 'dgv-loss.sorted.bed.gz')
69 | 
70 | GNOMAD_DEL_DATABASE = os.path.join(BASE_DIR, 'data', 'gnomad-del.sorted.bed.gz')
71 | 
72 | GNOMAD_DUP_DATABASE = os.path.join(BASE_DIR, 'data', 'gnomad-dup.sorted.bed.gz')
73 | 
74 | CNV_SYNDROME_DEL_DATABASE = os.path.join(BASE_DIR, 'data', 'cnv-syndrome-del.bed.gz')
75 | CNV_SYNDROME_DUP_DATABASE = os.path.join(BASE_DIR, 'data', 'cnv-syndrome-dup.bed.gz')
76 | 
77 | 
78 | try:
79 |     from autocnv.local_settings import *
80 | except ImportError:
81 |     pass
82 | 


--------------------------------------------------------------------------------
/autocnv/tests/test_annotation.py:
--------------------------------------------------------------------------------
  1 | from autocnv.annotate import AnnotateHelper
  2 | 
# Single AnnotateHelper instance shared by every test in this module.
annotate = AnnotateHelper()
  4 | 
  5 | 
  6 | def test_annotation1():
  7 |     # 包含TBX5基因全部（基因 zoom out 3x）
  8 |     annotation = annotate.annotate('chr12', 114737220, 114900761, 'del')
  9 |     assert '2A' in annotation['rules']
 10 | 
 11 | 
 12 | def test_annotation2():
 13 |     # TBX5基因5’端，覆盖CDS序列
 14 |     annotation = annotate.annotate('chr12', 114800000, 114900761, 'del')
 15 |     assert '2C-1' in annotation['rules']
 16 | 
 17 | 
 18 | def test_annotation3():
 19 |     # TBX5基因5’端，不覆盖CDS序列
 20 |     annotation = annotate.annotate('chr12', 114842000, 114900761, 'del')
 21 |     assert '2C-2' in annotation['rules']
 22 | 
 23 | 
 24 | def test_annotation4():
 25 |     # TBX5基因3'端，仅覆盖UTR区
 26 |     annotation = annotate.annotate('chr12', 114737220, 114793000, 'del')
 27 |     assert '2D-1' in annotation['rules']
 28 | 
 29 | 
 30 | def test_annotation5():
 31 |     # TBX5基因3'端，仅覆盖末位外显子（该外显子含低频致病变异）
 32 |     annotation = annotate.annotate('chr12', 114737220, 114800000, 'del')
 33 |     assert '2D-2' in annotation['rules']
 34 | 
 35 | 
 36 | def test_annotation6():
 37 |     # TBX5基因3'端，覆盖多个exon
 38 |     annotation = annotate.annotate('chr12', 114737220, 114835000, 'del')
 39 |     assert '2D-4' in annotation['rules']
 40 | 
 41 | 
 42 | def test_annotation7():
 43 |     # 包含TBX5中间3个exon
 44 |     annotation = annotate.annotate('chr12', 114800000, 114835000, 'del')
 45 |     assert '2E' in annotation['rules']
 46 | 
 47 | 
 48 | def test_annotation8():
 49 |     # TBX5基因5’端，不覆盖CDS序列；TBX3基因3'端，仅覆盖末位外显子（该外显子含低频致病变异）
 50 |     annotation = annotate.annotate('chr12', 114842000, 115111000, 'del')
 51 |     assert '2C-2' in annotation['rules']
 52 |     assert '2D-2' in annotation['rules']
 53 |     assert '4O' not in annotation['rules']
 54 | 
 55 | 
 56 | def test_annotation9():
 57 |     # TBX5基因5’端，不覆盖CDS序列；TBX3基因3'端，覆盖多个外显子
 58 |     annotation = annotate.annotate('chr12', 114842000, 115114000, 'del')
 59 |     assert '2C-2' in annotation['rules']
 60 |     assert '2D-4' in annotation['rules']
 61 | 
 62 | 
 63 | def test_annotation10():
 64 |     # TBX5基因5’端，覆盖CDS序列；TBX3基因3'端，仅覆盖末位外显子（该外显子含低频致病变异）
 65 |     annotation = annotate.annotate('chr12', 114800000, 115111000, 'del')
 66 |     assert '2C-1' in annotation['rules']
 67 |     assert '2D-2' in annotation['rules']
 68 | 
 69 | 
 70 | def test_annotation11():
 71 |     # TBX5基因5’端，覆盖CDS序列；TBX3基因3'端，覆盖多个外显子
 72 |     annotation = annotate.annotate('chr12', 114800000, 115114000, 'del')
 73 |     assert '2C-1' in annotation['rules']
 74 |     assert '2D-4' in annotation['rules']
 75 | 
 76 | 
 77 | def test_annotation12():
 78 |     # 包含MECP2基因全部（基因范围 zoom out 1.5x）
 79 |     annotation = annotate.annotate('chrX', 153268282, 153382170, 'dup')
 80 |     assert '2A' in annotation['rules']
 81 |     assert '2H' in annotation['rules']
 82 | 
 83 | 
 84 | def test_annotation13():
 85 |     # MECP2基因内CNV
 86 |     annotation = annotate.annotate('chrX', 153300000, 153363100, 'dup')
 87 |     assert '2I' in annotation['rules']
 88 | 
 89 | 
 90 | def test_4O():
 91 |     annotation = annotate.annotate('chr1', 196757278, 196796716, 'del')
 92 |     assert '4O' in annotation['rules']
 93 |     annotation = annotate.annotate('chr15', 22750305, 23226254, 'del')
 94 |     assert '4O' not in annotation['rules']
 95 |     annotation = annotate.annotate('chr1', 25584597, 25767647, 'del')
 96 |     assert '4O' not in annotation['rules']
 97 |     annotation = annotate.annotate('chr1', 148974342, 149441884, 'dup')
 98 |     assert '4O' in annotation['rules']
 99 |     annotation = annotate.annotate('chr15', 22750305, 23226254, 'dup')
100 |     assert '4O' not in annotation['rules']
101 |     annotation = annotate.annotate('chr1', 425, 69091, 'dup')
102 |     assert '4O' not in annotation['rules']
103 | 
def test_syndrome():
    """Annotate a deletion given on 'chr23'; passes as long as no exception is raised.

    NOTE(review): no expectation is asserted on the result — confirm which
    rule or syndrome hit this case was meant to verify.
    """
    annotate.annotate('chr23', 6420555, 8153336, 'del')
106 | 
107 | 
def test_random():
    """Deletion chr11:45904399-46480747: expects 2H."""
    rules = annotate.annotate('chr11', 45904399, 46480747, 'del')['rules']
    assert '2H' in rules
111 | 


--------------------------------------------------------------------------------
/database-prepare/database-create-flowchart.md:
--------------------------------------------------------------------------------
  1 | # ACIT数据库生成流程
  2 | 
  3 | ## 基因 (Gene)
  4 | 
  5 | ```mermaid
  6 | graph TD
  7 | 	
  8 | 	classDef input fill:#e9ffef
  9 | 	
 10 | 	classDef output fill:#ffebe9
 11 | 	
 12 | 	classDef variable fill:#feffe9
 13 | 
 14 | 	subgraph prepare gene
 15 |         refgene("refgene")
 16 |         class refgene input
 17 | 
 18 |         filter_length["filter max CDS length"]
 19 |         refgene --> filter_length
 20 | 
 21 |         geneinfo("geneinfo")
 22 |         class geneinfo input
 23 | 		
 24 |         merge["merge by name2 & Symbol"]
 25 |         filter_length --> merge
 26 |         geneinfo --> merge
 27 | 
 28 |         filter_protein["filter type_of_gene == protein-coding"]
 29 |         merge --> filter_protein
 30 | 
 31 |         gene("gene")
 32 |         class gene output
 33 |         filter_protein --> gene
 34 | 	end
 35 | 	
 36 | 	subgraph prepare omim gene
 37 | 		omim_gene_list("omim gene list")
 38 | 		class omim_gene_list input
 39 | 		
 40 | 		filter_omim["filter gene linked with disease by omim"]
 41 | 		gene --> filter_omim
 42 | 		omim_gene_list --> filter_omim
 43 | 		
 44 | 		omim_gene("omim_gene")
 45 | 		class omim_gene output
 46 | 		filter_omim --> omim_gene
 47 | 	end
 48 | ```
 49 | 
 50 | ## 单倍体敏感基因(HI Gene)
 51 | 
 52 | ```mermaid
 53 | graph TD
 54 | 	
 55 | 	classDef input fill:#e9ffef
 56 | 	
 57 | 	classDef output fill:#ffebe9
 58 | 	
 59 | 	classDef variable fill:#feffe9
 60 | 	
 61 | 	subgraph prepare hi gene
 62 | 		curation_gene("curation_gene")
 63 | 		class curation_gene input
 64 | 		
 65 | 		gene("gene")
 66 | 		class gene variable
 67 | 		
 68 | 		filter_hi["filter Haploinsufficiency Score == 3"]
 69 | 		curation_gene --> filter_hi
 70 | 		
 71 | 		merge["merge by Gene Symbol & name2"]
 72 | 		filter_hi --> merge
 73 | 		gene --> merge
 74 | 		
 75 | 		hi_gene("hi_gene")
 76 | 		class hi_gene output
 77 | 		merge --> hi_gene
 78 | 		
 79 | 		
 80 | 		filter_uhi["filter Haploinsufficiency Score == 40"]
 81 | 		curation_gene --> filter_uhi
 82 | 		
 83 | 		uhi_gene("uhi_gene")
 84 | 		class uhi_gene output
 85 | 		gene --> uhi_gene
 86 | 		filter_uhi --> uhi_gene
 87 | 	end
 88 | 
 89 | 	subgraph prepare hi exon
 90 | 		exon("extract last exon")
 91 | 		hi_gene --> exon
 92 | 		
 93 | 		hi_exon("hi_exon")
 94 | 		class hi_exon output
 95 | 		exon --> hi_exon
 96 | 	end
 97 | 
 98 | 	subgraph prepare clinvar pathogenic variants
 99 | 		all_variants("all clinical pathogenic variants")
100 | 		class all_variants input
101 | 		
102 | 		exon_variants["filter variant in last exon"]
103 | 		all_variants --> exon_variants
104 | 		hi_exon --> exon_variants
105 | 		
106 | 		pathogenic_exon_variants("variants")
107 | 		class pathogenic_exon_variants output
108 | 		exon_variants --> pathogenic_exon_variants
109 | 	end
110 | 	
111 | 	subgraph prepare hi cds
112 | 		cds["extract CDS"]
113 | 		hi_gene --> cds
114 | 		
115 | 		hi_cds("hi_cds")
116 | 		class hi_cds output
117 | 		cds --> hi_cds
118 | 	end
119 | 	
120 | ```
121 | 
122 | ## 多倍体敏感基因 (TS Gene)
123 | 
124 | ```mermaid
125 | graph TD
126 | 	
127 | 	classDef input fill:#e9ffef
128 | 	
129 | 	classDef output fill:#ffebe9
130 | 	
131 | 	classDef variable fill:#feffe9
132 | 	
133 | 	subgraph prepare ts gene
134 | 		curation_gene("curation_gene")
135 | 		class curation_gene input
136 | 		
137 | 		gene("gene")
138 | 		class gene variable
139 | 		
140 | 		filter_ts["filter Triplosensitivity Score == 3"]
141 | 		curation_gene --> filter_ts
142 | 		
143 | 		merge_ts["merge by Gene Symbol & name2"]
144 | 		filter_ts --> merge_ts
145 | 		gene --> merge_ts
146 | 		
147 | 		ts_gene("ts_gene")
148 | 		class ts_gene output
149 | 		merge_ts --> ts_gene
150 | 		
151 | 		filter_uts["filter Triplosensitivity Score == 40"]
152 | 		curation_gene --> filter_uts
153 | 		
154 | 		merge_uts["merge by Gene Symbol & name2"]
155 | 		gene --> merge_uts
156 | 		filter_uts --> merge_uts
157 | 		
158 | 		uts_gene("uts_gene")
159 | 		class uts_gene output
160 | 		merge_uts --> uts_gene
161 | 	end
162 | ```
163 | 
164 | ## 单倍体敏感区域 (HI region)
165 | 
166 | ```mermaid
167 | graph TD
168 | 	
169 | 	classDef input fill:#e9ffef
170 | 	
171 | 	classDef output fill:#ffebe9
172 | 	
173 | 	classDef variable fill:#feffe9
174 | 	
175 | 	subgraph prepare hi region
176 | 		curation("curation_region")
177 | 		class curation input
178 | 		
179 | 		filter_hi["filter Haploinsufficiency Score == 3"]
180 | 		curation --> filter_hi
181 | 		
182 | 		hi_region("hi_region")
183 | 		class hi_region output
184 | 		filter_hi --> hi_region
185 | 		
186 | 		filter_uhi["filter Haploinsufficiency Score == 40"]
187 | 		curation --> filter_uhi
188 | 		
189 | 		gene("gene")
190 | 		class gene variable
191 | 		
192 | 		fetch_gene["fetch overlap gene"]
193 | 		gene --> fetch_gene
194 | 		filter_uhi --> fetch_gene
195 | 		
196 | 		uhi_region("uhi_region")
197 | 		class uhi_region output
198 | 		fetch_gene --> uhi_region
199 | 	end
200 | ```
201 | 
202 | ## 多倍体敏感区域 (TS region)
203 | 
204 | ```mermaid
205 | graph TD
206 | 	
207 | 	classDef input fill:#e9ffef
208 | 	
209 | 	classDef output fill:#ffebe9
210 | 	
211 | 	classDef variable fill:#feffe9
212 | 	
213 | 	subgraph prepare ts region
214 | 		curation("curation_region")
215 | 		class curation input
216 | 		
217 | 		filter_ts["filter Triplosensitivity Score == 3"]
218 | 		curation --> filter_ts
219 | 		
220 | 		omim_gene("omim_gene")
221 | 		class omim_gene variable
222 | 		
223 | 		fetch_omim_gene["fetch overlap omim gene"]
224 | 		filter_ts --> fetch_omim_gene
225 | 		omim_gene --> fetch_omim_gene
226 | 		
227 | 		ts_region("ts_region")
228 | 		class ts_region output
229 | 		fetch_omim_gene --> ts_region
230 | 		
231 | 		filter_uts["filter Triplosensitivity Score == 40"]
232 | 		curation --> filter_uts
233 | 		
234 | 		gene("gene")
235 | 		class gene variable
236 | 		
237 | 		fetch_gene["fetch overlap gene"]
238 | 		filter_uts --> fetch_gene
239 | 		gene --> fetch_gene
240 | 		
241 | 		uts_region("uts_region")
242 | 		class uts_region output
243 | 		fetch_gene --> uts_region
244 | 	end
245 | ```
246 | 
247 | ## 预测基因 (decipher)
248 | 
249 | ```mermaid
250 | graph TD
251 | 	
252 | 	classDef input fill:#e9ffef
253 | 	
254 | 	classDef output fill:#ffebe9
255 | 	
256 | 	classDef variable fill:#feffe9
257 | 	
258 | 	subgraph prepare decipher
259 | 		predictions("decipher")
260 | 		class predictions input
261 | 		
262 | 		gene("gene")
263 | 		class gene variable
264 | 		
265 | 		merge["merge by symbol & name2"]
266 | 		predictions --> merge
267 | 		gene --> merge
268 | 		
269 | 		gnomad("gnomad")
270 | 		class gnomad input
271 | 		
272 | 		join_pli["join pLI by name2"]
273 | 		gnomad --> join_pli
274 | 		merge --> join_pli
275 | 		
276 | 		join_lof["join oe_lof_upper by name2"]
277 | 		gnomad --> join_lof
278 | 		join_pli --> join_lof
279 | 		
280 | 		filter["filter pLI >= 0.9 & hi_index < 10% & oe_lof_upper < 0.35"]
281 | 		join_lof --> filter
282 | 		
283 | 		decipher("decipher")
284 | 		class decipher output
285 | 		filter --> decipher
286 | 	end
287 | ```
288 | 
289 | ## control
290 | 
291 | ```mermaid
292 | graph TD
293 | 	
294 | 	classDef input fill:#e9ffef
295 | 	
296 | 	classDef output fill:#ffebe9
297 | 	
298 | 	classDef variable fill:#feffe9
299 | 	
300 | 	subgraph prepare gnomad	
301 | 		
302 | 		gnomad("gnomad")
303 | 		class gnomad input
304 | 		
305 | 		filter_qc["filter FILTER == PASS & svtype in (DEL, DUP)"]
306 | 		gnomad --> filter_qc
307 | 		
308 | 		subgraph af filters
309 | 			filter_af["filter N_BI_GENOS >= 1000"]
310 | 			
311 | 			filter_afr["filter AFR_N_BI_GENOS >= 1000"]
312 | 			filter_af -. or .-> filter_afr
313 | 			
314 | 			filter_amr["filter AMR_N_BI_GENOS >= 1000"]
315 | 			filter_afr -. or .-> filter_amr
316 | 			
317 | 			filter_eas["filter EAS_N_BI_GENOS >= 1000"]
318 | 			filter_amr -. or .-> filter_eas
319 | 			
320 | 			filter_eur["filter EUR_N_BI_GENOS >= 1000"]
321 | 			filter_eas -. or .-> filter_eur
322 | 		end
323 | 		filter_qc --> filter_af
324 | 		
325 | 		fetch_gene_gnomad["fetch overlap gene"]
326 | 		filter_eur --> fetch_gene_gnomad
327 | 		
328 | 		filter_del["filter svtype == DEL"]
329 | 		fetch_gene_gnomad --> filter_del
330 | 		
331 | 		gnomad_del("gnomad_del")
332 | 		class gnomad_del output
333 | 		filter_del --> gnomad_del
334 | 		
335 | 		filter_dup["filter svtype == DUP"]
336 | 		fetch_gene_gnomad --> filter_dup
337 | 		
338 | 		gnomad_dup("gnomad_dup")
339 | 		class gnomad_dup output
340 | 		filter_dup --> gnomad_dup
341 | 	end
342 | 	
343 | 	subgraph prepare dgv
344 | 		dgv("dgv")
345 | 		class dgv input
346 | 		
347 | 		filter_dgv["filter freq >= 1% & sample >= 1000"]
348 | 		dgv --> filter_dgv
349 | 		
350 | 		fetch_gene_dgv["fetch overlap gene"]
351 | 		filter_dgv --> fetch_gene_dgv
352 | 		
353 | 		filter_gain["filter type == Gain"]
354 | 		fetch_gene_dgv --> filter_gain
355 | 		
356 | 		dgv_gain("dgv_gain")
357 | 		class dgv_gain output
358 | 		filter_gain --> dgv_gain
359 | 		
360 | 		filter_loss["filter type == Loss"]
361 | 		fetch_gene_dgv --> filter_loss
362 | 		
363 | 		dgv_loss("dgv_loss")
364 | 		class dgv_loss output
365 | 		filter_loss --> dgv_loss
366 | 	end
367 | ```
368 | 
369 | 


--------------------------------------------------------------------------------
/autocnv/annotate.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | 
  3 | from autocnv.database import DataBase
  4 | from pysam import VariantFile
  5 | from autocnv import settings
  6 | from autopvs1.cnv import CNVRecord, PVS1CNV
  7 | from autopvs1.utils import get_transcript
  8 | from autopvs1.read_data import transcripts
  9 | from autopvs1.strength import Strength
 10 | from collections import defaultdict
 11 | from itertools import chain
 12 | import operator
 13 | import pandas as pd
 14 | 
 15 | SEP = '\n'
 16 | DEFAULT_EMPTY_VALUE = '-'
 17 | NAME_MAP = {'chr': 'chromosome', 'type': 'func'}
 18 | 
 19 | PVS1 = {
 20 |     Strength.VeryStrong: 'PVS1', Strength.Strong: 'PVS1_S', Strength.Moderate: 'PVS1_M',
 21 |     Strength.Supporting: 'PVS1_P', Strength.Unmet: 'PVS1_U'
 22 | }
 23 | 
 24 | # 计分分组配置，同一组证据仅计算最大分值
 25 | SCORE_GROUP = {
 26 |     'del': {
 27 |         rule: 'G1' for rule in ('2A', '2B', '2C-1', '2C-2', '2D-1', '2D-2', '2D-3', '2D-4', '2E')
 28 |     },
 29 |     'dup': {}
 30 | }
 31 | 
 32 | # 致病性判断分级配置
 33 | PATHOGENICITY_LEVELS = [
 34 |     (operator.ge, 0.99, 'P'), (operator.ge, 0.9, 'LP'), (operator.gt, -0.9, 'VUS'),
 35 |     (operator.gt, -0.99, 'LB'), (operator.le, -0.99, 'B')
 36 | ]
 37 | 
 38 | 
 39 | class AnnotateHelper:
 40 |     def __init__(self):
 41 |         self._gene_database = DataBase(settings.GENE_DATABASE)
 42 |         self._omim_gene_database = DataBase(settings.OMIM_GENE_DATABASE)
 43 |         self._func_region_database = DataBase(settings.FUNC_REGION_DATABASE)
 44 |         self._hi_gene_database = DataBase(settings.HI_GENE_DATABASE)
 45 |         self._hi_exon_database = DataBase(settings.HI_EXON_DATABASE)
 46 |         self._hi_cds_database = DataBase(settings.HI_CDS_DATABASE)
 47 |         self._clinvar_pathogenic_database = VariantFile(
 48 |             settings.CLINVAR_PATHOGENIC_DATABASE)
 49 |         self._uhi_gene_database = DataBase(settings.UHI_GENE_DATABASE)
 50 |         self._hi_region_database = DataBase(settings.HI_REGION_DATABASE)
 51 |         self._uhi_region_database = DataBase(settings.UHI_REGION_DATABASE)
 52 |         self._decipher_gene_database = DataBase(
 53 |             settings.DECIPHER_GENE_DATABASE)
 54 |         self._ts_gene_database = DataBase(settings.TS_GENE_DATABASE)
 55 |         self._ts_region_database = DataBase(settings.TS_REGION_DATABASE)
 56 |         self._uts_gene_database = DataBase(settings.UTS_GENE_DATABASE)
 57 |         self._uts_region_database = DataBase(settings.UTS_REGION_DATABASE)
 58 |         self._dgv_gain_database = DataBase(settings.DGV_GAIN_DATABASE)
 59 |         self._dgv_loss_database = DataBase(settings.DGV_LOSS_DATABASE)
 60 |         self._gnomad_del_database = DataBase(settings.GNOMAD_DEL_DATABASE)
 61 |         self._gnomad_dup_database = DataBase(settings.GNOMAD_DUP_DATABASE)
 62 |         self._cnv_syndrome_del_database = DataBase(settings.CNV_SYNDROME_DEL_DATABASE)
 63 |         self._cnv_syndrome_dup_database = DataBase(settings.CNV_SYNDROME_DUP_DATABASE)
 64 |         self._cytoband_database = DataBase(settings.CYTO_BAND_FILE)
 65 |         self._exon_database = DataBase(settings.GENE_EXON_DATABASE)
 66 | 
 67 |         self.serializer = self._serializer # func compatible
 68 | 
 69 |     @staticmethod
 70 |     def _chrom_num(chrom):
 71 |         return re.sub('chr', '', str(chrom), flags=re.I)
 72 | 
 73 |     @staticmethod
 74 |     def _norm_chrom(ch):
 75 |         """
 76 |         normalize chromosome name, eg. 2 -> chr2, 23 -> chrX
 77 |         :param ch: input chromosome name
 78 |         :return: normalized name
 79 |         >>> norm_chrom(2)
 80 |         'chr2'
 81 |         >>> norm_chrom('Chr23')
 82 |         'chrX'
 83 |         """
 84 |         ch = AnnotateHelper._chrom_num(ch)
 85 |         if ch == '23':
 86 |             return 'chrX'
 87 |         if ch == '24':
 88 |             return 'chrY'
 89 |         return f'chr{ch}'
 90 | 
 91 |     @staticmethod
 92 |     def _annotate_loss(**annotation):
 93 |         """
 94 |         计算拷贝数减少的CNV的证据项
 95 |         :param annotation: 已注释的CNV
 96 |         :return: 注释后的CNV
 97 |         """
 98 |         loss = dict()
 99 | 
100 |         # Section 1
101 | 
102 |         if len(annotation['outer_overlap_genes']) + len(annotation['overlap_func_regions']) > 0:
103 |             loss['1A'] = True
104 |         else:
105 |             loss['1B'] = True
106 | 
107 |         # Section 2
108 | 
109 |         # hi区域
110 |         for region, overlap, coverage in annotation['overlap_hi_regions']:
111 |             if coverage == 1:  # 完全覆盖区域
112 |                 loss['2A'] = True
113 |             elif len(set(gene.symbol for gene, *_ in annotation['overlap_hi_genes'])) == 0:
114 |                 # 未覆盖hi基因
115 |                 loss['2B'] = True
116 | 
117 |         # hi基因
118 |         for gene, overlap, coverage in annotation['overlap_hi_genes']:
119 |             if coverage == 1:  # 完全覆盖基因
120 |                 loss['2A'] = True
121 |             elif overlap < 1:  # 是否位于基因内部
122 |                 if any(
123 |                         exon.last_exon == 'True'
124 |                         for exon, *_ in annotation['overlap_hi_exons'][gene.gene_id]
125 |                 ):  # 是否覆盖末位外显子
126 |                     if len(annotation['overlap_hi_exons'][gene.gene_id]) >= 2:
127 |                         # 覆盖超过两个外显子
128 |                         loss['2D-4'] = True
129 |                     elif gene.gene_id in annotation['overlap_hi_cds'] \
130 |                             and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:  # 是否覆盖CDS
131 |                         if len(annotation['variants']) > 0:  # 末位外显子是否有致病变异
132 |                             loss['2D-2'] = True
133 |                         else:  # 末尾外显子无致病变异
134 |                             loss['2D-3'] = True
135 |                     else:
136 |                         # 不覆盖CDS区
137 |                         loss['2D-1'] = True
138 |                 # 未覆盖末位外显子
139 |                 elif gene.gene_id in annotation['overlap_hi_cds'] \
140 |                         and len(annotation['overlap_hi_cds'][gene.gene_id]) > 0:  # 是否覆盖5'端CDS
141 |                     loss['2C-1'] = True
142 |                 else:  # 未覆盖5'端CDS
143 |                     loss['2C-2'] = True
144 |             # 位于基因内部
145 |             else:
146 |                 cnv = CNVRecord(
147 |                     annotation['chromosome'], annotation['inner_start'],
148 |                     annotation['inner_end'], annotation['func']
149 |                 )
150 |                 tx = get_transcript(gene.transcript, transcripts)
151 |                 pvs1 = PVS1CNV(cnv, None, tx)
152 |                 loss['2E'] = True
153 |                 # loss[PVS1[pvs1.verify_DEL()[0]]] = True
154 |                 try:  # HOTFIX: pvs1 error
155 |                     loss['pvs1'] = PVS1[pvs1.verify_DEL()[0]]
156 |                 except:
157 |                     print(cnv)
158 | 
159 |         # 包含预测HI基因
160 |         if len(annotation['overlap_hi_genes']) + len(annotation['overlap_hi_regions']) == 0 \
161 |                 and len(annotation['overlap_decipher_genes']) > 0:
162 |             loss['2H'] = True
163 | 
164 |         # 落入uhi基因
165 |         for gene, overlap, coverage in annotation['overlap_uhi_genes']:
166 |             if overlap == 1:
167 |                 loss['2F'] = True
168 | 
169 |         # 落入uhi区域
170 |         genes = set(gene.symbol for gene, *
171 |         _ in annotation['outer_overlap_genes'])
172 |         for region, overlap, coverage in annotation['overlap_uhi_regions']:
173 |             if len(genes - set(region.genes.split(','))) > 0:
174 |                 loss['2G'] = True
175 |             else:
176 |                 loss['2F'] = True
177 | 
178 |         # Section 3
179 | 
180 |         # 覆盖基因个数
181 |         gene_count = len(annotation['outer_overlap_genes'])
182 |         if gene_count >= 35:
183 |             loss['3C'] = True
184 |         elif gene_count >= 25:
185 |             loss['3B'] = True
186 |         elif gene_count >= 0:
187 |             loss['3A'] = True
188 | 
189 |         # Section 4
190 | 
191 |         # DGV金标和Gnomad
192 |         genes = set(gene.symbol for gene, *
193 |         _ in annotation['outer_overlap_genes'])
194 |         l, m = 0, 0
195 |         for record, overlap, coverage in chain(
196 |                 annotation['dgv_loss_records'], annotation['gnomad_del_records']
197 |         ):
198 |             if overlap == 1 and any(
199 |                     float(v) >= 0.01 for f, v in record._asdict().items() if f.startswith('af')
200 |             ):  # 完全覆盖待解读CNV且频率大于1%
201 |                 loss['4O'] = True
202 |                 break
203 |             elif overlap >= 0.5 and len(genes - set(record.genes.split(','))) == 0:
204 |                 # 与待解读CNV重叠超过50%且覆盖全部蛋白编码基因
205 |                 if any(float(v) < 0.01 for f, v in record._asdict().items() if f.startswith('af')):
206 |                     # 频率小于1%
207 |                     m += 1
208 |                 else:
209 |                     # 频率大于1%
210 |                     l += 1
211 |         else:
212 |             if l > 0 and m == 0:  # 存在频率大于1%且不存在小于1%的CNV
213 |                 loss['4O'] = True
214 | 
215 |         annotation['rules'] = loss
216 |         return annotation
217 | 
218 |     @staticmethod
219 |     def _annotate_gain(**annotation):
220 |         """
221 |         计算拷贝数减少的CNV的证据项
222 |         :param annotation: 已注释的CNV
223 |         :return: 注释后的CNV
224 |         """
225 |         gain = dict()
226 | 
227 |         # Section 1
228 | 
229 |         if len(annotation['outer_overlap_genes']) + len(annotation['overlap_func_regions']) > 0:
230 |             gain['1A'] = True
231 |         else:
232 |             gain['1B'] = True
233 | 
234 |         # Section 2
235 | 
236 |         # 完全覆盖ts区域
237 |         for region, overlap, coverage in annotation['overlap_ts_regions']:
238 |             if coverage == 1:  # 是否覆盖整改区域
239 |                 gain['2A'] = True
240 |             elif len(set(gene.symbol for gene, *_ in annotation['overlap_ts_genes'])) == 0:
241 |                 # 未覆盖ts基因
242 |                 gain['2B'] = True
243 | 
244 |         for gene, overlap, coverage in annotation['overlap_ts_genes']:
245 |             # 覆盖整个基因
246 |             if coverage == 1:
247 |                 gain['2A'] = True
248 | 
249 |         # 落入uts基因
250 |         for gene, overlap, coverage in annotation['overlap_uts_genes']:
251 |             if overlap == 1:
252 |                 gain['2D'] = True
253 | 
254 |         # 落入uts区域
255 |         for region, overlap, coverage in annotation['overlap_uts_regions']:
256 |             genes = set(gene.symbol for gene, *
257 |             _ in annotation['inner_overlap_genes'])
258 |             region_genes = set(region.genes.split(','))
259 |             if overlap == coverage == 1:  # 与良性区域完全一致
260 |                 gain['2C'] = True
261 |             elif len(genes - region_genes) > 0:  # 编码蛋白基因比良性区域多
262 |                 gain['2G'] = True
263 |             # 破坏蛋白编码基因
264 |             elif any(c < 1 for *_, c in annotation['inner_overlap_genes']):
265 |                 gain['2E'] = True
266 |             elif overlap == 1:  # 被良性区域完全覆盖
267 |                 gain['2D'] = True
268 |             else:
269 |                 gain['2F'] = True
270 | 
271 |         # hi基因
272 |         hi_genes = set()
273 |         for gene, overlap, coverage in annotation['overlap_hi_genes']:
274 |             hi_genes.add(gene.symbol)
275 |             if coverage == 1:  # 完全覆盖
276 |                 gain['2H'] = True
277 |             elif overlap == 1:  # 两端均位于基因内
278 |                 cnv = CNVRecord(
279 |                     annotation['chromosome'], annotation['inner_start'],
280 |                     annotation['inner_end'], annotation['func']
281 |                 )
282 |                 tx = get_transcript(gene.transcript, transcripts)
283 |                 pvs1 = PVS1CNV(cnv, None, tx)
284 |                 gain['2I'] = True
285 |                 # gain[PVS1[pvs1.verify_DUP()[0]]] = True
286 |                 gain['pvs1'] = PVS1[pvs1.verify_DUP()[0]]
287 | 
288 |         # 非hi基因
289 |         for gene, overlap, coverage in annotation['inner_overlap_genes']:
290 |             if gene.symbol not in hi_genes and coverage != 1:
291 |                 gain['2L'] = True
292 |                 annotation['break_point_genes'].append(gene.symbol)
293 | 
294 |         # Section 3
295 | 
296 |         # 覆盖基因个数
297 |         gene_count = len(annotation['inner_overlap_genes'])
298 |         if gene_count >= 50:
299 |             gain['3C'] = True
300 |         elif gene_count >= 35:
301 |             gain['3B'] = True
302 |         elif gene_count >= 0:
303 |             gain['3A'] = True
304 | 
305 |         # Section 4
306 | 
307 |         # DGV金标和Gnomad
308 |         genes = set(gene.symbol for gene, *
309 |         _ in annotation['outer_overlap_genes'])
310 |         l, m = 0, 0
311 |         for record, overlap, coverage in chain(
312 |                 annotation['dgv_gain_records'], annotation['gnomad_dup_records']
313 |         ):
314 |             if overlap == 1 and any(
315 |                     float(v) >= 0.01 for f, v in record._asdict().items() if f.startswith('af')
316 |             ):  # 完全覆盖待解读CNV且频率大于1%
317 |                 gain['4O'] = True
318 |                 break
319 |             elif overlap >= 0.5 and len(genes - set(record.genes.split(','))) == 0:
320 |                 # 与待解读CNV重叠超过50%且覆盖全部蛋白编码基因
321 |                 if any(float(v) < 0.01 for f, v in record._asdict().items() if
322 |                        f.startswith('af')):
323 |                     # 频率小于1%
324 |                     m += 1
325 |                 else:
326 |                     # 频率大于1%
327 |                     l += 1
328 |         else:
329 |             if l > 0 and m == 0:  # 存在频率大于1%且不存在小于1%的CNV
330 |                 gain['4O'] = True
331 | 
332 |         annotation['rules'] = gain
333 |         return annotation
334 | 
335 |     @staticmethod
336 |     def merge_score(func, **rules):
337 |         """
338 |         整合所有证据项得分
339 |         :param func: 变异类型
340 |         :param rules: 证据项
341 |         :return: 生成各证据项得分
342 |         """
343 |         groups = defaultdict(list)
344 |         for rule, score in rules.items():
345 |             try:  # 需要分组计分的证据项先收集起来
346 |                 groups[SCORE_GROUP[func][rule]].append(score)
347 |             except KeyError:  # 无需分组计分的证据项直接计分
348 |                 yield score
349 |         for _, scores in groups.items():  # 分组计分的证据项只计算最大分值
350 |             yield max(scores)
351 | 
352 |     @staticmethod
353 |     def judge(func, **rules):
354 |         """
355 |         判断给定的证据项组合最终的致病性
356 |         :param func: 变异类型
357 |         :param rules: 勾选的证据项
358 |         :return: 证据项、得分和致病性
359 |         """
360 |         # 获取所有证据项得分
361 |         # rules = {
362 |         #     rule: settings.DEFAULT_SCORE[func][rule] for rule, check in rules.items() if check
363 |         # }
364 |         rules_value = {}
365 |         for rule, check in rules.items():
366 |             if check in PVS1.values():
367 |                 rules_value['pvs1'] = settings.DEFAULT_SCORE[func][check]
368 |             elif check:
369 |                 rules_value[rule] = settings.DEFAULT_SCORE[func][rule]
370 |         # 整合所有证据项得分
371 |         score = sum(AnnotateHelper.merge_score(func, **rules_value))
372 |         # 判断致病性
373 |         for op, cutoff, level in PATHOGENICITY_LEVELS[:-1]:
374 |             if op(score, cutoff):
375 |                 pathogenicity = level
376 |                 break
377 |         else:
378 |             pathogenicity = PATHOGENICITY_LEVELS[-1][2]
379 |         return rules_value, score, pathogenicity
380 | 
381 |     def annotate(self, chromosome, start, end, func, error=0, **kwargs):
382 |         """
383 |         对给定CNV进行注释
384 |         :param chromosome: 染色体编号
385 |         :param start: 起始位置
386 |         :param end: 终止位置
387 |         :param func: 变异类型
388 |         :param error: 误差值
389 |         :return: 注释结果
390 |         """
391 |         chromosome = self._norm_chrom(chromosome)
392 |         annotation = dict(
393 |             chromosome=chromosome, start=start, end=end,
394 |             length=end - start, error=error,
395 |             outer_start=start - error, outer_end=end + error,
396 |             inner_start=start + error, inner_end=end - error,
397 |             func=func, break_point_genes=list()
398 |         )
399 | 
400 |         annotation['inner_overlap_genes'] = list(self._gene_database.overlap(
401 |             chromosome, annotation['inner_start'], annotation['inner_end'],
402 |         ))
403 | 
404 |         annotation['outer_overlap_genes'] = list(self._gene_database.overlap(
405 |             chromosome, annotation['outer_start'], annotation['outer_end'],
406 |         ))
407 | 
408 |         annotation['overlap_omim_genes'] = list(self._omim_gene_database.overlap(
409 |             chromosome, annotation['inner_start'], annotation['inner_end']
410 |         ))
411 | 
412 |         annotation['overlap_func_regions'] = list(self._func_region_database.overlap(
413 |             chromosome, annotation['outer_start'], annotation['outer_end']
414 |         ))
415 | 
416 |         annotation['overlap_hi_genes'] = list(self._hi_gene_database.overlap(
417 |             chromosome, annotation['inner_start'], annotation['inner_end']
418 |         ))
419 | 
420 |         annotation['overlap_hi_exons'] = self._hi_exon_database.overlap_groups(
421 |             chromosome, annotation['inner_start'], annotation['inner_end'],
422 |             lambda record: record[0].gene_id
423 |         )
424 | 
425 |         annotation['overlap_hi_cds'] = self._hi_cds_database.overlap_groups(
426 |             chromosome, annotation['inner_start'], annotation['inner_end'],
427 |             lambda record: record[0].gene_id
428 |         )
429 | 
430 |         try:
431 |             annotation['variants'] = list(self._clinvar_pathogenic_database.fetch(
432 |                 chromosome, annotation['inner_start'], annotation['inner_end'])
433 |             )
434 |         except ValueError:
435 |             annotation['variants'] = []
436 | 
437 |         annotation['overlap_hi_regions'] = list(self._hi_region_database.overlap(
438 |             chromosome, annotation['inner_start'], annotation['inner_end']
439 |         ))
440 | 
441 |         annotation['overlap_decipher_genes'] = list(self._decipher_gene_database.overlap(
442 |             chromosome, annotation['inner_start'], annotation['inner_end']
443 |         ))
444 | 
445 |         annotation['overlap_uhi_genes'] = list(self._uhi_gene_database.overlap(
446 |             chromosome, annotation['outer_start'], annotation['outer_end']
447 |         ))
448 | 
449 |         annotation['overlap_uhi_regions'] = list(self._uhi_region_database.overlap(
450 |             chromosome, annotation['outer_start'], annotation['outer_end']
451 |         ))
452 | 
453 |         annotation['overlap_ts_genes'] = list(self._ts_gene_database.overlap(
454 |             chromosome, annotation['inner_start'], annotation['inner_end']
455 |         ))
456 | 
457 |         annotation['overlap_ts_regions'] = list(self._ts_region_database.overlap(
458 |             chromosome, annotation['inner_start'], annotation['inner_end']
459 |         ))
460 | 
461 |         annotation['overlap_uts_genes'] = list(self._uts_gene_database.overlap(
462 |             chromosome, annotation['outer_start'], annotation['outer_end']
463 |         ))
464 | 
465 |         annotation['overlap_uts_regions'] = list(self._uts_region_database.overlap(
466 |             chromosome, annotation['outer_start'], annotation['outer_end']
467 |         ))
468 | 
469 |         annotation['dgv_gain_records'] = list(self._dgv_gain_database.overlap(
470 |             chromosome, annotation['outer_start'], annotation['outer_end']
471 |         ))
472 | 
473 |         annotation['dgv_loss_records'] = list(self._dgv_loss_database.overlap(
474 |             chromosome, annotation['outer_start'], annotation['outer_end']
475 |         ))
476 | 
477 |         annotation['gnomad_del_records'] = list(self._gnomad_del_database.overlap(
478 |             chromosome, annotation['outer_start'], annotation['outer_end']
479 |         ))
480 | 
481 |         annotation['gnomad_dup_records'] = list(self._gnomad_dup_database.overlap(
482 |             chromosome, annotation['outer_start'], annotation['outer_end']
483 |         ))
484 | 
485 |         annotation['cnv_syndrome_loss'] = list(self._cnv_syndrome_del_database.overlap(
486 |             chromosome, annotation['outer_start'], annotation['outer_end']
487 |         ))
488 |         annotation['cnv_syndrome_gain'] = list(self._cnv_syndrome_dup_database.overlap(
489 |             chromosome, annotation['outer_start'], annotation['outer_end']
490 |         ))
491 | 
492 |         annotation['cyto_band'] = list(self._cytoband_database.overlap(
493 |             chromosome, annotation['outer_start'], annotation['outer_end']
494 |         ))
495 | 
496 |         annotation['exon'] = list(
497 |             self._exon_database.overlap(
498 |                 chromosome, annotation['outer_start'], annotation['outer_end']
499 |             )
500 |         )
501 | 
502 |         if func == 'del':
503 |             annotation = self._annotate_loss(**annotation)
504 |         elif func == 'dup':
505 |             annotation = self._annotate_gain(**annotation)
506 |         else:
507 |             raise ValueError('Unknown func `{}`'.format(func))
508 | 
509 |         annotation['rules'], annotation['score'], annotation['pathogenicity'] = self.judge(
510 |             func, **annotation['rules']
511 |         )
512 |         # PVS1
513 |         if func == 'del' and '2E' in annotation['rules'].keys():
514 |             annotation['rules']['2E'] = annotation['rules'].get('pvs1')
515 |         elif func == 'dup' and '2I' in annotation['rules'].keys():
516 |             annotation['rules']['2I'] = annotation['rules'].get('pvs1')
517 |         annotation['pvs1'] = annotation['rules'].pop('pvs1', DEFAULT_EMPTY_VALUE)
518 | 
519 |         return annotation
520 | 
521 |     def _serializer(self, anno_result):
522 |         seri = {}
523 |         cyto_band_li = [x[0].name for x in anno_result['cyto_band']]
524 |         if len(cyto_band_li) == 0:
525 |             cyto_str = DEFAULT_EMPTY_VALUE
526 |         elif len(cyto_band_li) == 1:
527 |             cyto_str = f'{AnnotateHelper._chrom_num(anno_result["chromosome"])}{cyto_band_li[0]}'
528 |         else:
529 |             cyto_str = f'{AnnotateHelper._chrom_num(anno_result["chromosome"])}{cyto_band_li[0]}{cyto_band_li[-1]}'
530 | 
531 |         seri['cyto_band'] = cyto_str
532 | 
533 |         seri['inner_gene'] = ','.join(
534 |             x[0].symbol for x in anno_result['inner_overlap_genes'])
535 |         seri['inner_omim_gene'] = ','.join(
536 |             x[0].symbol for x in anno_result['overlap_omim_genes'])
537 |         if len(anno_result['exon']) == 0:
538 |             seri['exon'] = DEFAULT_EMPTY_VALUE
539 |         elif len(anno_result) == 1:
540 |             seri['exon'] = f"{anno_result['exon'][0][0].symbol}_EX{anno_result['exon'][0][0].exon_number}"
541 |         else:
542 |             seri['exon'] = f"{anno_result['exon'][0][0].symbol}_EX{anno_result['exon'][0][0].exon_number}" \
543 |                            f"-{anno_result['exon'][-1][0].symbol}_EX{anno_result['exon'][-1][0].exon_number}"
544 | 
545 |         seri['HI_gene'] = ','.join(
546 |             f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_hi_genes'])
547 |         seri['HI_region'] = SEP.join(
548 |             f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_hi_regions'])
549 |         seri['TS_gene'] = ','.join(
550 |             f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_ts_genes'])
551 |         seri['TS_region'] = ','.join(
552 |             f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_ts_regions'])
553 |         seri['Pred_HI_gene'] = ','.join(
554 |             f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_decipher_genes'])
555 |         seri['auto_evidence'] = ','.join(sorted(anno_result['rules']))
556 |         seri['auto_evidence_score'] = ','.join(
557 |             f'{k}:{anno_result["rules"][k]}' for k in sorted(anno_result['rules']))
558 |         seri['benign_hi_gene'] = ','.join(
559 |             f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uhi_genes'])
560 |         seri['benign_hi_region'] = ','.join(
561 |             f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uhi_regions'])
562 |         seri['benign_ts_gene'] = ','.join(
563 |             f'{x[0].symbol}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uts_genes'])
564 |         seri['benign_ts_region'] = ','.join(
565 |             f'{x[0].name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['overlap_uts_regions'])
566 |         seri['dgv_loss_records'] = ','.join(
567 |             f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in anno_result['dgv_loss_records']
568 |         )
569 |         seri['dgv_gain_records'] = ','.join(
570 |             f'{x[0].id}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in anno_result['dgv_gain_records']
571 |         )
572 |         seri['gnomad_loss_records'] = ','.join(
573 |             f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in
574 |             anno_result['gnomad_del_records']
575 |         )
576 |         seri['gnomad_gain_records'] = ','.join(
577 |             f'{x[0].chrom}:{x[0].start}-{x[0].end}(af: {float(x[0].af):.2e})({x[1]:.2%};{x[2]:.2%})' for x in
578 |             anno_result['gnomad_dup_records']
579 |         )
580 |         seri['cnv_syndrome_gain'] = ','.join(
581 |             f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['cnv_syndrome_gain']
582 |         )
583 |         seri['cnv_syndrome_loss'] = ','.join(
584 |             f'{x[0].disease_name}({x[1]:.2%};{x[2]:.2%})' for x in anno_result['cnv_syndrome_loss']
585 |         )
586 |         seri['auto_score'] = anno_result['score']
587 |         seri['auto_pathogenicity'] = anno_result['pathogenicity']
588 |         seri['pvs1'] = anno_result['pvs1']
589 |         return seri
590 | 
591 |     def _seri_anno(self, seri: pd.Series) -> pd.Series:
592 |         anno_result = self.annotate(**seri.rename(NAME_MAP).to_dict())
593 |         return seri.append(
594 |             pd.Series(self._serializer(anno_result)).replace('', '-').fillna(DEFAULT_EMPTY_VALUE))
595 | 
596 |     def annotation_file(self, file_path, result_path, col_map=None, cnv_map=None):
597 |         """
598 |         annotate specified file, required columns: chr, start, end, type, error
599 |         :param file_path: input file (TSV)
600 |         :param result_path: result file path (TSV)
601 |         :return: -
602 |         """
603 | 
604 |         if file_path.endswith('xlsx'):
605 |             input_df = pd.read_excel(file_path)
606 |         else:
607 |             input_df = pd.read_csv(file_path, sep='\t')
608 |         if col_map is not None:
609 |             input_df.rename(columns=col_map, inplace=True)
610 |         if cnv_map is not None:
611 |             input_df['type'] = input_df['type'].map(lambda x: cnv_map.get(x, x))
612 |         if 'error' not in input_df.columns:
613 |             input_df['error'] = 0
614 |         input_df['chr'] = input_df['chr'].map(self._norm_chrom)
615 |         try:
616 |             from tqdm import tqdm
617 |             tqdm.pandas()
618 |             input_df = input_df.progress_apply(self._seri_anno, axis=1)
619 |         except ImportError:
620 |             input_df = input_df.apply(self._seri_anno, axis=1)
621 |         if result_path.endswith('xlsx'):
622 |             input_df.to_excel(result_path, index=False)
623 |         else:
624 |             input_df.to_csv(result_path, sep='\t', index=False)
625 | 


--------------------------------------------------------------------------------