├── lib2pass
│   ├── __init__.py
│   ├── merge.py
│   ├── filter.py
│   ├── minimap2.py
│   ├── seqlr.py
│   ├── fastaparse.py
│   ├── decisiontree.py
│   ├── bamparse.py
│   └── main.py
├── 2passtools.yml
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
--------------------------------------------------------------------------------
/lib2pass/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/2passtools.yml:
--------------------------------------------------------------------------------
name: 2passtools
channels:
  - conda-forge
  - bioconda
dependencies:
  - python=3.6
  - pip
  - numpy
  - click
  - click-log
  - pysam>=0.15
  - ncls
  - scikit-learn>=0.22
  - pip:
    - .
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='2passtools',
    version='0.3.1',
    description=(
        'two pass alignment of long noisy reads'
    ),
    author='Matthew Parker',
    entry_points={
        'console_scripts': [
            '2passtools = lib2pass.main:main',
        ]
    },
    packages=[
        'lib2pass',
    ],
    install_requires=[
        'numpy',
        'click',
        'click-log',
        'pysam',
        'ncls',
        'scikit-learn'
    ],
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Matthew Parker

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/lib2pass/merge.py:
--------------------------------------------------------------------------------
from .bamparse import merge_intron_res, build_donor_acceptor_ncls, assign_primary


def read_junc_bed(bed_fn):
    motifs = {}
    lengths = {}
    counts = {}
    intron_jads = {}
    with open(bed_fn) as bed:
        for record in bed:
            (chrom, start, end, motif, count, strand,
             jad, *_) = record.split()
            start = int(start)
            end = int(end)
            ln = end - start
            count = int(count)
            jad = int(jad)
            i = (chrom, start, end, strand)
            motifs[i] = motif
            lengths[i] = ln
            counts[i] = count
            intron_jads[i] = jad
    return motifs, lengths, counts, intron_jads


def get_merged_juncs(junc_bed_fns, primary_splice_local_dist=20):

    res = [read_junc_bed(fn) for fn in junc_bed_fns]
    motifs, lengths, counts, intron_jads = merge_intron_res(res)

    introns = list(motifs.keys())
    motifs = [motifs[i] for i in introns]
    lengths = [lengths[i] for i in introns]
    counts = [counts[i] for i in introns]
    jad_label = [intron_jads[i] for i in introns]

    itrees = build_donor_acceptor_ncls(
        introns, counts, jad_label, primary_splice_local_dist
    )
    is_primary_donor = []
    is_primary_acceptor = []
    for i in introns:
        d, a = assign_primary(*i, itrees)
        is_primary_donor.append(d)
        is_primary_acceptor.append(a)

    return (introns, motifs, lengths, counts, jad_label,
            is_primary_donor, is_primary_acceptor)
--------------------------------------------------------------------------------
/lib2pass/filter.py:
--------------------------------------------------------------------------------
import re
import math
from collections import defaultdict
from functools import partial


def nullable(val, cast):
    if val is None:
        return None
    else:
        return cast(val)


def eval_feature_expression(
        record,
        expression):
    (
        motif, count, jad_score,
        is_primary_donor, is_primary_acceptor,
        dt1_pred,
        donor_seq_score, acceptor_seq_score,
        dt2_pred
    ) = record
    safe_dict = {
        'motif': motif,
        'is_GTAG': motif == 'GTAG',
        'is_GCAG': motif == 'GCAG',
        'is_ATAG': motif == 'ATAG',
        'motif_regex_match': lambda expr: bool(re.match(expr, motif)),
        'count': count,
        'jad': jad_score,
        'primary_donor': bool(is_primary_donor),
        'primary_acceptor': bool(is_primary_acceptor),
        'donor_seq_score': donor_seq_score,
        'acceptor_seq_score': acceptor_seq_score,
        'decision_tree_1_pred': bool(dt1_pred),
        'decision_tree_2_pred': bool(dt2_pred),
        'sum': sum,
        'pow': pow,
        'min': min,
        'max': max,
        'math': math,
        'bool': bool,
        'int': partial(nullable, int),
        'str': partial(nullable, str),
        'float': partial(nullable, float),
        'len': partial(nullable, len),
    }
    res = eval(expression, {"__builtins__": None}, safe_dict)
    if not isinstance(res, bool):
        res = bool(res)
    return res


def read_junc_bed(bed_fn):
    records = {}
    with open(bed_fn) as bed:
        for record in bed:
            # columns 11 and 12 are the donor and acceptor LR scores,
            # in that order
            (chrom, start, end, motif, count, strand,
             jad, is_pd, is_pa, dt1, lrd, lra, dt2) = record.split()
            records[(chrom, start, end, strand)] = (
                motif, int(count),
                int(jad), int(is_pd), int(is_pa),
                int(dt1), float(lrd), float(lra), int(dt2)
            )
    return records


def apply_eval_expression(bed_fn, expression):
    for (chrom, start, end, strand), scores in read_junc_bed(bed_fn).items():
        r = eval_feature_expression(
            scores, expression
        )
        yield chrom, start, end, strand, r
--------------------------------------------------------------------------------
/lib2pass/minimap2.py:
--------------------------------------------------------------------------------
import os
import subprocess
import tempfile

MINIMAP2 = os.path.abspath(
    os.path.split(__file__)[0] +
    '/../external/minimap2/minimap2'
)


def subprocess_command(cmd, stdout_fn):
    with open(stdout_fn, 'w') as s:
        proc = subprocess.Popen(
            cmd,
            stdout=s,
            stderr=subprocess.PIPE
        )
        _, stderr = proc.communicate()
        if proc.returncode:
            raise subprocess.CalledProcessError(
                proc.returncode, cmd, stderr=stderr.decode()
            )
        else:
            return stderr.decode()


def map_with_minimap2(fastq_fn, reference_fn, output_fn, threads=1,
                      use_canon=False, noncanon_pen=9,
                      junc_bed=None, junc_bonus=9):
    if not os.path.exists(fastq_fn):
        raise OSError('fastq_fn not found')
    elif not os.path.exists(reference_fn):
        raise OSError('reference_fn not found')
    splice_flank = 'yes' if use_canon else 'no'
    noncanon_pen = noncanon_pen if use_canon else 0
    use_canon = 'f' if use_canon else 'n'
    s_handle, sam_fn = tempfile.mkstemp(suffix='.sam')
    b_handle, bam_fn = tempfile.mkstemp(suffix='.bam')

    # run minimap
    minimap2_cmd = [
        MINIMAP2, f'-t{threads}', '-k14', '-w5', '--splice',
        '-g2000', '-G10000', '-A1', '-B2', '-O2,32', '-E1,0',
        f'-C{noncanon_pen}', f'--splice-flank={splice_flank}', f'-u{use_canon}',
        '-z200', '-L', '--cs=long', '-a'
    ]
    if junc_bed is not None:
        minimap2_cmd += ['--junc-bed', junc_bed, f'--junc-bonus={junc_bonus}']
    minimap2_cmd += [reference_fn, fastq_fn]
    minimap2_stderr = subprocess_command(minimap2_cmd, sam_fn)

    # run samtools view
    samtools_view_cmd = ['samtools', 'view', '-bS', sam_fn]
    samtools_view_stderr = subprocess_command(samtools_view_cmd, bam_fn)

    # clean up minimap2 output
    os.close(s_handle)
    os.remove(sam_fn)

    # run samtools sort
    samtools_sort_cmd = ['samtools', 'sort', '-@', str(threads), '-o', '-', bam_fn]
    samtools_sort_stderr = subprocess_command(samtools_sort_cmd, output_fn)

    # clean up samtools view output
    os.close(b_handle)
    os.remove(bam_fn)

    # run samtools index
    samtools_index_cmd = ['samtools', 'index', output_fn]
    subprocess.check_call(samtools_index_cmd)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/
--------------------------------------------------------------------------------
/lib2pass/seqlr.py:
--------------------------------------------------------------------------------
from collections import defaultdict
import random
import logging
import itertools as it

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

from joblib import Parallel, delayed

from .fastaparse import get_junction_seqs


log = logging.getLogger('2passtools')


# dummy encoding: 'T' is represented implicitly as the all-zero vector
SEQ_OHE = {'A': [1, 0, 0],
           'C': [0, 1, 0],
           'G': [0, 0, 1],
           'T': [0, 0, 0]}


def one_hot_sequence(seq):
    ohe = []
    for base in seq:
        try:
            ohe.append(SEQ_OHE[base])
        except KeyError:
            # ambiguous bases are replaced with a random base
            ohe.append(SEQ_OHE[random.choice('ACGT')])
    return np.array(ohe)


def train_and_predict(X_train, y_train, X_test):
    lr = LogisticRegression(
        solver='lbfgs', penalty='l2',
        max_iter=500, n_jobs=1)
    lr.fit(X_train, y_train)
    return lr.predict_proba(X_test)[:, 1]


def kfold_oob_prediction(X_data, y_data, n_splits, processes=1):
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf_idx = list(kf.split(X_data))
    with Parallel(n_jobs=min(n_splits, processes)) as pool:
        preds = pool(
            delayed(train_and_predict)(X_data[train_idx],
                                       y_data[train_idx],
                                       X_data[test_idx])
            for train_idx, test_idx in kf_idx
        )
    test_idx = [tst for trn, tst in kf_idx]
    test_idx = np.concatenate(test_idx)
    preds = np.concatenate(preds)
    return preds[np.argsort(test_idx)]


def predict_splice_junctions_from_seq(introns, labels, fasta_fn, window_size,
                                      n_splits, processes):
    log.info(f'Fetching junction sequences from {fasta_fn}')
    (donors, donor_seqs, donor_labels,
     acceptors, acceptor_seqs, acceptor_labels) = get_junction_seqs(
        introns, labels, fasta_fn, window_size, processes
    )
    log.info(f'Identified {len(donors):d} unique donors and {len(acceptors):d} unique acceptors')
    donor_seq_ohe = np.array([one_hot_sequence(seq).ravel() for seq in donor_seqs])
    donor_labels = np.array(donor_labels)
    log.info('Scoring donor sequences with LR...')
    donor_preds = kfold_oob_prediction(
        donor_seq_ohe, donor_labels, n_splits, processes
    )
    donor_preds = {k: v for k, v in zip(donors, donor_preds)}
    acceptor_seq_ohe = np.array([one_hot_sequence(seq).ravel() for seq in acceptor_seqs])
    acceptor_labels = np.array(acceptor_labels)
    log.info('Scoring acceptor sequences with LR...')
    acceptor_preds = kfold_oob_prediction(
        acceptor_seq_ohe, acceptor_labels, n_splits, processes
    )
    acceptor_preds = {k: v for k, v in zip(acceptors, acceptor_preds)}
    donor_preds, acceptor_preds = get_donor_acceptor_preds_for_introns(
        introns, donor_preds, acceptor_preds
    )
    return donor_preds, acceptor_preds


def get_donor_acceptor_preds_for_introns(introns, donor_preds, acceptor_preds):
    intron_donor_preds = []
    intron_acceptor_preds = []
    for chrom, start, end, strand in introns:
        if strand == '+':
            donor_pos = start
            acceptor_pos = end
        else:
            donor_pos = end
            acceptor_pos = start
        intron_donor_preds.append(donor_preds[(chrom, donor_pos, strand)])
        intron_acceptor_preds.append(acceptor_preds[(chrom, acceptor_pos, strand)])
    return intron_donor_preds, intron_acceptor_preds
--------------------------------------------------------------------------------
/lib2pass/fastaparse.py:
--------------------------------------------------------------------------------
from collections import defaultdict

import numpy as np

import pysam
from joblib import Parallel, delayed


RC = str.maketrans('ACGT', 'TGCA')


def rev_comp(seq):
    return seq.translate(RC)[::-1]


def fetch_padded(fasta, chrom, pos, w):
    clen = fasta.get_reference_length(chrom)
    left = pos - w
    right = pos + w
    if left < 0:
        lpad = abs(left)
        left = 0
    else:
        lpad = 0
    if right > clen:
        rpad = right - clen
        right = clen
    else:
        rpad = 0
    seq = fasta.fetch(chrom, left, right)
    if lpad:
        seq = 'N' * lpad + seq
    if rpad:
        seq = seq + 'N' * rpad
    return seq


def _get_junc_seqs(bed_records, fasta_fn, window_size):
    intron_donor_labels = defaultdict(lambda: 0)
    intron_acceptor_labels = defaultdict(lambda: 0)
    intron_donor_seqs = {}
    intron_acceptor_seqs = {}
    w = window_size // 2
    with pysam.FastaFile(fasta_fn) as fasta:
        for chrom, start, end, strand, label in bed_records:
            donor_seq = fetch_padded(fasta, chrom, start, w)
            acceptor_seq = fetch_padded(fasta, chrom, end, w)
            if strand == '-':
                donor_seq, acceptor_seq = (
                    rev_comp(acceptor_seq), rev_comp(donor_seq)
                )
                donor_pos = end
                acceptor_pos = start
            else:
                donor_pos = start
                acceptor_pos = end

            intron_donor_seqs[(chrom, donor_pos, strand)] = donor_seq
            intron_donor_labels[(chrom, donor_pos, strand)] |= label
            intron_acceptor_seqs[(chrom, acceptor_pos, strand)] = acceptor_seq
            intron_acceptor_labels[(chrom, acceptor_pos, strand)] |= label

    return (intron_donor_seqs, intron_donor_labels,
            intron_acceptor_seqs, intron_acceptor_labels)


def chunk_records(introns, labels, processes):
    records = []
    for (chrom, start, end, strand), lab in zip(introns, labels):
        records.append((chrom, start, end, strand, lab))
    nrecords = len(records)
    n, r = divmod(nrecords, processes)
    split_points = ([0] + r * [n + 1] + (processes - r) * [n])
    split_points = np.cumsum(split_points)
    for i in range(processes):
        start = split_points[i]
        end = split_points[i + 1]
        yield records[start: end]


def or_update(d1, d2):
    for k, v in d2.items():
        d1[k] |= v
    return d1


def merge_parallel_junc_res(res):
    donor_seqs = {}
    donor_labels = defaultdict(lambda: 0)
    acceptor_seqs = {}
    acceptor_labels = defaultdict(lambda: 0)

    for ds, dl, as_, al in res:
        donor_seqs.update(ds)
        acceptor_seqs.update(as_)
        donor_labels = or_update(donor_labels, dl)
        acceptor_labels = or_update(acceptor_labels, al)

    donors = list(donor_seqs.keys())
    donor_seqs = [donor_seqs[d] for d in donors]
    donor_labels = [donor_labels[d] for d in donors]

    acceptors = list(acceptor_seqs.keys())
    acceptor_seqs = [acceptor_seqs[a] for a in acceptors]
    acceptor_labels = [acceptor_labels[a] for a in acceptors]

    return (donors, donor_seqs, donor_labels,
            acceptors, acceptor_seqs, acceptor_labels)


def get_junction_seqs(introns, labels, fasta_fn, window_size, processes=12):
    with Parallel(n_jobs=processes) as pool:
        res = pool(
            delayed(_get_junc_seqs)(introns, fasta_fn, window_size)
            for introns in chunk_records(introns, labels, processes)
        )

    (donors, donor_seqs, donor_labels,
     acceptors, acceptor_seqs, acceptor_labels) = merge_parallel_junc_res(res)

    return (donors, donor_seqs, donor_labels,
            acceptors, acceptor_seqs, acceptor_labels)
--------------------------------------------------------------------------------
/lib2pass/decisiontree.py:
--------------------------------------------------------------------------------
import logging
from operator import itemgetter
import re
import numpy as np

from sklearn.preprocessing import quantile_transform
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import ExtraTreesClassifier


log = logging.getLogger('2passtools')


DT1_DENOVO_FEATURES = [
    'is_canonical_motif', 'jad',
    'is_primary_donor', 'is_primary_acceptor',
    'intron_length_quantile',
]

DT2_DENOVO_FEATURES = [
    'jad', 'is_primary_donor', 'is_primary_acceptor',
    'intron_length_quantile',
    'donor_lr_score', 'acceptor_lr_score',
]


def format_feature_importances(feature_names, feature_importances, width=10):
    max_size = max(feature_importances)
    point_size = max_size / width
    pad_to = max([len(x) for x in feature_names])
    feature_importances = {fn: fi for fn, fi in zip(feature_names,
                                                    feature_importances)}
    feature_importances = sorted(feature_importances.items(), key=itemgetter(1), reverse=True)
    fmt = ''
    for fn, fi in feature_importances:
        rpad = ' ' * (pad_to - len(fn))
        fn += rpad
        bar = '*' * int(round(fi / point_size))
        fmt += f'{fn} {bar} {fi:.1f}\n'
    return fmt


def _de_novo_pred(X, y, feature_names, classifier='decision_tree'):
    if classifier == 'random_forest':
        log.info('Using extremely randomised trees classifier')
        clf = ExtraTreesClassifier(n_estimators=250, bootstrap=True, oob_score=True)
        clf.fit(X, y)
        log.debug('Feature importance:')
        log.debug(format_feature_importances(feature_names, clf.feature_importances_))
        pred = clf.oob_decision_function_[:, 1]
        # in the unlikely event dt1_pred contains NaNs
        # (can happen when n_estimators is not big enough)
        pred[np.isnan(pred)] = 0
        pred = pred >= 0.5

    else:
        clf = DecisionTreeClassifier(
            max_depth=5,
            min_samples_split=100,
            min_impurity_decrease=0.005,
        )
        clf.fit(X, y)
        log.debug('Tree structure:')
        log.debug(export_text(clf, feature_names=feature_names))
        pred = clf.predict(X)
    return pred.astype(int)


def dt1_pred(intron_motif, jad_labels, is_primary_donor, is_primary_acceptor,
             motif_regex='GTAG|GCAG|ATAG', jad_size_threshold=4):
    motif_regex = re.compile(motif_regex)
    is_canon = np.asarray([bool(motif_regex.match(m)) for m in intron_motif])

    jad_labels = np.asarray(jad_labels) >= jad_size_threshold

    is_primary_donor = np.asarray(is_primary_donor, dtype=bool)
    is_primary_acceptor = np.asarray(is_primary_acceptor, dtype=bool)

    is_primary = is_primary_donor & is_primary_acceptor
    return (jad_labels & is_canon) | (is_primary & is_canon)


def dt1_de_novo_pred(intron_motif, intron_lengths,
                     jad_labels, is_primary_donor, is_primary_acceptor,
                     is_annot, motif_regex='GTAG|GCAG|ATAG',
                     classifier='decision_tree'):
    motif_regex = re.compile(motif_regex)
    is_canon = np.asarray([int(bool(motif_regex.match(m))) for m in intron_motif])

    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)
    ).ravel()

    X = np.stack(
        [
            is_canon, jad_labels,
            is_primary_donor, is_primary_acceptor,
            intron_length_quantile
        ],
        axis=1
    )
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT1_DENOVO_FEATURES, classifier=classifier)
    return pred


def dt2_pred(jad_labels,
             is_primary_donor,
             is_primary_acceptor,
             donor_lr_score,
             acceptor_lr_score,
             low_conf_thresh=0.1,
             high_conf_thresh=0.6,
             jad_size_threshold=4):

    jad_labels = np.asarray(jad_labels) >= jad_size_threshold
    is_primary_donor = np.asarray(is_primary_donor, dtype=bool)
    is_primary_acceptor = np.asarray(is_primary_acceptor, dtype=bool)
    donor_lr_score = np.asarray(donor_lr_score, dtype=np.float64)
    acceptor_lr_score = np.asarray(acceptor_lr_score, dtype=np.float64)

    is_primary = is_primary_donor & is_primary_acceptor

    seq_low_conf = ((donor_lr_score >= low_conf_thresh) &
                    (acceptor_lr_score >= low_conf_thresh))
    seq_high_conf = ((donor_lr_score >= high_conf_thresh) &
                     (acceptor_lr_score >= high_conf_thresh))

    return (jad_labels & seq_low_conf) | (is_primary & seq_high_conf)


def dt2_de_novo_pred(intron_lengths, jad_labels,
                     is_primary_donor, is_primary_acceptor,
                     donor_lr_score, acceptor_lr_score,
                     is_annot, classifier='decision_tree'):
    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)
    ).ravel()

    X = np.stack(
        [
            jad_labels, is_primary_donor, is_primary_acceptor,
            intron_length_quantile, donor_lr_score, acceptor_lr_score,
        ],
        axis=1
    )
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT2_DENOVO_FEATURES, classifier=classifier)
    return pred
--------------------------------------------------------------------------------
/lib2pass/bamparse.py:
--------------------------------------------------------------------------------
from collections import defaultdict, Counter
import re

import numpy as np
import pysam
from joblib import Parallel, delayed
from ncls import NCLS


CS_SPLITTER = '([-+*~=:])'
RC = str.maketrans('ACGTN', 'TGCAN')


def parse_cs_tag(cs_tag):
    '''
    generalisable function for parsing minimap2 cs tag (long and short form)
    '''
    cs_tag = re.split(CS_SPLITTER, cs_tag)[1:]
    cs_ops = cs_tag[::2]
    cs_info = cs_tag[1::2]
    cs_parsed = []
    for op, info in zip(cs_ops, cs_info):
        if op == '=':
            # long form match
            cs_parsed.append(('=', len(info), info))
        elif op == ':':
            # short form match
            cs_parsed.append(('=', int(info), None))
        elif op == '*':
            # mismatch
            ref = info[0]
            alt = info[1]
            cs_parsed.append((op, 1, (ref, alt)))
        elif op == '+' or op == '-':
            cs_parsed.append((op, len(info), info))
        elif op == '~':
            donor_motif, intron_length, acceptor_motif = re.match(
                '^([acgtn]{2})([0-9]+)([acgtn]{2})', info).groups()
            motif = (donor_motif + acceptor_motif).upper()
            intron_length = int(intron_length)
            cs_parsed.append((op, intron_length, motif))
    return cs_parsed


def get_junction_overhang_size(overhang_cs):
    '''
    for cs tag split at intron (and reoriented so nearest op to intron is first)
    returns the overhang size (number of nt which match before
    first mismatch, insertion or deletion)
    '''
    try:
        return overhang_cs[0][1] if overhang_cs[0][0] == '=' else 0
    except IndexError:
        # sometimes when junctions are provided minimap2 can produce alignments
        # where an annotated junction is used with no overhang on the other
        # side!!
        return 0


def infer_strand_from_intron_motifs(intron_motifs, read_strand):
    strand_counts = Counter()
    for motif in intron_motifs:
        if re.match('G[TC]AG', motif):
            strand_counts['+'] += 1
        elif re.match('CT[AG]C', motif):
            strand_counts['-'] += 1
        else:
            strand_counts['.'] += 1

    if strand_counts['+'] == strand_counts['-']:
        return read_strand
    elif strand_counts['+'] > strand_counts['-']:
        return '+'
    else:
        return '-'


def find_introns(aln, stranded=True):
    '''
    use the cs tag to find introns and their match overhangs in the alignment
    '''
    introns = []
    intron_motifs = []
    chrom = aln.reference_name
    start = aln.reference_start
    end = aln.reference_end
    read_strand = '+-'[aln.is_reverse]
    pos = start
    cs_tag = parse_cs_tag(aln.get_tag('cs'))
    for i, (op, ln, info) in enumerate(cs_tag):
        if op == '+':
            # insertion does not consume reference
            continue
        elif op in ('=', '*', '-'):
            # match, mismatch, deletion consume reference
            pos += ln
        elif op == '~':
            # intron consumes reference and is recorded
            left = pos
            right = left + ln
            left_tag = cs_tag[:i][::-1]
            right_tag = cs_tag[i + 1:]
            junc_overhang = min(
                get_junction_overhang_size(left_tag),
                get_junction_overhang_size(right_tag)
            )
            # info is intron motif
            introns.append([left, right, junc_overhang, ln, info])
            intron_motifs.append(info)
            pos = right

    # infer strand and yield introns
    if stranded:
        strand = read_strand
    else:
        strand = infer_strand_from_intron_motifs(intron_motifs, read_strand)

    for start, end, overhang, length, motif in introns:
        if strand == '-':
            motif = motif.translate(RC)[::-1]
        yield chrom, start, end, strand, motif, overhang, length


def build_donor_acceptor_ncls(introns, intron_counts, intron_jads, dist=20):
    donor_invs = defaultdict(Counter)
    acceptor_invs = defaultdict(Counter)
    donor_inv_jads = defaultdict(Counter)
    acceptor_inv_jads = defaultdict(Counter)

    for (chrom, start, end, strand), count, jad in zip(introns, intron_counts, intron_jads):
        if strand == '+':
            donor_inv = (start - dist, start + dist, start)
            acceptor_inv = (end - dist, end + dist, end)
        else:
            donor_inv = (end - dist, end + dist, end)
            acceptor_inv = (start - dist, start + dist, start)
        donor_invs[(chrom, strand)][donor_inv] += count
        acceptor_invs[(chrom, strand)][acceptor_inv] += count

        # jad is used to break count ties
        donor_inv_jads[(chrom, strand)][donor_inv] = max(
            donor_inv_jads[(chrom, strand)][donor_inv], jad
        )
        acceptor_inv_jads[(chrom, strand)][acceptor_inv] = max(
            acceptor_inv_jads[(chrom, strand)][acceptor_inv], jad
        )

    da_itree = {}
    for label, invs, inv_jads in zip(['donor', 'acceptor'],
                                     [donor_invs, acceptor_invs],
                                     [donor_inv_jads, acceptor_inv_jads]):
        da_itree[label] = {}
        for chrom, pos in invs.items():
            jads = [inv_jads[chrom][i] for i in pos]
            starts, ends, mids, counts = zip(*[(s, e, m, c) for (s, e, m), c in pos.items()])
            starts = np.array(starts, dtype=np.int64)
            ends = np.array(ends, dtype=np.int64)
            idx = np.array(mids, dtype=np.int64)
            counts = {i: (c, j) for i, c, j in zip(mids, counts, jads)}
            itree = NCLS(starts, ends, idx)
            da_itree[label][chrom] = (itree, counts)
    return da_itree


def assign_primary(chrom, start, end, strand, inv_trees):
    donor_pos = start if strand == '+' else end
    acceptor_pos = end if strand == '+' else start

    is_primary = {}
    for label, pos in zip(['donor', 'acceptor'],
                          [donor_pos, acceptor_pos]):
        itree, counts = inv_trees[label][(chrom, strand)]
        max_count = 0
        max_jad = 0
        for _, _, ov_pos in itree.find_overlap(pos, pos):
            if ov_pos != pos:
                c, j = counts[ov_pos]
                max_count = max(max_count, c)
                max_jad = max(max_jad, j)
        if max_count < counts[pos][0]:
            is_primary[label] = True
        elif (max_count == counts[pos][0]) & (max_jad < counts[pos][1]):
            # break count ties with jad
            is_primary[label] = True
        else:
            is_primary[label] = False
    return is_primary['donor'], is_primary['acceptor']


def fetch_introns_for_interval(bam_fn, chrom, start, end, stranded):
    motifs = {}
    lengths = {}
    counts = Counter()
    intron_jads = Counter()
    with pysam.AlignmentFile(bam_fn) as bam:
        for aln in bam.fetch(chrom, start, end):
            # to prevent double counting of introns, ignore alns
            # which start before beginning of specified interval
            if aln.reference_start < start:
                continue
            for *i, m, ov, ln in find_introns(aln, stranded):
                i = tuple(i)
                motifs[i] = m
                lengths[i] = ln
                counts[i] += 1
                intron_jads[i] = max(intron_jads[i], ov)
    return motifs, lengths, counts, intron_jads


def get_bam_intervals(bam_fn, batch_size):
    with pysam.AlignmentFile(bam_fn) as bam:
        references = {ref: ln for ref, ln in zip(bam.references, bam.lengths)}
        for ref, ref_len in references.items():
            for i in range(0, ref_len, batch_size):
                query = (ref, i, min(ref_len, i + batch_size))
                yield query


def merge_intron_res(res):
    motifs = {}
    lengths = {}
    counts = Counter()
    intron_jads = Counter()
    for m, l, c, j in res:
        motifs.update(m)
        lengths.update(l)
        counts += c
        for i, jad in j.items():
            intron_jads[i] = max(intron_jads[i], jad)
    return motifs, lengths, counts, intron_jads


def parse_introns(bam_fn, primary_splice_local_dist,
                  stranded, batch_size, processes):
    '''
    find all introns in the dataset, label them as positive or negative
    training examples using the simple jad filter and then extract their
    sequences from the reference for training the logistic regression models
    '''
    with Parallel(n_jobs=processes) as pool:
        res = pool(
            delayed(fetch_introns_for_interval)(
                bam_fn, *inv, stranded)
            for inv in get_bam_intervals(bam_fn, batch_size)
        )
    motifs, lengths, counts, intron_jads = merge_intron_res(res)

    introns = list(motifs.keys())
    motifs = [motifs[i] for i in introns]
    lengths = [lengths[i] for i in introns]
    counts = [counts[i] for i in introns]
    jad_label = [intron_jads[i] for i in introns]

    itrees = build_donor_acceptor_ncls(
        introns, counts, jad_label, primary_splice_local_dist
    )
    is_primary_donor = []
    is_primary_acceptor = []
    for i in introns:
        d, a = assign_primary(*i, itrees)
        is_primary_donor.append(d)
        is_primary_acceptor.append(a)

    return (introns, motifs, lengths, counts, jad_label,
            is_primary_donor, is_primary_acceptor)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 2passtools

[![DOI](https://zenodo.org/badge/242980365.svg)](https://zenodo.org/badge/latestdoi/242980365)

A package for filtering splice junctions extracted from noisy long read alignments generated using minimap2. These can then be used to perform second pass alignment with minimap2, feeding in the junctions using the `--junc-bed` flag.

## Installation:

2passtools has been tested with python 3.6, and requires `numpy`, `scikit-learn`, `pysam`, `NCLS` and `click`. The easiest way to install it is using the conda environment yaml provided:

```
git clone https://www.github.com/bartongroup/2passtools
cd 2passtools
conda env create -f 2passtools.yml

source activate 2passtools
```

Alternatively, 2passtools and the required packages can be installed using pip:

```
pip install git+https://github.com/bartongroup/2passtools.git
```


## Use:

2passtools has three commands: `score`, `filter` and `merge`.

NB: There is a [snakemake](https://www.github.com/bartongroup/two_pass_alignment_pipeline) pipeline which can be used to run the benchmarking scripts used in the manuscript.

### `score`:

The `2passtools score` command requires as input a long read sequencing bam file aligned using minimap2 and a reference fasta file. It then extracts junction metrics and sequence information and uses them to score splice junctions found in the alignments. The output of `score` is a BED file with multiple columns corresponding to different metrics and model scores (see output below). This format cannot be passed to minimap2 directly as (A) it has not yet been filtered and (B) the extra column format is not supported by minimap2, which requires 6-column bed. Filtering and reformatting can be done using `2passtools filter`.

If you already have a reference annotation but want to discover novel splice junctions, consider using the **annotation-aided mode** of `2passtools score`. It takes an additional input: a bed file containing high-confidence splice junctions from an existing reference annotation. `2passtools` will use these as positive examples to train *de novo* models to detect novel splice junctions. It works best if the existing annotation is relatively complete but there are significant numbers of novel splice junctions: if the annotation is too incomplete, it is better to run 2passtools using the pre-trained model, and if there are very few novel splice junctions, it is better just to do reference-guided alignment (without `2passtools`). There are experiments which might help guide your decision in the [Genome Biology paper](https://doi.org/10.1186/s13059-021-02296-0).
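For example, assuming reads in `reads.fq` and a reference in `ref.fa` (placeholder filenames), a first pass alignment followed by junction scoring might look something like this:

```
# first pass alignment, keeping the long-form cs tag that 2passtools requires
minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq | samtools sort -o aligned.bam -
samtools index aligned.bam

# extract and score junctions from the alignment
2passtools score -o scored_juncs.bed -f ref.fa -p 4 aligned.bam

# annotation-aided mode: additionally pass high-confidence annotated junctions
# 2passtools score -o scored_juncs.bed -f ref.fa -a annot_juncs.bed aligned.bam
```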
#### Options:

```
$ 2passtools score --help
Usage: 2passtools score [OPTIONS] BAM_FN

  2passtools score: A tool for extracting and scoring junctions from a bam
  file aligned with minimap2. Filtered junctions can be used to realign
  reads in a second pass with minimap2.

  Bam file must be mapped with minimap2 and have the long form CS tag, e.g.

  minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq

Options:
  -o, --output-bed-fn TEXT        Output file path  [required]
  -f, --ref-fasta-fn TEXT         Path to the fasta file that reads were
                                  mapped to  [required]
  -a, --annot-bed-fn TEXT         Optional BED file containing annotated
                                  junctions
  -j, --jad-size-threshold INTEGER
                                  JAD to threshold at in the decision tree
  -d, --primary-splice-local-dist INTEGER
                                  Distance to search for alternative
                                  donor/acceptors when calculating primary d/a
  -m, --canonical-motifs TEXT     Intron motifs considered canonical in
                                  organism. Should be four char DNA motifs
                                  separated by vertical bar only
  -w, --lr-window-size INTEGER    Sequence size to extract to train logistic
                                  regression models
  -k, --lr-kfold INTEGER          Number of cross validation k-folds for
                                  logistic regression models
  -lt, --lr-low-confidence-threshold FLOAT
                                  Logistic regression low confidence threshold
                                  for decision tree 2
  -ht, --lr-high-confidence-threshold FLOAT
                                  Logistic regression high confidence
                                  threshold for decision tree 2
  -c, --classifier-type [decision_tree|random_forest]
                                  When annotated juncs are available, train
                                  this classifier type
  --keep-all-annot / --filter-annot
                                  When annotated juncs are available, always
                                  keep all annotated juncs
  --stranded / --unstranded       Whether input data is stranded or
                                  unstranded. direct RNA is stranded, cDNA
                                  often isn't
  -p, --processes INTEGER
  -s, --random-seed INTEGER
  -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or
                                  DEBUG
  --help                          Show this message and exit.
```

#### Output:

A 13-column BED file format with the following values:

```
1.  chrom (string)
2.  start (integer)
3.  end (integer)
4.  intron-motif (four char string)
5.  supporting read count (integer)
6.  strand (string, either '+' or '-')
7.  junction alignment distance metric (integer)
8.  primary donor metric (integer, either 0 or 1)
9.  primary acceptor metric (integer, either 0 or 1)
10. decision tree 1 output (integer, either 0 or 1)
11. logistic regression model donor score (float)
12. logistic regression model acceptor score (float)
13. decision tree 2 output (integer, either 0 or 1)
```

### `filter`:

The `2passtools filter` command can be used to filter the 13-column bed file using any expression utilising the metrics or model outputs. The expression should be a valid python expression which evaluates to `True` or `False` for each junction, and can use any of the following safe variables and functions:

* `motif`: The intron motif in ACGTN alphabet (`str`),
* `is_GTAG`: The intron motif is GU/AG (`bool`),
* `is_GCAG`: The intron motif is GC/AG (`bool`),
* `is_ATAG`: The intron motif is AU/AG (`bool`),
* `motif_regex_match`: safe function allowing regex matching of motif, e.g. `motif_regex_match("G[CT]AG")` (`func`),
* `count`: The supporting read count (`int`),
* `jad`: The junction alignment distance metric (`int`),
* `primary_donor`: The primary donor metric (`bool`),
* `primary_acceptor`: The primary acceptor metric (`bool`),
* `donor_seq_score`: The logistic regression model donor score (`float`),
* `acceptor_seq_score`: The logistic regression model acceptor score (`float`),
* `decision_tree_1_pred`: Decision tree model 1 output (`bool`),
* `decision_tree_2_pred`: Decision tree model 2 output (`bool`),
* `sum`, `pow`, `min`, `max`, `len`: python functions,
* `math`: The python `math` module, any function from it is usable,
* `bool`, `int`, `str`, `float`: python functions.

For example:

* `2passtools filter --exprs 'jad > 3'` filters for junction alignment distance of 4 nt or more.
* `2passtools filter --exprs 'decision_tree_2_pred'` filters for junctions that pass the second decision tree model.

etc. A complete invocation is shown below.
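Reusing the placeholder filenames from the `score` example above, a typical filter plus second pass might look like:

```
# keep junctions passing the second decision tree and write 6-column bed
2passtools filter -o filtered_juncs.bed --exprs 'decision_tree_2_pred' scored_juncs.bed

# second pass alignment, feeding the filtered junctions back to minimap2
minimap2 -a --cs=long -k14 -x splice --junc-bed filtered_juncs.bed ref.fa reads.fq | samtools sort -o aligned_pass2.bam -
```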
#### Usage:

```
$ 2passtools filter --help
Usage: 2passtools filter [OPTIONS] BED_FN

  2passtools filter: Convenience tool to filter a junction bed and produce
  6-column bed format which is compatible with minimap2.

Options:
  -o, --output-bed-fn TEXT  [required]
  --exprs TEXT
  -v, --verbosity LVL       Either CRITICAL, ERROR, WARNING, INFO or DEBUG
  --help                    Show this message and exit.
```

### `merge`:

The `2passtools merge` command is similar to `score`, but takes multiple 13-column bed files produced by `score` and merges them, recalculating metrics and model stats, to produce a unified junction set. This is useful for making sure all replicates are aligned similarly, and often alignment is improved by borrowing power across replicates. Output is in the same 13-column BED format as `score`.
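For example, with scored junction beds from three replicates (placeholder filenames), a merged junction set could be produced with:

```
2passtools merge -o merged_juncs.bed -f ref.fa \
    scored_rep1.bed scored_rep2.bed scored_rep3.bed
```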
#### Usage:

```
$ 2passtools merge --help
Usage: 2passtools merge [OPTIONS] BED_FNS...

  2passtools merge: Merges bed files produced by 2passtools score on
  individual replicates and recalculates junction strength metrics. Can be
  used to create a unified junction set to realign reads from different
  replicates.

  Bed files should be in the 13 column format produced by 2passtools score.

Options:
  -o, --output-bed-fn TEXT        Output file path  [required]
  -f, --ref-fasta-fn TEXT         Path to the fasta file that reads were
                                  mapped to  [required]
  -a, --annot-bed-fn TEXT         Optional BED file containing annotated
                                  junctions
  -j, --jad-size-threshold INTEGER
                                  JAD to threshold at in the decision tree
  -d, --primary-splice-local-dist INTEGER
                                  Distance to search for alternative
                                  donor/acceptors when calculating primary d/a
  -m, --canonical-motifs TEXT     Intron motifs considered canonical in
                                  organism. Should be four char DNA motifs
                                  separated by vertical bar only
  -w, --lr-window-size INTEGER    Sequence size to extract to train logistic
                                  regression models
  -k, --lr-kfold INTEGER          Number of cross validation k-folds for
                                  logistic regression models
  -lt, --lr-low-confidence-threshold FLOAT
                                  Logistic regression low confidence threshold
                                  for decision tree 2
  -ht, --lr-high-confidence-threshold FLOAT
                                  Logistic regression high confidence
                                  threshold for decision tree 2
  -c, --classifier-type [decision_tree|random_forest]
                                  When annotated juncs are available, train
                                  this classifier type
  --keep-all-annot / --filter-annot
                                  When annotated juncs are available, always
                                  keep all annotated juncs
  -p, --processes INTEGER
  -s, --random-seed INTEGER
  -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or
                                  DEBUG
  --help                          Show this message and exit.
```


### Citing `2passtools`:

The `2passtools` manuscript is published in Genome Biology (Open access):

> Parker, M.T., Knop, K., Barton, G.J. et al. 2passtools: two-pass alignment using machine-learning-filtered splice junctions increases the accuracy of intron detection in long-read RNA sequencing. Genome Biol 22, 72 (2021). https://doi.org/10.1186/s13059-021-02296-0
--------------------------------------------------------------------------------
/lib2pass/main.py:
--------------------------------------------------------------------------------
'''
lib2pass.main: contains the command line interface for 2passtools
'''
import os
import logging

import click
import click_log

import numpy as np
from sklearn.metrics import confusion_matrix
from .bamparse import parse_introns
from .seqlr import predict_splice_junctions_from_seq
from .decisiontree import dt1_pred, dt1_de_novo_pred, dt2_pred, dt2_de_novo_pred
from .merge import get_merged_juncs
from .filter import apply_eval_expression

log = logging.getLogger('2passtools')
click_log.basic_config(log)


def read_annot_juncs_bed(bed_fn):
    annot_introns = set()
    with open(bed_fn) as bed:
        for record in bed:
            chrom, start, end, _, _, strand, *_ = record.split()
            start = int(start)
            end = int(end)
            annot_introns.add((chrom, start, end, strand))
    return annot_introns


def _all_predictions(introns, motifs, lengths, counts,
                     jad_labels, is_primary_donor, is_primary_acceptor,
                     ref_fasta_fn, annot_bed_fn,
                     canonical_motifs, jad_size_threshold,
                     lr_window_size, lr_kfold,
                     lr_low_confidence_threshold,
                     lr_high_confidence_threshold,
                     classifier, keep_all_annot,
                     processes):
    '''
    Takes as input the alignment metrics extracted either from a bam file
    (2passtools score) or a previously created junction bed file (2passtools
    merge). Calculates decision tree score one, then extracts junction
    sequences from the fasta file and calculates decision tree score two.
    '''
    log.info(f'Identified {len(introns):d} introns')
    if annot_bed_fn is None:
        log.info('Applying pretrained filter dt1')
        dt1_labels = dt1_pred(
            motifs, jad_labels, is_primary_donor, is_primary_acceptor,
            motif_regex=canonical_motifs,
            jad_size_threshold=jad_size_threshold,
        )
        log.info(f'{sum(dt1_labels):d} introns pass filter dt1')
        lr_donor_labels, lr_acceptor_labels = predict_splice_junctions_from_seq(
            introns, dt1_labels, ref_fasta_fn,
            lr_window_size, lr_kfold,
            processes
        )
        dt2_labels = dt2_pred(
            jad_labels, is_primary_donor, is_primary_acceptor,
            lr_donor_labels, lr_acceptor_labels,
            lr_low_confidence_threshold, lr_high_confidence_threshold,
            jad_size_threshold=jad_size_threshold
        )
    else:
        log.info(f'Annotated introns file {annot_bed_fn} provided')
        annot_introns = read_annot_juncs_bed(annot_bed_fn)
        log.info(f'Identified {len(annot_introns)} annotated introns')
        # array rather than list, so it can be used for boolean indexing below
        is_annot = np.asarray([i in annot_introns for i in introns])
        dt1_labels = dt1_de_novo_pred(
            motifs, lengths, jad_labels,
            is_primary_donor, is_primary_acceptor,
            is_annot,
            motif_regex=canonical_motifs,
            classifier=classifier
        )
        cm = confusion_matrix(is_annot, dt1_labels)
        log.debug('Decision tree 1 confusion matrix:')
        log.debug(cm)
        lr_donor_labels, lr_acceptor_labels = predict_splice_junctions_from_seq(
            introns, dt1_labels, ref_fasta_fn,
            lr_window_size, lr_kfold,
            processes
        )
        dt2_labels = dt2_de_novo_pred(
            lengths, jad_labels,
            is_primary_donor, is_primary_acceptor,
            lr_donor_labels, lr_acceptor_labels,
            is_annot, classifier=classifier
        )
        cm = confusion_matrix(is_annot, dt2_labels)
        log.debug('Decision tree 2 confusion matrix:')
        log.debug(cm)
    log.info(f'{sum(dt2_labels):d} introns pass filter dt2')
    if annot_bed_fn is not None and keep_all_annot:
        log.info('Adding all annotated introns to results')
        dt1_labels[is_annot] = 1
        dt2_labels[is_annot] = 1
    return (
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        dt1_labels,
        lr_donor_labels, lr_acceptor_labels,
        dt2_labels
    )


def validate_motif_regex(ctx, param, value):
    if not set(value).issubset(set('ACGT|')):
        raise click.BadParameter('unrecognised motifs, use only ACGT and | to separate')
    for m in value.split('|'):
        if not len(m) == 4:
            raise click.BadParameter('all motifs should be 4 nt')
    return value


@click.group()
def main():
    pass


SCORE_MERGE_COMMON_OPTIONS = [
    click.option('-o', '--output-bed-fn', required=True, help='Output file path'),
    click.option('-f', '--ref-fasta-fn', required=True, type=str,
                 help='Path to the fasta file that reads were mapped to'),
    click.option('-a', '--annot-bed-fn', required=False, type=str, default=None,
                 help='Optional BED file containing annotated junctions'),
    click.option('-j', '--jad-size-threshold', default=4,
                 help='JAD to threshold at in the decision tree'),
    click.option('-d', '--primary-splice-local-dist', default=20,
                 help='Distance to search for alternative donor/acceptors when calculating primary d/a'),
    click.option('-m', '--canonical-motifs', default='GTAG|GCAG|ATAG', callback=validate_motif_regex,
                 help=('Intron motifs considered canonical in organism. '
                       'Should be four char DNA motifs separated by vertical bar only')),
    click.option('-w', '--lr-window-size', default=128, type=int,
                 help='Sequence size to extract to train logistic regression models'),
    click.option('-k', '--lr-kfold', default=6, type=int,
                 help='Number of cross validation k-folds for logistic regression models'),
    click.option('-lt', '--lr-low-confidence-threshold', default=0.1, type=float,
                 help='Logistic regression low confidence threshold for decision tree 2'),
    click.option('-ht', '--lr-high-confidence-threshold', default=0.6, type=float,
                 help='Logistic regression high confidence threshold for decision tree 2'),
    click.option('-c', '--classifier-type', default='decision_tree',
                 type=click.Choice(['decision_tree', 'random_forest']),
                 help='When annotated juncs are available, train this classifier type'),
    click.option('--keep-all-annot/--filter-annot', default=True,
                 help='When annotated juncs are available, always keep all annotated juncs'),
]


def _common_options(common_options):
    def _apply_common_options(func):
        for option in reversed(common_options):
            func = option(func)
        return func
    return _apply_common_options


@main.command()
@click.argument('bam-fn', required=True, nargs=1)
@_common_options(SCORE_MERGE_COMMON_OPTIONS)
@click.option('--stranded/--unstranded', default=True,
              help=('Whether input data is stranded or unstranded. '
                    'direct RNA is stranded, cDNA often isn\'t'))
@click.option('-p', '--processes', default=1)
@click.option('-s', '--random-seed', default=None, type=int)
@click_log.simple_verbosity_option(log)
def score(bam_fn, output_bed_fn, ref_fasta_fn, annot_bed_fn,
          jad_size_threshold,
          primary_splice_local_dist, canonical_motifs,
          lr_window_size, lr_kfold,
          lr_low_confidence_threshold, lr_high_confidence_threshold,
          classifier_type, keep_all_annot, stranded, processes, random_seed):
    '''
    2passtools score: A tool for extracting and scoring junctions from a bam
    file aligned with minimap2. Filtered junctions can be used to realign
    reads in a second pass with minimap2.

    Bam file must be mapped with minimap2 and have the long form CS tag, e.g.

    minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq
    '''

    if random_seed is not None:
        np.random.seed(random_seed)

    log.info(f'Parsing BAM file: {bam_fn}')
    (introns, motifs, lengths,
     counts, jad_labels,
     is_primary_donor, is_primary_acceptor) = parse_introns(
        bam_fn,
        primary_splice_local_dist,
        stranded,
        1_000_000, processes
    )
    res = zip(*_all_predictions(
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        ref_fasta_fn, annot_bed_fn,
        canonical_motifs, jad_size_threshold,
        lr_window_size, lr_kfold,
        lr_low_confidence_threshold,
        lr_high_confidence_threshold,
        classifier_type, keep_all_annot, processes
    ))
    log.info(f'Writing results to {output_bed_fn}')
    with open(output_bed_fn, 'w') as bed:
        for i, motif, _, c, jad, pd, pa, d1, lrd, lra, d2 in res:
            chrom, start, end, strand = i
            bed.write(
                f'{chrom:s}\t{start:d}\t{end:d}\t{motif:s}\t{c:d}\t{strand:s}\t'
                f'{jad:d}\t{pd:d}\t{pa:d}\t{d1:d}\t'
                f'{lrd:.3f}\t{lra:.3f}\t{d2:d}\n'
            )


@main.command()
@click.argument('bed-fns', required=True, nargs=-1)
@_common_options(SCORE_MERGE_COMMON_OPTIONS)
@click.option('-p', '--processes', default=1)
@click.option('-s', '--random-seed', default=None, type=int)
@click_log.simple_verbosity_option(log)
def merge(bed_fns, output_bed_fn, ref_fasta_fn, annot_bed_fn,
          jad_size_threshold, primary_splice_local_dist, canonical_motifs,
          lr_window_size, lr_kfold,
          lr_low_confidence_threshold, lr_high_confidence_threshold,
          classifier_type, keep_all_annot, processes, random_seed):
    '''
    2passtools merge: Merges bed files produced by 2passtools score on individual
    replicates and recalculates junction strength metrics. Can be used to create
    a unified junction set to realign reads from different replicates.

    Bed files should be in the 13 column format produced by 2passtools score.
    '''

    if random_seed is not None:
        np.random.seed(random_seed)

    log.info(f'Parsing {len(bed_fns):d} BED files')
    (introns, motifs, lengths,
     counts, jad_labels,
     is_primary_donor, is_primary_acceptor) = get_merged_juncs(
        bed_fns, primary_splice_local_dist
    )
    res = zip(*_all_predictions(
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        ref_fasta_fn, annot_bed_fn,
        canonical_motifs, jad_size_threshold,
        lr_window_size, lr_kfold,
        lr_low_confidence_threshold,
        lr_high_confidence_threshold,
        classifier_type, keep_all_annot, processes
    ))
    log.info(f'Writing results to {output_bed_fn}')
    with open(output_bed_fn, 'w') as bed:
        for i, motif, _, c, jad, pd, pa, d1, lrd, lra, d2 in res:
            chrom, start, end, strand = i
            bed.write(
                f'{chrom:s}\t{start:d}\t{end:d}\t{motif:s}\t{c:d}\t{strand:s}\t'
                f'{jad:d}\t{pd:d}\t{pa:d}\t{d1:d}\t'
                f'{lrd:.3f}\t{lra:.3f}\t{d2:d}\n'
            )


@main.command()
@click.argument('bed-fn', nargs=1, required=True)
@click.option('-o', '--output-bed-fn', required=True)
@click.option('--exprs', required=False, default="decision_tree_2_pred")
@click_log.simple_verbosity_option(log)
def filter(bed_fn, output_bed_fn, exprs):
    '''
    2passtools filter: Convenience tool to filter a junction bed and produce
    6-column bed format which is compatible with minimap2.
    '''
    with open(output_bed_fn, 'w') as bed:
        for chrom, start, end, strand, decision in apply_eval_expression(bed_fn, exprs):
            if decision:
                record = f'{chrom}\t{start}\t{end}\tintron\t0\t{strand}\n'
                bed.write(record)


@main.command()
@click_log.simple_verbosity_option(log)
def mm2pass():
    raise NotImplementedError('TODO: implement convenience tool to wrap '
                              'minimap2 and run two pass alignment')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------