├── lib2pass
│   ├── __init__.py
│   ├── merge.py
│   ├── filter.py
│   ├── minimap2.py
│   ├── seqlr.py
│   ├── fastaparse.py
│   ├── decisiontree.py
│   ├── bamparse.py
│   └── main.py
├── 2passtools.yml
├── setup.py
├── LICENSE
├── .gitignore
└── README.md
--------------------------------------------------------------------------------
/lib2pass/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/2passtools.yml:
--------------------------------------------------------------------------------
name: 2passtools
channels:
  - conda-forge
  - bioconda
dependencies:
  - python=3.6
  - pip
  - numpy
  - click
  - click-log
  - pysam>=0.15
  - ncls
  - scikit-learn>=0.22
  - pip:
    - .
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup


setup(
    name='2passtools',
    version='0.3.1',
    description=(
        'two pass alignment of long noisy reads'
    ),
    author='Matthew Parker',
    entry_points={
        'console_scripts': [
            '2passtools = lib2pass.main:main',
        ]
    },
    packages=[
        'lib2pass',
    ],
    install_requires=[
        'numpy',
        'click',
        'click-log',
        'pysam',
        'ncls',
        'scikit-learn'
    ],
)
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Matthew Parker

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/lib2pass/merge.py:
--------------------------------------------------------------------------------
from .bamparse import merge_intron_res, build_donor_acceptor_ncls, assign_primary


def read_junc_bed(bed_fn):
    motifs = {}
    lengths = {}
    counts = {}
    intron_jads = {}
    with open(bed_fn) as bed:
        for record in bed:
            (chrom, start, end, motif, count, strand,
             jad, *_) = record.split()
            start = int(start)
            end = int(end)
            ln = end - start
            count = int(count)
            jad = int(jad)
            i = (chrom, start, end, strand)
            motifs[i] = motif
            lengths[i] = ln
            counts[i] = count
            intron_jads[i] = jad
    return motifs, lengths, counts, intron_jads


def get_merged_juncs(junc_bed_fns, primary_splice_local_dist=20):

    res = [read_junc_bed(fn) for fn in junc_bed_fns]
    motifs, lengths, counts, intron_jads = merge_intron_res(res)

    introns = list(motifs.keys())
    motifs = [motifs[i] for i in introns]
    lengths = [lengths[i] for i in introns]
    counts = [counts[i] for i in introns]
    jad_label = [intron_jads[i] for i in introns]

    itrees = build_donor_acceptor_ncls(
        introns, counts, jad_label, primary_splice_local_dist
    )
    is_primary_donor = []
    is_primary_acceptor = []
    for i in introns:
        d, a = assign_primary(*i, itrees)
        is_primary_donor.append(d)
        is_primary_acceptor.append(a)

    return (introns, motifs, lengths, counts, jad_label,
            is_primary_donor, is_primary_acceptor)
--------------------------------------------------------------------------------
/lib2pass/filter.py:
--------------------------------------------------------------------------------
import re
import math
from collections import defaultdict
from functools import partial


def nullable(val, cast):
    if val is None:
        return None
    else:
        return cast(val)


def eval_feature_expression(
        record,
        expression):
    (
        motif, count, jad_score,
        is_primary_donor, is_primary_acceptor,
        dt1_pred,
        donor_seq_score, acceptor_seq_score,
        dt2_pred
    ) = record
    safe_dict = {
        'motif': motif,
        'is_GTAG': motif == 'GTAG',
        'is_GCAG': motif == 'GCAG',
        'is_ATAG': motif == 'ATAG',
        'motif_regex_match': lambda expr: bool(re.match(expr, motif)),
        'count': count,
        'jad': jad_score,
        'primary_donor': bool(is_primary_donor),
        'primary_acceptor': bool(is_primary_acceptor),
        'donor_seq_score': donor_seq_score,
        'acceptor_seq_score': acceptor_seq_score,
        'decision_tree_1_pred': bool(dt1_pred),
        'decision_tree_2_pred': bool(dt2_pred),
        'sum': sum,
        'pow': pow,
        'min': min,
        'max': max,
        'math': math,
        'bool': bool,
        'int': partial(nullable, int),
        'str': partial(nullable, str),
        'float': partial(nullable, float),
        'len': partial(nullable, len),
    }
    res = eval(expression, {"__builtins__": None}, safe_dict)
    if not isinstance(res, bool):
        res = bool(res)
    return res


def read_junc_bed(bed_fn):
    records = {}
    with open(bed_fn) as bed:
        for record in bed:
            # columns 11 and 12 are the donor and acceptor LR scores,
            # in that order
            (chrom, start, end, motif, count, strand,
             jad, is_pd, is_pa, dt1, lrd, lra, dt2) = record.split()
            records[(chrom, start, end, strand)] = (
                motif, int(count),
                int(jad), int(is_pd), int(is_pa),
                int(dt1), float(lrd), float(lra), int(dt2)
            )
    return records


def apply_eval_expression(bed_fn, expression):
    for (chrom, start, end, strand), scores in read_junc_bed(bed_fn).items():
        r = eval_feature_expression(
            scores, expression
        )
        yield chrom, start, end, strand, r
--------------------------------------------------------------------------------
/lib2pass/minimap2.py:
--------------------------------------------------------------------------------
import os
import subprocess
import tempfile

MINIMAP2 = os.path.abspath(
    os.path.split(__file__)[0] +
    '/../external/minimap2/minimap2'
)


def subprocess_command(cmd, stdout_fn):
    with open(stdout_fn, 'w') as s:
        proc = subprocess.Popen(
            cmd,
            stdout=s,
            stderr=subprocess.PIPE
        )
        _, stderr = proc.communicate()
        if proc.returncode:
            raise subprocess.CalledProcessError(
                proc.returncode, cmd, stderr=stderr.decode()
            )
        else:
            return stderr.decode()


def map_with_minimap2(fastq_fn, reference_fn, output_fn, threads=1,
                      use_canon=False, noncanon_pen=9,
                      junc_bed=None, junc_bonus=9):
    if not os.path.exists(fastq_fn):
        raise OSError('fastq_fn not found')
    elif not os.path.exists(reference_fn):
        raise OSError('reference_fn not found')
    splice_flank = 'yes' if use_canon else 'no'
    noncanon_pen = noncanon_pen if use_canon else 0
    use_canon = 'f' if use_canon else 'n'
    s_handle, sam_fn = tempfile.mkstemp(suffix='.sam')
    b_handle, bam_fn = tempfile.mkstemp(suffix='.bam')

    # run minimap
    minimap2_cmd = [
        MINIMAP2, f'-t{threads}', '-k14', '-w5', '--splice',
        '-g2000', '-G10000', '-A1', '-B2', '-O2,32', '-E1,0',
        f'-C{noncanon_pen}', f'--splice-flank={splice_flank}', f'-u{use_canon}',
        '-z200', '-L', '--cs=long', '-a'
    ]
    if junc_bed is not None:
        minimap2_cmd += ['--junc-bed', junc_bed, f'--junc-bonus={junc_bonus}']
    minimap2_cmd += [reference_fn, fastq_fn]
    minimap2_stderr = subprocess_command(minimap2_cmd, sam_fn)

    # run samtools view
    samtools_view_cmd = ['samtools', 'view', '-bS', sam_fn]
    samtools_view_stderr = subprocess_command(samtools_view_cmd, bam_fn)

    # clean up minimap2 output
    os.close(s_handle)
    os.remove(sam_fn)

    # run samtools sort
    samtools_sort_cmd = ['samtools', 'sort', '-@', str(threads), '-o', '-', bam_fn]
    samtools_sort_stderr = subprocess_command(samtools_sort_cmd, output_fn)

    # clean up samtools view output
    os.close(b_handle)
    os.remove(bam_fn)

    # run samtools index
    samtools_index_cmd = ['samtools', 'index', output_fn]
    subprocess.check_call(samtools_index_cmd)
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/
--------------------------------------------------------------------------------
/lib2pass/seqlr.py:
--------------------------------------------------------------------------------
from collections import defaultdict
import random
import logging
import itertools as it

import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

from joblib import Parallel, delayed

from .fastaparse import get_junction_seqs


log = logging.getLogger('2passtools')


# dummy encoding: 'T' is represented implicitly as the all-zero vector
SEQ_OHE = {'A': [1, 0, 0],
           'C': [0, 1, 0],
           'G': [0, 0, 1],
           'T': [0, 0, 0]}


def one_hot_sequence(seq):
    ohe = []
    for base in seq:
        try:
            ohe.append(SEQ_OHE[base])
        except KeyError:
            # ambiguous bases are replaced with a random base
            ohe.append(SEQ_OHE[random.choice('ACGT')])
    return np.array(ohe)


def train_and_predict(X_train, y_train, X_test):
    lr = LogisticRegression(
        solver='lbfgs', penalty='l2',
        max_iter=500, n_jobs=1)
    lr.fit(X_train, y_train)
    return lr.predict_proba(X_test)[:, 1]


def kfold_oob_prediction(X_data, y_data, n_splits, processes=1):
    kf = KFold(n_splits=n_splits, shuffle=True)
    kf_idx = list(kf.split(X_data))
    with Parallel(n_jobs=min(n_splits, processes)) as pool:
        preds = pool(
            delayed(train_and_predict)(X_data[train_idx],
                                       y_data[train_idx],
                                       X_data[test_idx])
            for train_idx, test_idx in kf_idx
        )
    test_idx = [tst for trn, tst in kf_idx]
    test_idx = np.concatenate(test_idx)
    preds = np.concatenate(preds)
    return preds[np.argsort(test_idx)]


def predict_splice_junctions_from_seq(introns, labels, fasta_fn, window_size,
                                      n_splits, processes):
    log.info(f'Fetching junction sequences from {fasta_fn}')
    (donors, donor_seqs, donor_labels,
     acceptors, acceptor_seqs, acceptor_labels) = get_junction_seqs(
        introns, labels, fasta_fn, window_size, processes
    )
    log.info(f'Identified {len(donors):d} unique donors and {len(acceptors):d} unique acceptors')
    donor_seq_ohe = np.array([one_hot_sequence(seq).ravel() for seq in donor_seqs])
    donor_labels = np.array(donor_labels)
    log.info('Scoring donor sequences with LR...')
    donor_preds = kfold_oob_prediction(
        donor_seq_ohe, donor_labels, n_splits, processes
    )
    donor_preds = {k: v for k, v in zip(donors, donor_preds)}
    acceptor_seq_ohe = np.array([one_hot_sequence(seq).ravel() for seq in acceptor_seqs])
    acceptor_labels = np.array(acceptor_labels)
    log.info('Scoring acceptor sequences with LR...')
    acceptor_preds = kfold_oob_prediction(
        acceptor_seq_ohe, acceptor_labels, n_splits, processes
    )
    acceptor_preds = {k: v for k, v in zip(acceptors, acceptor_preds)}
    donor_preds, acceptor_preds = get_donor_acceptor_preds_for_introns(
        introns, donor_preds, acceptor_preds
    )
    return donor_preds, acceptor_preds


def get_donor_acceptor_preds_for_introns(introns, donor_preds, acceptor_preds):
    intron_donor_preds = []
    intron_acceptor_preds = []
    for chrom, start, end, strand in introns:
        if strand == '+':
            donor_pos = start
            acceptor_pos = end
        else:
            donor_pos = end
            acceptor_pos = start
        intron_donor_preds.append(donor_preds[(chrom, donor_pos, strand)])
        intron_acceptor_preds.append(acceptor_preds[(chrom, acceptor_pos, strand)])
    return intron_donor_preds, intron_acceptor_preds
--------------------------------------------------------------------------------
/lib2pass/fastaparse.py:
--------------------------------------------------------------------------------
from collections import defaultdict

import numpy as np

import pysam
from joblib import Parallel, delayed


RC = str.maketrans('ACGT', 'TGCA')


def rev_comp(seq):
    return seq.translate(RC)[::-1]


def fetch_padded(fasta, chrom, pos, w):
    clen = fasta.get_reference_length(chrom)
    left = pos - w
    right = pos + w
    if left < 0:
        lpad = abs(left)
        left = 0
    else:
        lpad = 0
    if right > clen:
        rpad = right - clen
        right = clen
    else:
        rpad = 0
    seq = fasta.fetch(chrom, left, right)
    if lpad:
        seq = 'N' * lpad + seq
    if rpad:
        seq = seq + 'N' * rpad
    return seq


def _get_junc_seqs(bed_records, fasta_fn, window_size):
    intron_donor_labels = defaultdict(lambda: 0)
    intron_acceptor_labels = defaultdict(lambda: 0)
    intron_donor_seqs = {}
    intron_acceptor_seqs = {}
    w = window_size // 2
    with pysam.FastaFile(fasta_fn) as fasta:
        for chrom, start, end, strand, label in bed_records:
            donor_seq = fetch_padded(fasta, chrom, start, w)
            acceptor_seq = fetch_padded(fasta, chrom, end, w)
            if strand == '-':
                donor_seq, acceptor_seq = (
                    rev_comp(acceptor_seq), rev_comp(donor_seq)
                )
                donor_pos = end
                acceptor_pos = start
            else:
                donor_pos = start
                acceptor_pos = end

            intron_donor_seqs[(chrom, donor_pos, strand)] = donor_seq
            intron_donor_labels[(chrom, donor_pos, strand)] |= label
            intron_acceptor_seqs[(chrom, acceptor_pos, strand)] = acceptor_seq
            intron_acceptor_labels[(chrom, acceptor_pos, strand)] |= label

    return (intron_donor_seqs, intron_donor_labels,
            intron_acceptor_seqs, intron_acceptor_labels)


def chunk_records(introns, labels, processes):
    records = []
    for (chrom, start, end, strand), lab in zip(introns, labels):
        records.append((chrom, start, end, strand, lab))
    nrecords = len(records)
    n, r = divmod(nrecords, processes)
    split_points = ([0] + r * [n + 1] + (processes - r) * [n])
    split_points = np.cumsum(split_points)
    for i in range(processes):
        start = split_points[i]
        end = split_points[i + 1]
        yield records[start: end]


def or_update(d1, d2):
    for k, v in d2.items():
        d1[k] |= v
    return d1


def merge_parallel_junc_res(res):
    donor_seqs = {}
    donor_labels = defaultdict(lambda: 0)
    acceptor_seqs = {}
    acceptor_labels = defaultdict(lambda: 0)

    for ds, dl, as_, al in res:
        donor_seqs.update(ds)
        acceptor_seqs.update(as_)
        donor_labels = or_update(donor_labels, dl)
        acceptor_labels = or_update(acceptor_labels, al)

    donors = list(donor_seqs.keys())
    donor_seqs = [donor_seqs[d] for d in donors]
    donor_labels = [donor_labels[d] for d in donors]

    acceptors = list(acceptor_seqs.keys())
    acceptor_seqs = [acceptor_seqs[a] for a in acceptors]
    acceptor_labels = [acceptor_labels[a] for a in acceptors]

    return (donors, donor_seqs, donor_labels,
            acceptors, acceptor_seqs, acceptor_labels)


def get_junction_seqs(introns, labels, fasta_fn, window_size, processes=12):
    with Parallel(n_jobs=processes) as pool:
        res = pool(
            delayed(_get_junc_seqs)(introns, fasta_fn, window_size)
            for introns in chunk_records(introns, labels, processes)
        )

    (donors, donor_seqs, donor_labels,
     acceptors, acceptor_seqs, acceptor_labels) = merge_parallel_junc_res(res)

    return (donors, donor_seqs, donor_labels,
            acceptors, acceptor_seqs, acceptor_labels)
--------------------------------------------------------------------------------
/lib2pass/decisiontree.py:
--------------------------------------------------------------------------------
import logging
from operator import itemgetter
import re
import numpy as np

from sklearn.preprocessing import quantile_transform
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.ensemble import ExtraTreesClassifier


log = logging.getLogger('2passtools')


DT1_DENOVO_FEATURES = [
    'is_canonical_motif', 'jad',
    'is_primary_donor', 'is_primary_acceptor',
    'intron_length_quantile',
]

DT2_DENOVO_FEATURES = [
    'jad', 'is_primary_donor', 'is_primary_acceptor',
    'intron_length_quantile',
    'donor_lr_score', 'acceptor_lr_score',
]


def format_feature_importances(feature_names, feature_importances, width=10):
    max_size = max(feature_importances)
    point_size = max_size / width
    pad_to = max([len(x) for x in feature_names])
    feature_importances = {fn: fi for fn, fi in zip(feature_names,
                                                    feature_importances)}
    feature_importances = sorted(feature_importances.items(), key=itemgetter(1), reverse=True)
    fmt = ''
    for fn, fi in feature_importances:
        rpad = ' ' * (pad_to - len(fn))
        fn += rpad
        bar = '*' * int(round(fi / point_size))
        fmt += f'{fn} {bar} {fi:.1f}\n'
    return fmt


def _de_novo_pred(X, y, feature_names, classifier='decision_tree'):
    if classifier == 'random_forest':
        log.info('Using extremely randomised trees classifier')
        clf = ExtraTreesClassifier(n_estimators=250, bootstrap=True, oob_score=True)
        clf.fit(X, y)
        log.debug('Feature importance:')
        log.debug(format_feature_importances(feature_names, clf.feature_importances_))
        pred = clf.oob_decision_function_[:, 1]
        # in the unlikely event dt1_pred contains NaNs
        # (can happen when n_estimators is not big enough)
        pred[np.isnan(pred)] = 0
        pred = pred >= 0.5

    else:
        clf = DecisionTreeClassifier(
            max_depth=5,
            min_samples_split=100,
            min_impurity_decrease=0.005,
        )
        clf.fit(X, y)
        log.debug('Tree structure:')
        log.debug(export_text(clf, feature_names=feature_names))
        pred = clf.predict(X)
    return pred.astype(int)


def dt1_pred(intron_motif, jad_labels, is_primary_donor, is_primary_acceptor,
             motif_regex='GTAG|GCAG|ATAG', jad_size_threshold=4):
    motif_regex = re.compile(motif_regex)
    is_canon = np.asarray([bool(motif_regex.match(m)) for m in intron_motif])

    jad_labels = np.asarray(jad_labels) >= jad_size_threshold

    is_primary_donor = np.asarray(is_primary_donor, dtype=bool)
    is_primary_acceptor = np.asarray(is_primary_acceptor, dtype=bool)

    is_primary = is_primary_donor & is_primary_acceptor
    return (jad_labels & is_canon) | (is_primary & is_canon)


def dt1_de_novo_pred(intron_motif, intron_lengths,
                     jad_labels, is_primary_donor, is_primary_acceptor,
                     is_annot, motif_regex='GTAG|GCAG|ATAG',
                     classifier='decision_tree'):
    motif_regex = re.compile(motif_regex)
    is_canon = np.asarray([int(bool(motif_regex.match(m))) for m in intron_motif])

    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)
    ).ravel()

    X = np.stack(
        [
            is_canon, jad_labels,
            is_primary_donor, is_primary_acceptor,
            intron_length_quantile
        ],
        axis=1
    )
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT1_DENOVO_FEATURES, classifier=classifier)
    return pred


def dt2_pred(jad_labels,
             is_primary_donor,
             is_primary_acceptor,
             donor_lr_score,
             acceptor_lr_score,
             low_conf_thresh=0.1,
             high_conf_thresh=0.6,
             jad_size_threshold=4):

    jad_labels = np.asarray(jad_labels) >= jad_size_threshold
    is_primary_donor = np.asarray(is_primary_donor, dtype=bool)
    is_primary_acceptor = np.asarray(is_primary_acceptor, dtype=bool)
    donor_lr_score = np.asarray(donor_lr_score, dtype=np.float64)
    acceptor_lr_score = np.asarray(acceptor_lr_score, dtype=np.float64)

    is_primary = is_primary_donor & is_primary_acceptor

    seq_low_conf = ((donor_lr_score >= low_conf_thresh) &
                    (acceptor_lr_score >= low_conf_thresh))
    seq_high_conf = ((donor_lr_score >= high_conf_thresh) &
                     (acceptor_lr_score >= high_conf_thresh))

    return (jad_labels & seq_low_conf) | (is_primary & seq_high_conf)


def dt2_de_novo_pred(intron_lengths, jad_labels,
                     is_primary_donor, is_primary_acceptor,
                     donor_lr_score, acceptor_lr_score,
                     is_annot, classifier='decision_tree'):
    jad_labels = np.asarray(jad_labels)

    is_primary_donor = np.asarray(is_primary_donor)
    is_primary_acceptor = np.asarray(is_primary_acceptor)

    intron_length_quantile = quantile_transform(
        np.asarray(intron_lengths).reshape(-1, 1)
    ).ravel()

    X = np.stack(
        [
            jad_labels, is_primary_donor, is_primary_acceptor,
            intron_length_quantile, donor_lr_score, acceptor_lr_score,
        ],
        axis=1
    )
    y = np.asarray(is_annot, dtype=int)
    pred = _de_novo_pred(X, y, DT2_DENOVO_FEATURES, classifier=classifier)
    return pred
--------------------------------------------------------------------------------
/lib2pass/bamparse.py:
--------------------------------------------------------------------------------
from collections import defaultdict, Counter
import re

import numpy as np
import pysam
from joblib import Parallel, delayed
from ncls import NCLS


CS_SPLITTER = '([-+*~=:])'
RC = str.maketrans('ACGTN', 'TGCAN')


def parse_cs_tag(cs_tag):
    '''
    generalisable function for parsing minimap2 cs tag (long and short form)
    '''
    cs_tag = re.split(CS_SPLITTER, cs_tag)[1:]
    cs_ops = cs_tag[::2]
    cs_info = cs_tag[1::2]
    cs_parsed = []
    for op, info in zip(cs_ops, cs_info):
        if op == '=':
            # long form match
            cs_parsed.append(('=', len(info), info))
        elif op == ':':
            # short form match
            cs_parsed.append(('=', int(info), None))
        elif op == '*':
            # mismatch
            ref = info[0]
            alt = info[1]
            cs_parsed.append((op, 1, (ref, alt)))
        elif op == '+' or op == '-':
            cs_parsed.append((op, len(info), info))
        elif op == '~':
            donor_motif, intron_length, acceptor_motif = re.match(
                '^([acgtn]{2})([0-9]+)([acgtn]{2})', info).groups()
            motif = (donor_motif + acceptor_motif).upper()
            intron_length = int(intron_length)
            cs_parsed.append((op, intron_length, motif))
    return cs_parsed


def get_junction_overhang_size(overhang_cs):
    '''
    for cs tag split at intron (and reoriented so nearest op to intron is first)
    returns the overhang size (number of nt which match before
    first mismatch, insertion or deletion)
    '''
    try:
        return overhang_cs[0][1] if overhang_cs[0][0] == '=' else 0
    except IndexError:
        # sometimes when junctions are provided minimap2 can produce alignments
        # where an annotated junction is used with no overhang on the other
        # side!!
        return 0


def infer_strand_from_intron_motifs(intron_motifs, read_strand):
    strand_counts = Counter()
    for motif in intron_motifs:
        if re.match('G[TC]AG', motif):
            strand_counts['+'] += 1
        elif re.match('CT[AG]C', motif):
            strand_counts['-'] += 1
        else:
            strand_counts['.'] += 1

    if strand_counts['+'] == strand_counts['-']:
        return read_strand
    elif strand_counts['+'] > strand_counts['-']:
        return '+'
    else:
        return '-'


def find_introns(aln, stranded=True):
    '''
    use the cs tag to find introns and their match overhangs in the alignment
    '''
    introns = []
    intron_motifs = []
    chrom = aln.reference_name
    start = aln.reference_start
    end = aln.reference_end
    read_strand = '+-'[aln.is_reverse]
    pos = start
    cs_tag = parse_cs_tag(aln.get_tag('cs'))
    for i, (op, ln, info) in enumerate(cs_tag):
        if op == '+':
            # insertion does not consume reference
            continue
        elif op in ('=', '*', '-'):
            # match, mismatch, deletion consume reference
            pos += ln
        elif op == '~':
            # intron consumes reference and is recorded
            left = pos
            right = left + ln
            left_tag = cs_tag[:i][::-1]
            right_tag = cs_tag[i + 1:]
            junc_overhang = min(
                get_junction_overhang_size(left_tag),
                get_junction_overhang_size(right_tag)
            )
            # info is intron motif
            introns.append([left, right, junc_overhang, ln, info])
            intron_motifs.append(info)
            pos = right

    # infer strand and yield introns
    if stranded:
        strand = read_strand
    else:
        strand = infer_strand_from_intron_motifs(intron_motifs, read_strand)

    for start, end, overhang, length, motif in introns:
        if strand == '-':
            motif = motif.translate(RC)[::-1]
        yield chrom, start, end, strand, motif, overhang, length


def build_donor_acceptor_ncls(introns, intron_counts, intron_jads, dist=20):
    donor_invs = defaultdict(Counter)
    acceptor_invs = defaultdict(Counter)
    donor_inv_jads = defaultdict(Counter)
    acceptor_inv_jads = defaultdict(Counter)

    for (chrom, start, end, strand), count, jad in zip(introns, intron_counts, intron_jads):
        if strand == '+':
            donor_inv = (start - dist, start + dist, start)
            acceptor_inv = (end - dist, end + dist, end)
        else:
            donor_inv = (end - dist, end + dist, end)
            acceptor_inv = (start - dist, start + dist, start)
        donor_invs[(chrom, strand)][donor_inv] += count
        acceptor_invs[(chrom, strand)][acceptor_inv] += count

        # jad is used to break count ties
        donor_inv_jads[(chrom, strand)][donor_inv] = max(
            donor_inv_jads[(chrom, strand)][donor_inv], jad
        )
        acceptor_inv_jads[(chrom, strand)][acceptor_inv] = max(
            acceptor_inv_jads[(chrom, strand)][acceptor_inv], jad
        )

    da_itree = {}
    for label, invs, inv_jads in zip(['donor', 'acceptor'],
                                     [donor_invs, acceptor_invs],
                                     [donor_inv_jads, acceptor_inv_jads]):
        da_itree[label] = {}
        for chrom, pos in invs.items():
            jads = [inv_jads[chrom][i] for i in pos]
            starts, ends, mids, counts = zip(*[(s, e, m, c) for (s, e, m), c in pos.items()])
            starts = np.array(starts, dtype=np.int64)
            ends = np.array(ends, dtype=np.int64)
            idx = np.array(mids, dtype=np.int64)
            counts = {i: (c, j) for i, c, j in zip(mids, counts, jads)}
            itree = NCLS(starts, ends, idx)
            da_itree[label][chrom] = (itree, counts)
    return da_itree


def assign_primary(chrom, start, end, strand, inv_trees):
    donor_pos = start if strand == '+' else end
    acceptor_pos = end if strand == '+' else start

    is_primary = {}
    for label, pos in zip(['donor', 'acceptor'],
                          [donor_pos, acceptor_pos]):
        itree, counts = inv_trees[label][(chrom, strand)]
        max_count = 0
        max_jad = 0
        for _, _, ov_pos in itree.find_overlap(pos, pos):
            if ov_pos != pos:
                c, j = counts[ov_pos]
                max_count = max(max_count, c)
                max_jad = max(max_jad, j)
        if max_count < counts[pos][0]:
            is_primary[label] = True
        elif (max_count == counts[pos][0]) & (max_jad < counts[pos][1]):
            # break count ties with jad
            is_primary[label] = True
        else:
            is_primary[label] = False
    return is_primary['donor'], is_primary['acceptor']


def fetch_introns_for_interval(bam_fn, chrom, start, end, stranded):
    motifs = {}
    lengths = {}
    counts = Counter()
    intron_jads = Counter()
    with pysam.AlignmentFile(bam_fn) as bam:
        for aln in bam.fetch(chrom, start, end):
            # to prevent double counting of introns, ignore alns
            # which start before beginning of specified interval
            if aln.reference_start < start:
                continue
            for *i, m, ov, ln in find_introns(aln, stranded):
                i = tuple(i)
                motifs[i] = m
                lengths[i] = ln
                counts[i] += 1
                intron_jads[i] = max(intron_jads[i], ov)
    return motifs, lengths, counts, intron_jads


def get_bam_intervals(bam_fn, batch_size):
    with pysam.AlignmentFile(bam_fn) as bam:
        references = {ref: ln for ref, ln in zip(bam.references, bam.lengths)}
        for ref, ref_len in references.items():
            for i in range(0, ref_len, batch_size):
                query = (ref, i, min(ref_len, i + batch_size))
                yield query


def merge_intron_res(res):
    motifs = {}
    lengths = {}
    counts = Counter()
    intron_jads = Counter()
    for m, l, c, j in res:
        motifs.update(m)
        lengths.update(l)
        counts += c
        for i, jad in j.items():
            intron_jads[i] = max(intron_jads[i], jad)
    return motifs, lengths, counts, intron_jads


def parse_introns(bam_fn, primary_splice_local_dist,
                  stranded, batch_size, processes):
    '''
    find all introns in the dataset, label them as positive or negative
    training examples using the simple jad filter and then extract their
    sequences from the reference for training the logistic regression models
    '''
    with Parallel(n_jobs=processes) as pool:
        res = pool(
            delayed(fetch_introns_for_interval)(
                bam_fn, *inv, stranded)
            for inv in get_bam_intervals(bam_fn, batch_size)
        )
    motifs, lengths, counts, intron_jads = merge_intron_res(res)

    introns = list(motifs.keys())
    motifs = [motifs[i] for i in introns]
    lengths = [lengths[i] for i in introns]
    counts = [counts[i] for i in introns]
    jad_label = [intron_jads[i] for i in introns]

    itrees = build_donor_acceptor_ncls(
        introns, counts, jad_label, primary_splice_local_dist
    )
    is_primary_donor = []
    is_primary_acceptor = []
    for i in introns:
        d, a = assign_primary(*i, itrees)
        is_primary_donor.append(d)
        is_primary_acceptor.append(a)

    return (introns, motifs, lengths, counts, jad_label,
            is_primary_donor, is_primary_acceptor)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 2passtools

[![DOI](https://zenodo.org/badge/242980365.svg)](https://zenodo.org/badge/latestdoi/242980365)

A package for filtering splice junctions extracted from noisy long read alignments generated using minimap2. These can then be used to perform second pass alignment with minimap2, feeding in the junctions using the `--junc-bed` flag.

## Installation:

2passtools has been tested with python 3.6, and requires `numpy`, `scikit-learn`, `pysam`, `NCLS` and `click`. The easiest way to install it is using the conda environment yaml provided:

```
git clone https://www.github.com/bartongroup/2passtools
cd 2passtools
conda env create -f 2passtools.yml

source activate 2passtools
```

Alternatively, 2passtools and the required packages can be installed using pip:

```
pip install git+https://github.com/bartongroup/2passtools.git
```


## Use:

2passtools has three commands: `score`, `filter` and `merge`.

NB: There is a [snakemake](https://www.github.com/bartongroup/two_pass_alignment_pipeline) pipeline which can be used to run the benchmarking scripts used in the manuscript.

### `score`:

The `2passtools score` command requires as input a long read sequencing bam file aligned using minimap2 and a reference fasta file. It then extracts junction metrics and sequence information and uses them to score splice junctions found in the alignments. The output of `score` is a BED file with multiple columns corresponding to different metrics and model scores (see output below). This format cannot be passed to minimap2 directly as (A) it has not yet been filtered and (B) the extra column format is not supported by minimap2, which requires 6-column bed. Filtering and reformatting can be done using `2passtools filter`.

If you already have a reference annotation but want to discover novel splice junctions, consider using the **annotation-aided mode** of `2passtools score`. It takes an additional input: a bed file containing high-confidence splice junctions from an existing reference annotation. `2passtools` will use these as positive examples to train *de novo* models to detect novel splice junctions. It works best if the existing annotation is relatively complete but there are significant numbers of novel splice junctions: if the annotation is too incomplete, it is better to run 2passtools using the pre-trained model, and if there are very few novel splice junctions, it is better just to do reference-guided alignment (without `2passtools`). There are experiments which might help guide your decision in the [Genome Biology paper](https://doi.org/10.1186/s13059-021-02296-0).
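For example, assuming reads in `reads.fq` and a reference in `ref.fa` (placeholder filenames), a first pass alignment followed by junction scoring might look something like this:

```
# first pass alignment, keeping the long-form cs tag that 2passtools requires
minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq | samtools sort -o aligned.bam -
samtools index aligned.bam

# extract and score junctions from the alignment
2passtools score -o scored_juncs.bed -f ref.fa -p 4 aligned.bam

# annotation-aided mode: additionally pass high-confidence annotated junctions
# 2passtools score -o scored_juncs.bed -f ref.fa -a annot_juncs.bed aligned.bam
```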
#### Options:

```
$ 2passtools score --help
Usage: 2passtools score [OPTIONS] BAM_FN

  2passtools score: A tool for extracting and scoring junctions from a bam
  file aligned with minimap2. Filtered junctions can be used to realign
  reads in a second pass with minimap2.

  Bam file must be mapped with minimap2 and have the long form CS tag, e.g.

  minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq

Options:
  -o, --output-bed-fn TEXT        Output file path  [required]
  -f, --ref-fasta-fn TEXT         Path to the fasta file that reads were
                                  mapped to  [required]
  -a, --annot-bed-fn TEXT         Optional BED file containing annotated
                                  junctions
  -j, --jad-size-threshold INTEGER
                                  JAD to threshold at in the decision tree
  -d, --primary-splice-local-dist INTEGER
                                  Distance to search for alternative
                                  donor/acceptors when calculating primary d/a
  -m, --canonical-motifs TEXT     Intron motifs considered canonical in
                                  organism. Should be four char DNA motifs
                                  separated by vertical bar only
  -w, --lr-window-size INTEGER    Sequence size to extract to train logistic
                                  regression models
  -k, --lr-kfold INTEGER          Number of cross validation k-folds for
                                  logistic regression models
  -lt, --lr-low-confidence-threshold FLOAT
                                  Logistic regression low confidence threshold
                                  for decision tree 2
  -ht, --lr-high-confidence-threshold FLOAT
                                  Logistic regression high confidence
                                  threshold for decision tree 2
  -c, --classifier-type [decision_tree|random_forest]
                                  When annotated juncs are available, train
                                  this classifier type
  --keep-all-annot / --filter-annot
                                  When annotated juncs are available, always
                                  keep all annotated juncs
  --stranded / --unstranded       Whether input data is stranded or
                                  unstranded. direct RNA is stranded, cDNA
                                  often isn't
  -p, --processes INTEGER
  -s, --random-seed INTEGER
  -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or
                                  DEBUG
  --help                          Show this message and exit.
```

#### Output:

A 13-column BED file format with the following values:

```
1.  chrom (string)
2.  start (integer)
3.  end (integer)
4.  intron-motif (four char string)
5.  supporting read count (integer)
6.  strand (string, either '+' or '-')
7.  junction alignment distance metric (integer)
8.  primary donor metric (integer, either 0 or 1)
9.  primary acceptor metric (integer, either 0 or 1)
10. decision tree 1 output (integer, either 0 or 1)
11. logistic regression model donor score (float)
12. logistic regression model acceptor score (float)
13. decision tree 2 output (integer, either 0 or 1)
```

### `filter`:

The `2passtools filter` command can be used to filter the 13-column bed file using any expression utilising the metrics or model outputs. The expression should be a valid python expression which evaluates to `True` or `False` for each junction, and can use any of the following safe variables and functions:

* `motif`: The intron motif in ACGTN alphabet (`str`),
* `is_GTAG`: The intron motif is GU/AG (`bool`),
* `is_GCAG`: The intron motif is GC/AG (`bool`),
* `is_ATAG`: The intron motif is AU/AG (`bool`),
* `motif_regex_match`: safe function allowing regex matching of motif, e.g. `motif_regex_match("G[CT]AG")` (`func`),
* `count`: The supporting read count (`int`),
* `jad`: The junction alignment distance metric (`int`),
* `primary_donor`: The primary donor metric (`bool`),
* `primary_acceptor`: The primary acceptor metric (`bool`),
* `donor_seq_score`: The logistic regression model donor score (`float`),
* `acceptor_seq_score`: The logistic regression model acceptor score (`float`),
* `decision_tree_1_pred`: Decision tree model 1 output (`bool`),
* `decision_tree_2_pred`: Decision tree model 2 output (`bool`),
* `sum`, `pow`, `min`, `max`, `len`: python functions,
* `math`: The python `math` module, any function from it is usable,
* `bool`, `int`, `str`, `float`: python functions.

For example:

* `2passtools filter --exprs 'jad > 3'` filters for junction alignment distance of 4 nt or more.
* `2passtools filter --exprs 'decision_tree_2_pred'` filters for junctions that pass the second decision tree model.

etc. A complete invocation is shown below.
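Reusing the placeholder filenames from the `score` example above, a typical filter plus second pass might look like:

```
# keep junctions passing the second decision tree and write 6-column bed
2passtools filter -o filtered_juncs.bed --exprs 'decision_tree_2_pred' scored_juncs.bed

# second pass alignment, feeding the filtered junctions back to minimap2
minimap2 -a --cs=long -k14 -x splice --junc-bed filtered_juncs.bed ref.fa reads.fq | samtools sort -o aligned_pass2.bam -
```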
#### Usage:

```
$ 2passtools filter --help
Usage: 2passtools filter [OPTIONS] BED_FN

  2passtools filter: Convenience tool to filter a junction bed and produce
  6-column bed format which is compatible with minimap2.

Options:
  -o, --output-bed-fn TEXT  [required]
  --exprs TEXT
  -v, --verbosity LVL       Either CRITICAL, ERROR, WARNING, INFO or DEBUG
  --help                    Show this message and exit.
```

### `merge`:

The `2passtools merge` command is similar to `score`, but takes multiple 13-column bed files produced by `score` and merges them, recalculating metrics and model stats, to produce a unified junction set. This is useful for making sure all replicates are aligned similarly, and often alignment is improved by borrowing power across replicates. Output is in the same 13-column BED format as `score`.
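For example, with scored junction beds from three replicates (placeholder filenames), a merged junction set could be produced with:

```
2passtools merge -o merged_juncs.bed -f ref.fa \
    scored_rep1.bed scored_rep2.bed scored_rep3.bed
```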
#### Usage:

```
$ 2passtools merge --help
Usage: 2passtools merge [OPTIONS] BED_FNS...

  2passtools merge: Merges bed files produced by 2passtools score on
  individual replicates and recalculates junction strength metrics. Can be
  used to create a unified junction set to realign reads from different
  replicates.

  Bed files should be in the 13 column format produced by 2passtools score.

Options:
  -o, --output-bed-fn TEXT        Output file path  [required]
  -f, --ref-fasta-fn TEXT         Path to the fasta file that reads were
                                  mapped to  [required]
  -a, --annot-bed-fn TEXT         Optional BED file containing annotated
                                  junctions
  -j, --jad-size-threshold INTEGER
                                  JAD to threshold at in the decision tree
  -d, --primary-splice-local-dist INTEGER
                                  Distance to search for alternative
                                  donor/acceptors when calculating primary d/a
  -m, --canonical-motifs TEXT     Intron motifs considered canonical in
                                  organism. Should be four char DNA motifs
                                  separated by vertical bar only
  -w, --lr-window-size INTEGER    Sequence size to extract to train logistic
                                  regression models
  -k, --lr-kfold INTEGER          Number of cross validation k-folds for
                                  logistic regression models
  -lt, --lr-low-confidence-threshold FLOAT
                                  Logistic regression low confidence threshold
                                  for decision tree 2
  -ht, --lr-high-confidence-threshold FLOAT
                                  Logistic regression high confidence
                                  threshold for decision tree 2
  -c, --classifier-type [decision_tree|random_forest]
                                  When annotated juncs are available, train
                                  this classifier type
  --keep-all-annot / --filter-annot
                                  When annotated juncs are available, always
                                  keep all annotated juncs
  -p, --processes INTEGER
  -s, --random-seed INTEGER
  -v, --verbosity LVL             Either CRITICAL, ERROR, WARNING, INFO or
                                  DEBUG
  --help                          Show this message and exit.
```


### Citing `2passtools`:

The `2passtools` manuscript is published in Genome Biology (Open access):

> Parker, M.T., Knop, K., Barton, G.J. et al. 2passtools: two-pass alignment using machine-learning-filtered splice junctions increases the accuracy of intron detection in long-read RNA sequencing. Genome Biol 22, 72 (2021). https://doi.org/10.1186/s13059-021-02296-0
--------------------------------------------------------------------------------
/lib2pass/main.py:
--------------------------------------------------------------------------------
'''
lib2pass.main: contains the command line interface for 2passtools
'''
import os
import logging

import click
import click_log

import numpy as np
from sklearn.metrics import confusion_matrix
from .bamparse import parse_introns
from .seqlr import predict_splice_junctions_from_seq
from .decisiontree import dt1_pred, dt1_de_novo_pred, dt2_pred, dt2_de_novo_pred
from .merge import get_merged_juncs
from .filter import apply_eval_expression

log = logging.getLogger('2passtools')
click_log.basic_config(log)


def read_annot_juncs_bed(bed_fn):
    annot_introns = set()
    with open(bed_fn) as bed:
        for record in bed:
            chrom, start, end, _, _, strand, *_ = record.split()
            start = int(start)
            end = int(end)
            annot_introns.add((chrom, start, end, strand))
    return annot_introns


def _all_predictions(introns, motifs, lengths, counts,
                     jad_labels, is_primary_donor, is_primary_acceptor,
                     ref_fasta_fn, annot_bed_fn,
                     canonical_motifs, jad_size_threshold,
                     lr_window_size, lr_kfold,
                     lr_low_confidence_threshold,
                     lr_high_confidence_threshold,
                     classifier, keep_all_annot,
                     processes):
    '''
    Takes as input the alignment metrics extracted either from a bam file
    (2passtools score) or a previously created junction bed file (2passtools
    merge). Calculates decision tree score one, then extracts junction
    sequences from the fasta file and calculates decision tree score two.
    '''
    log.info(f'Identified {len(introns):d} introns')
    if annot_bed_fn is None:
        log.info('Applying pretrained filter dt1')
        dt1_labels = dt1_pred(
            motifs, jad_labels, is_primary_donor, is_primary_acceptor,
            motif_regex=canonical_motifs,
            jad_size_threshold=jad_size_threshold,
        )
        log.info(f'{sum(dt1_labels):d} introns pass filter dt1')
        lr_donor_labels, lr_acceptor_labels = predict_splice_junctions_from_seq(
            introns, dt1_labels, ref_fasta_fn,
            lr_window_size, lr_kfold,
            processes
        )
        dt2_labels = dt2_pred(
            jad_labels, is_primary_donor, is_primary_acceptor,
            lr_donor_labels, lr_acceptor_labels,
            lr_low_confidence_threshold, lr_high_confidence_threshold,
            jad_size_threshold=jad_size_threshold
        )
    else:
        log.info(f'Annotated introns file {annot_bed_fn} provided')
        annot_introns = read_annot_juncs_bed(annot_bed_fn)
        log.info(f'Identified {len(annot_introns)} annotated introns')
        # array rather than list, so it can be used for boolean indexing below
        is_annot = np.asarray([i in annot_introns for i in introns])
        dt1_labels = dt1_de_novo_pred(
            motifs, lengths, jad_labels,
            is_primary_donor, is_primary_acceptor,
            is_annot,
            motif_regex=canonical_motifs,
            classifier=classifier
        )
        cm = confusion_matrix(is_annot, dt1_labels)
        log.debug('Decision tree 1 confusion matrix:')
        log.debug(cm)
        lr_donor_labels, lr_acceptor_labels = predict_splice_junctions_from_seq(
            introns, dt1_labels, ref_fasta_fn,
            lr_window_size, lr_kfold,
            processes
        )
        dt2_labels = dt2_de_novo_pred(
            lengths, jad_labels,
            is_primary_donor, is_primary_acceptor,
            lr_donor_labels, lr_acceptor_labels,
            is_annot, classifier=classifier
        )
        cm = confusion_matrix(is_annot, dt2_labels)
        log.debug('Decision tree 2 confusion matrix:')
        log.debug(cm)
    log.info(f'{sum(dt2_labels):d} introns pass filter dt2')
    if annot_bed_fn is not None and keep_all_annot:
        log.info('Adding all annotated introns to results')
        dt1_labels[is_annot] = 1
        dt2_labels[is_annot] = 1
    return (
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        dt1_labels,
        lr_donor_labels, lr_acceptor_labels,
        dt2_labels
    )


def validate_motif_regex(ctx, param, value):
    if not set(value).issubset(set('ACGT|')):
        raise click.BadParameter('unrecognised motifs, use only ACGT and | to separate')
    for m in value.split('|'):
        if not len(m) == 4:
            raise click.BadParameter('all motifs should be 4 nt')
    return value


@click.group()
def main():
    pass


SCORE_MERGE_COMMON_OPTIONS = [
    click.option('-o', '--output-bed-fn', required=True, help='Output file path'),
    click.option('-f', '--ref-fasta-fn', required=True, type=str,
                 help='Path to the fasta file that reads were mapped to'),
    click.option('-a', '--annot-bed-fn', required=False, type=str, default=None,
                 help='Optional BED file containing annotated junctions'),
    click.option('-j', '--jad-size-threshold', default=4,
                 help='JAD to threshold at in the decision tree'),
    click.option('-d', '--primary-splice-local-dist', default=20,
                 help='Distance to search for alternative donor/acceptors when calculating primary d/a'),
    click.option('-m', '--canonical-motifs', default='GTAG|GCAG|ATAG', callback=validate_motif_regex,
                 help=('Intron motifs considered canonical in organism. '
                       'Should be four char DNA motifs separated by vertical bar only')),
    click.option('-w', '--lr-window-size', default=128, type=int,
                 help='Sequence size to extract to train logistic regression models'),
    click.option('-k', '--lr-kfold', default=6, type=int,
                 help='Number of cross validation k-folds for logistic regression models'),
    click.option('-lt', '--lr-low-confidence-threshold', default=0.1, type=float,
                 help='Logistic regression low confidence threshold for decision tree 2'),
    click.option('-ht', '--lr-high-confidence-threshold', default=0.6, type=float,
                 help='Logistic regression high confidence threshold for decision tree 2'),
    click.option('-c', '--classifier-type', default='decision_tree',
                 type=click.Choice(['decision_tree', 'random_forest']),
                 help='When annotated juncs are available, train this classifier type'),
    click.option('--keep-all-annot/--filter-annot', default=True,
                 help='When annotated juncs are available, always keep all annotated juncs'),
]


def _common_options(common_options):
    def _apply_common_options(func):
        for option in reversed(common_options):
            func = option(func)
        return func
    return _apply_common_options


@main.command()
@click.argument('bam-fn', required=True, nargs=1)
@_common_options(SCORE_MERGE_COMMON_OPTIONS)
@click.option('--stranded/--unstranded', default=True,
              help=('Whether input data is stranded or unstranded. '
                    'direct RNA is stranded, cDNA often isn\'t'))
@click.option('-p', '--processes', default=1)
@click.option('-s', '--random-seed', default=None, type=int)
@click_log.simple_verbosity_option(log)
def score(bam_fn, output_bed_fn, ref_fasta_fn, annot_bed_fn,
          jad_size_threshold,
          primary_splice_local_dist, canonical_motifs,
          lr_window_size, lr_kfold,
          lr_low_confidence_threshold, lr_high_confidence_threshold,
          classifier_type, keep_all_annot, stranded, processes, random_seed):
    '''
    2passtools score: A tool for extracting and scoring junctions from a bam
    file aligned with minimap2. Filtered junctions can be used to realign
    reads in a second pass with minimap2.

    Bam file must be mapped with minimap2 and have the long form CS tag, e.g.

    minimap2 -a --cs=long -k14 -x splice ref.fa reads.fq
    '''

    if random_seed is not None:
        np.random.seed(random_seed)

    log.info(f'Parsing BAM file: {bam_fn}')
    (introns, motifs, lengths,
     counts, jad_labels,
     is_primary_donor, is_primary_acceptor) = parse_introns(
        bam_fn,
        primary_splice_local_dist,
        stranded,
        1_000_000, processes
    )
    res = zip(*_all_predictions(
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        ref_fasta_fn, annot_bed_fn,
        canonical_motifs, jad_size_threshold,
        lr_window_size, lr_kfold,
        lr_low_confidence_threshold,
        lr_high_confidence_threshold,
        classifier_type, keep_all_annot, processes
    ))
    log.info(f'Writing results to {output_bed_fn}')
    with open(output_bed_fn, 'w') as bed:
        for i, motif, _, c, jad, pd, pa, d1, lrd, lra, d2 in res:
            chrom, start, end, strand = i
            bed.write(
                f'{chrom:s}\t{start:d}\t{end:d}\t{motif:s}\t{c:d}\t{strand:s}\t'
                f'{jad:d}\t{pd:d}\t{pa:d}\t{d1:d}\t'
                f'{lrd:.3f}\t{lra:.3f}\t{d2:d}\n'
            )


@main.command()
@click.argument('bed-fns', required=True, nargs=-1)
@_common_options(SCORE_MERGE_COMMON_OPTIONS)
@click.option('-p', '--processes', default=1)
@click.option('-s', '--random-seed', default=None, type=int)
@click_log.simple_verbosity_option(log)
def merge(bed_fns, output_bed_fn, ref_fasta_fn, annot_bed_fn,
          jad_size_threshold, primary_splice_local_dist, canonical_motifs,
          lr_window_size, lr_kfold,
          lr_low_confidence_threshold, lr_high_confidence_threshold,
          classifier_type, keep_all_annot, processes, random_seed):
    '''
    2passtools merge: Merges bed files produced by 2passtools score on individual
    replicates and recalculates junction strength metrics. Can be used to create
    a unified junction set to realign reads from different replicates.

    Bed files should be in the 13 column format produced by 2passtools score.
    '''

    if random_seed is not None:
        np.random.seed(random_seed)

    log.info(f'Parsing {len(bed_fns):d} BED files')
    (introns, motifs, lengths,
     counts, jad_labels,
     is_primary_donor, is_primary_acceptor) = get_merged_juncs(
        bed_fns, primary_splice_local_dist
    )
    res = zip(*_all_predictions(
        introns, motifs, lengths, counts, jad_labels,
        is_primary_donor, is_primary_acceptor,
        ref_fasta_fn, annot_bed_fn,
        canonical_motifs, jad_size_threshold,
        lr_window_size, lr_kfold,
        lr_low_confidence_threshold,
        lr_high_confidence_threshold,
        classifier_type, keep_all_annot, processes
    ))
    log.info(f'Writing results to {output_bed_fn}')
    with open(output_bed_fn, 'w') as bed:
        for i, motif, _, c, jad, pd, pa, d1, lrd, lra, d2 in res:
            chrom, start, end, strand = i
            bed.write(
                f'{chrom:s}\t{start:d}\t{end:d}\t{motif:s}\t{c:d}\t{strand:s}\t'
                f'{jad:d}\t{pd:d}\t{pa:d}\t{d1:d}\t'
                f'{lrd:.3f}\t{lra:.3f}\t{d2:d}\n'
            )


@main.command()
@click.argument('bed-fn', nargs=1, required=True)
@click.option('-o', '--output-bed-fn', required=True)
@click.option('--exprs', required=False, default="decision_tree_2_pred")
@click_log.simple_verbosity_option(log)
def filter(bed_fn, output_bed_fn, exprs):
    '''
    2passtools filter: Convenience tool to filter a junction bed and produce
    6-column bed format which is compatible with minimap2.
    '''
    with open(output_bed_fn, 'w') as bed:
        for chrom, start, end, strand, decision in apply_eval_expression(bed_fn, exprs):
            if decision:
                record = f'{chrom}\t{start}\t{end}\tintron\t0\t{strand}\n'
                bed.write(record)


@main.command()
@click_log.simple_verbosity_option(log)
def mm2pass():
    raise NotImplementedError('TODO: implement convenience tool to wrap '
                              'minimap2 and run two pass alignment')


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------