├── .gitignore ├── LICENSE ├── README.md ├── logo.png ├── setup.py └── snapper ├── __init__.py ├── snapper.py └── src ├── data_processing.py ├── methods.py ├── motif_extraction.py ├── plotting.py ├── seq_processing.py ├── statistics_methods.py └── type_I_RM_system.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Snapper Results 132 | Results_* 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Dmitry N. 
Konanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Snapper: nanopore-based modification motifs caller 4 | 5 | This tool is designed to efficiently detect methylation sites using ONT sequencing data. 6 | Snapper uses balanced approach to compute statistics for each k-mer which is likely to be modified. 7 | The core feature of Snapper in comparison with other tools is a new high-sensitive greedy algorithm that is used 8 | for position-specific motif enrichment. This repository contains not the Snapper tool itself but its pip distribution. 9 | 10 | ## Dependencies 11 | - python 3.7 (later versions might be incompatible because of inner biopython dependencies) 12 | - ont-tombo 13 | - h5py 14 | - biopython 15 | - matplotlib 16 | - scipy 17 | - seaborn 18 | 19 | ## Installation 20 | 21 | ``` 22 | (base) $ conda create -n snapper python=3.7 23 | (base) $ conda activate snapper 24 | (snapper) $ conda install -c bioconda ont-fast5-api ont-tombo 25 | (snapper) $ pip install snapper-ont 26 | ``` 27 | 28 | ## Usage 29 | 30 | Firstly, fast5 files should be resquiggled using [Tombo](https://github.com/nanoporetech/tombo) software. 31 | After resquiggling, fast5 files should be converted to the multi-fast5 format using [ont_fast5_api](https://github.com/nanoporetech/ont_fast5_api). 32 | 33 | A more detailed usage guideline and few usercases are available in [Snapper's documentation](https://snapper-tutorial.readthedocs.io/en/latest/index.html) 34 | 35 | ``` 36 | usage: snapper [-h] -sample_fast5dir SAMPLE_FAST5DIR -control_fast5dir 37 | CONTROL_FAST5DIR -reference REFERENCE [-ks_t KS_T] 38 | [-outdir OUTDIR] [-coverage COVERAGE] [-threads THREADS] 39 | [-k_size K_SIZE] [-long_k_size LONG_K_SIZE] 40 | [-max_motifs MAX_MOTIFS] [-min_conf MIN_CONF] 41 | [-target_chr TARGET_CHR] 42 | 43 | optional arguments: 44 | -h, --help show this help message and exit 45 | -sample_fast5dir SAMPLE_FAST5DIR 46 | sample multi fast5 dir 47 | -control_fast5dir CONTROL_FAST5DIR 48 | control multi fast5 dir 49 | -reference REFERENCE reference genome in the fasta format 50 | -ks_t KS_T -log ks_test p-value (default 3). 
51 | -outdir OUTDIR output directory name 52 | -coverage COVERAGE minimal genome coverage depth (default 40) 53 | -threads THREADS number of threads used (default 8) 54 | -k_size K_SIZE k-mer size, must be odd, 55 | should not be less than 11 (default 15) 56 | -long_k_size LONG_K_SIZE 57 | k-mer size, must be odd, 58 | should not be less than 21 (default 29) 59 | -max_motifs MAX_MOTIFS 60 | the maximum expected number of motifs extracted 61 | -min_conf MIN_CONF the minimal confidence value (default is 100) 62 | -target_chr TARGET_CHR 63 | target chromosome name (by default all 64 | contigs/replicons are considered) 65 | 66 | 67 | ``` 68 | 69 | 70 | Typical run command: 71 | ``` 72 | snapper -sample_fast5dir ../HelicobacterMod/fast5/J99_multi/ -control_fast5dir ../HelicobacterMod/fast5/J99_wga_multi/ -reference ../HelicobacterMod/genome/J99.fasta 73 | ``` 74 | 75 | ## Output explanation 76 | 77 | The output directory contains the following files: 78 | - `passed_motifs_[strand]_[contig_name].fasta` - all k-mers that most likely bring a modified base 79 | - `final_motifs_[strand]_[contig_name].fasta` - optimal set of motifs generated from the passed motifs by the Snapper greedy algorithm 80 | - `plots_[strand]_[contig_name]` - signal distribution plots for each extracted motif 81 | 82 | ## Citation 83 | 84 | Dmitry N Konanov, Vladislav V Babenko, Aleksandra M Belova, Arina G Madan, Daria I Boldyreva, Oksana E Glushenko, Ivan O Butenko, Dmitry E Fedorov, Alexander I Manolov, Danil V Krivonos, Vassilii N Lazarev, Vadim M Govorun, Elena N Ilina, [Snapper: high-sensitive detection of methylation motifs based on Oxford Nanopore reads](https://doi.org/10.1093/bioinformatics/btad702), Bioinformatics, 2023 85 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DNKonanov/Snapper/29e659247091ff41e74a1cae7380356445da4020/logo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="snapper-ont", 8 | version="0.4.5", 9 | author="D.N. 
Konanov", 10 | author_email="konanovdmitriy@gmail.com", 11 | description="Nanopore-based methylation sites caller", 12 | long_description="snapper", 13 | long_description_content_type="", 14 | url="https://github.com/DNKonanov/Snapper", 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/DNKonanov/Snapper", 17 | }, 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | python_requires=">=3.7", 24 | include_package_data=True, 25 | packages=['snapper', 'snapper.src'], 26 | install_requires=[ 27 | 'h5py', 28 | 'biopython', 29 | 'matplotlib', 30 | 'scipy', 31 | 'seaborn', 32 | 'tqdm' 33 | ], 34 | entry_points={ 35 | 'console_scripts': [ 36 | 'snapper=snapper.snapper:main' 37 | ] 38 | } 39 | ) 40 | -------------------------------------------------------------------------------- /snapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DNKonanov/Snapper/29e659247091ff41e74a1cae7380356445da4020/snapper/__init__.py -------------------------------------------------------------------------------- /snapper/snapper.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import os 3 | import sys 4 | 5 | import warnings 6 | warnings.filterwarnings("ignore") 7 | 8 | 9 | 10 | 11 | def main(): 12 | 13 | parser = ArgumentParser() 14 | 15 | parser.add_argument('-sample_fast5dir', type=str, help='sample multi fast5 dir', required=True) 16 | parser.add_argument('-control_fast5dir', type=str, help='control multi fast5 dir', required=True) 17 | parser.add_argument('-reference', type=str, help='reference genome in the fasta format', required=True) 18 | parser.add_argument('-ks_t', type=int, default=3, help='-log ks_test p-value (default 3).') 19 | parser.add_argument('-outdir', type=str, default='default', help='output directory name') 20 | parser.add_argument('-coverage', type=float, help='minimal genome coverage depth (default 40)', default=40) 21 | parser.add_argument('-threads', type=int, default=8, help='number of threads used (default 8)') 22 | parser.add_argument('-k_size', type=int, default=15, help='k-mer size, must be odd, should not be less than 11 (default 15)') 23 | parser.add_argument('-long_k_size', type=int, default=29, help='long k-mer size, must be odd, should not be less than 21 (default 29)') 24 | parser.add_argument('-max_motifs', help='the maximum expected number of motifs extracted (default 20)', default=20, type=int) 25 | parser.add_argument('-min_conf', help='the minimal confidence value (default is 100)', type=float, default=100) 26 | parser.add_argument('-target_chr', help='target chromosome name (by default all contigs/replicons are considered)', type=str, default='all') 27 | 28 | 29 | 30 | from snapper.src.motif_extraction import extract_motifs 31 | from snapper.src.plotting import plot_motif, plot_coverage, plot_dist 32 | from snapper.src.data_processing import get_reference, parse_data 33 | from snapper.src.statistics_methods import get_difsignals, get_statistics 34 | from snapper.src.methods import save_results, save_k_mers 35 | from snapper.src.statistics_methods import SAMPLESIZE, MINSAMPLESIZE 36 | 37 | if len(sys.argv)==1: 38 | parser.print_help(sys.stderr) 39 | sys.exit(1) 40 | 41 | 42 | 43 | 44 | args = parser.parse_args() 45 | 46 | if args.k_size%2 == 0 or args.long_k_size%2 == 0: 47 | raise ValueError('Both 
-k_size and -long_k_size must be odd numbers') 48 | 49 | if args.k_size < 11: 50 | raise ValueError('-k_size parameter should not be less than 11') 51 | 52 | if args.long_k_size < 21: 53 | raise ValueError('-long_k_size parameter should not be less than 21') 54 | 55 | if args.k_size >= args.long_k_size: 56 | raise ValueError('K_SIZE should be less than LONG_K_SIZE') 57 | 58 | 59 | 60 | 61 | if args.outdir == 'default': 62 | import datetime 63 | 64 | sp = str(datetime.datetime.now() 65 | ).replace(' ', '_').replace(':', '').replace('-', '_').split('.')[0] 66 | outdir = 'Results_' + sp 67 | 68 | else: 69 | outdir = args.outdir 70 | 71 | try: 72 | os.mkdir(outdir) 73 | except: 74 | raise FileExistsError('The specified output dir already exists!') 75 | 76 | print('\nSample data collecting...') 77 | 78 | sample_motifs, sample_reverse_motifs, sample_long_motifs, sample_long_reverse_motifs, sample_coverages, sample_rev_coverages = parse_data( 79 | args.sample_fast5dir, 80 | args.reference, 81 | target_chr=args.target_chr, 82 | required_coverage=args.coverage, 83 | MOTIF_LEN=args.k_size, 84 | LONG_MOTIF_LEN=args.long_k_size, 85 | ) 86 | 87 | 88 | 89 | print('\nControl data collecting...') 90 | control_motifs, control_reverse_motifs, control_long_motifs, control_long_reverse_motifs, control_coverages, control_rev_coverages = parse_data( 91 | args.control_fast5dir, 92 | args.reference, 93 | target_chr=args.target_chr, 94 | required_coverage=args.coverage, 95 | MOTIF_LEN=args.k_size, 96 | LONG_MOTIF_LEN=args.long_k_size, 97 | ) 98 | 99 | 100 | refs, reverse_refs = get_reference( 101 | args.reference, 102 | target_chr=args.target_chr 103 | ) 104 | 105 | for contig in refs: 106 | print(contig, len(refs[contig])) 107 | 108 | 109 | print('\nForward strand signals processing...') 110 | motifs_lines, ks_stat_lines = get_statistics( 111 | sample_motifs, 112 | control_motifs, 113 | maxsamplesize=SAMPLESIZE, 114 | minsamplesize=MINSAMPLESIZE, 115 | threads=args.threads 116 | ) 117 | 118 | 119 | print('\nReverse strand signals processing...') 120 | reverse_motifs_lines, reverse_ks_stat_lines = get_statistics( 121 | sample_reverse_motifs, 122 | control_reverse_motifs, 123 | maxsamplesize=SAMPLESIZE, 124 | minsamplesize=MINSAMPLESIZE, 125 | threads=args.threads 126 | ) 127 | 128 | 129 | 130 | 131 | # MOTIFS EXTRACTION 132 | 133 | for contig in motifs_lines: 134 | 135 | 136 | print('Processing forward motifs {}...'.format(contig)) 137 | 138 | 139 | 140 | contig_passed_motifs = get_difsignals( 141 | motifs_lines[contig], 142 | ks_stat_lines[contig], 143 | log10_pval_thr = args.ks_t, 144 | ) 145 | 146 | if len(contig_passed_motifs) < 100: 147 | print('---The number of k-mers is insufficient for the enrichment process. 
{} is skipped.---'.format(contig)) 148 | continue 149 | 150 | 151 | 152 | 153 | 154 | 155 | plotdir = outdir + '/plots_forward_{}'.format(contig) 156 | os.mkdir(plotdir) 157 | 158 | save_k_mers(contig_passed_motifs, outdir + '/passed_motifs_forward_{}.fasta'.format(contig)) 159 | motifs = extract_motifs(contig_passed_motifs, 160 | refs[contig], 161 | outdir, 162 | args.max_motifs, 163 | args.min_conf, 164 | 'forward_' + contig, 165 | 166 | sample_motifs[contig], 167 | control_motifs[contig], 168 | sample_long_motifs[contig], 169 | control_long_motifs[contig], 170 | args.k_size, 171 | args.long_k_size, 172 | args.ks_t, 173 | 174 | threads=args.threads, 175 | lenmotif=args.k_size 176 | ) 177 | 178 | 179 | for motif in motifs: 180 | plot_dist(motif, sample_motifs[contig], control_motifs[contig], plotdir, lenmotif=args.k_size) 181 | 182 | 183 | 184 | save_results(motifs, outdir + '/final_motifs_forward_{}.fasta'.format(contig)) 185 | plot_coverage(sample_coverages[contig], control_coverages[contig], contig, f'{outdir}/coverage_forward_{contig}.pdf') 186 | 187 | 188 | for contig in reverse_motifs_lines: 189 | 190 | print(contig, len(reverse_refs[contig])) 191 | print('Processing reversed motifs {}...'.format(contig)) 192 | 193 | 194 | contig_passed_motifs = get_difsignals( 195 | reverse_motifs_lines[contig], 196 | reverse_ks_stat_lines[contig], 197 | log10_pval_thr = args.ks_t, 198 | ) 199 | 200 | 201 | if len(contig_passed_motifs) < 100: 202 | print('---The number of k-mers is insufficient for the enrichment process. {}(reverse) is skipped.---'.format(contig)) 203 | continue 204 | 205 | 206 | plotdir = outdir + '/plots_reverse_{}'.format(contig) 207 | os.mkdir(plotdir) 208 | 209 | save_k_mers(contig_passed_motifs, outdir + '/passed_motifs_reverse_{}.fasta'.format(contig)) 210 | motifs = extract_motifs(contig_passed_motifs, 211 | reverse_refs[contig], 212 | outdir, 213 | args.max_motifs, 214 | args.min_conf, 215 | 'reverse_' + contig, 216 | 217 | sample_reverse_motifs[contig], 218 | control_reverse_motifs[contig], 219 | sample_long_reverse_motifs[contig], 220 | control_long_reverse_motifs[contig], 221 | args.k_size, 222 | args.long_k_size, 223 | args.ks_t, 224 | 225 | threads=args.threads, 226 | lenmotif=args.k_size 227 | ) 228 | 229 | 230 | 231 | 232 | for motif in motifs: 233 | plot_dist(motif, sample_reverse_motifs[contig], control_reverse_motifs[contig], plotdir, lenmotif=args.k_size) 234 | 235 | 236 | 237 | save_results(motifs, outdir + '/final_motifs_reverse_{}.fasta'.format(contig)) 238 | plot_coverage(sample_rev_coverages[contig], control_rev_coverages[contig], contig, f'{outdir}/coverage_reverse_{contig}.pdf') 239 | 240 | 241 | print('Done!') 242 | 243 | 244 | if __name__ == '__main__': 245 | main() -------------------------------------------------------------------------------- /snapper/src/data_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | from unittest import result 3 | from webbrowser import get 4 | import numpy as np 5 | import h5py 6 | from itertools import product 7 | from Bio.SeqIO import parse 8 | from Bio.Seq import reverse_complement 9 | import os 10 | from tqdm import tqdm 11 | 12 | letters = ['A','G','C','T'] 13 | 14 | 15 | def get_reference(reference_file, target_chr='all'): 16 | 17 | ref_file = parse(reference_file, format='fasta') 18 | 19 | 20 | if target_chr == 'all': 21 | 22 | refs = {} 23 | reverse_refs = {} 24 | for rec in ref_file: 25 | 26 | seq = str(rec.seq) 27 | contig = 
str(rec.description).split(' ')[0] 28 | refs[contig] = seq 29 | reverse_refs[contig] = reverse_complement(seq) 30 | return refs, reverse_refs 31 | 32 | if target_chr == 'longest': 33 | length = 0 34 | 35 | refs = {} 36 | reverse_refs = {} 37 | 38 | for rec in ref_file: 39 | 40 | contig = str(rec.description).split(' ')[0] 41 | if len(rec.seq) > length: 42 | seq = str(rec.seq) 43 | 44 | refs = {contig : seq} 45 | reverse_refs = {contig : reverse_complement(seq)} 46 | length = str(rec.seq) 47 | 48 | return refs, reverse_refs 49 | 50 | else: 51 | refs = {} 52 | reverse_refs = {} 53 | for rec in ref_file: 54 | contig = str(rec.description).split(' ')[0] 55 | if contig == target_chr: 56 | seq = str(rec.seq) 57 | 58 | refs[target_chr] = seq 59 | reverse_refs[target_chr] = reverse_complement(seq) 60 | 61 | if len(refs) == 0: 62 | raise KeyError('{} contig does not exist!'.format(target_chr)) 63 | 64 | return refs, reverse_refs 65 | 66 | 67 | 68 | def get_max_replicon(refs): 69 | 70 | length = 0 71 | max_chrom = 0 72 | for chrom in refs: 73 | if len(refs[chrom]) > length: 74 | length = len(refs[chrom]) 75 | max_chrom = chrom 76 | 77 | return max_chrom 78 | 79 | 80 | 81 | def _get_shifts(MOTIF_LEN): 82 | 83 | left_shift = int(np.floor(MOTIF_LEN/2)) 84 | right_shift = int(np.ceil(MOTIF_LEN/2)) 85 | 86 | return left_shift, right_shift 87 | 88 | 89 | def parse_data(fast5dir, reference_file, target_chr='all', required_coverage=30, MOTIF_LEN=11, LONG_MOTIF_LEN=29): 90 | 91 | refs, reverse_refs = get_reference(reference_file, target_chr) 92 | 93 | coverages = { 94 | ref: np.zeros(len(refs[ref])) for ref in refs 95 | } 96 | 97 | rev_coverages = { 98 | ref: np.zeros(len(refs[ref])) for ref in reverse_refs 99 | } 100 | 101 | l, r = _get_shifts(MOTIF_LEN) 102 | 103 | long_l, long_r = _get_shifts(LONG_MOTIF_LEN) 104 | 105 | motifs = {} 106 | reverse_motifs = {} 107 | 108 | long_motifs = {} 109 | long_reverse_motifs = {} 110 | 111 | 112 | 113 | 114 | for ref in refs: 115 | motifs[ref] = {} 116 | reverse_motifs[ref] = {} 117 | 118 | long_motifs[ref] = {} 119 | long_reverse_motifs[ref] = {} 120 | 121 | 122 | files = [file for file in os.listdir(fast5dir) if '.fast5' in file] 123 | 124 | batch = 1 125 | 126 | max_chrom = get_max_replicon(refs) 127 | 128 | 129 | for f in files: 130 | print('Batch {} out of {}...'.format(batch, len(files))) 131 | 132 | batch += 1 133 | 134 | 135 | try: 136 | 137 | with h5py.File('{}/{}'.format(fast5dir, f), 'r', rdcc_nbytes=1024**3) as file: 138 | 139 | for i in tqdm(list(file.items()), leave=False, ncols=75): 140 | 141 | 142 | readname = i[0] 143 | try: 144 | trace = file['/{}/Analyses/RawGenomeCorrected_000/BaseCalled_template/Events'.format(readname)][:] 145 | 146 | except KeyError: 147 | continue 148 | 149 | chrom = file['/{}/Analyses/RawGenomeCorrected_000/BaseCalled_template/Alignment'.format(readname)].attrs['mapped_chrom'] 150 | 151 | if chrom not in motifs: 152 | continue 153 | seq = [t[4].decode() for t in trace] 154 | 155 | str_seq = ''.join(seq).upper() 156 | 157 | 158 | f = refs[chrom].find(str_seq) 159 | 160 | 161 | if f != -1: 162 | 163 | for i in range(l, len(seq)-r): 164 | context = str_seq[i-l:i+r] 165 | 166 | if context not in motifs[chrom]: 167 | motifs[chrom][context] = [] 168 | 169 | motifs[chrom][context].append(trace[i][0]) 170 | 171 | 172 | for i in range(long_l, len(seq) - long_r): 173 | long_context = str_seq[i-long_l:i+long_r] 174 | 175 | if long_context not in long_motifs[chrom]: 176 | long_motifs[chrom][long_context] = [] 177 | 178 | 
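                            # trace[i][0] is the per-base normalized signal level taken from the
                            # Tombo resquiggle Events table (its first column, norm_mean in Tombo's
                            # output); it is collected under the LONG_MOTIF_LEN-sized sequence
                            # context centred on position i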
long_motifs[chrom][long_context].append(trace[i][0]) 179 | 180 | if chrom == max_chrom: 181 | coverages[chrom][f:f+len(seq)] += 1 182 | 183 | continue 184 | 185 | 186 | 187 | 188 | f_reverse = reverse_refs[chrom].find(str_seq) 189 | if f_reverse != -1: 190 | 191 | 192 | for i in range(l, len(seq)-r): 193 | context = str_seq[i-l:i+r] 194 | 195 | if context not in reverse_motifs[chrom]: 196 | reverse_motifs[chrom][context] = [] 197 | 198 | reverse_motifs[chrom][context].append(trace[i][0]) 199 | 200 | 201 | for i in range(long_l, len(seq) - long_r): 202 | long_context = str_seq[i-long_l:i+long_r] 203 | 204 | if long_context not in long_reverse_motifs[chrom]: 205 | long_reverse_motifs[chrom][long_context] = [] 206 | 207 | long_reverse_motifs[chrom][long_context].append(trace[i][0]) 208 | 209 | 210 | 211 | if chrom == max_chrom: 212 | rev_coverages[chrom][f_reverse:f_reverse+len(seq)] += 1 213 | continue 214 | except KeyboardInterrupt: 215 | import sys 216 | sys.exit() 217 | pass 218 | except: 219 | print('Invalid batch!') 220 | continue 221 | 222 | 223 | current_forward_coverage = np.round(np.mean(coverages[max_chrom]), 2) 224 | current_reverse_coverage = np.round(np.mean(rev_coverages[max_chrom]), 2) 225 | 226 | 227 | if min(current_forward_coverage, current_reverse_coverage) > required_coverage: 228 | break 229 | 230 | print(f'Current forward coverage {current_forward_coverage}X ; reverse coverage {current_reverse_coverage}X') 231 | 232 | print(f'Final coverage depth: forward {current_forward_coverage}X ; reverse {current_reverse_coverage}X (with {required_coverage}X threshold)') 233 | return motifs, reverse_motifs, long_motifs, long_reverse_motifs, coverages, rev_coverages 234 | 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /snapper/src/methods.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import combinations, product 3 | import re 4 | from snapper.src.seq_processing import gen_variants, letter_codes_rev, letter_anticodes 5 | from scipy.stats import chi2_contingency, mode 6 | from tqdm import tqdm 7 | 8 | 9 | from multiprocessing import Process, Manager 10 | 11 | 12 | 13 | regular_letters = ['A','G','C','T'] 14 | non_regular_letters = ['M', 'R', 'W', 'S', 'Y','K', 'V', 'H', 'D','B'] 15 | 16 | 17 | def filter_pos_variants_l3(pos_variants): 18 | 19 | filtered_pos_variants = [] 20 | 21 | for pos_variant in pos_variants: 22 | _p = sorted(pos_variant) 23 | 24 | if _p[-1] - _p[0] >= 6: 25 | continue 26 | 27 | if tuple(pos_variant) not in filtered_pos_variants: 28 | filtered_pos_variants.append(tuple(_p)) 29 | 30 | return filtered_pos_variants 31 | 32 | 33 | 34 | def filter_pos_variants(pos_variants): 35 | 36 | 37 | # custom filtering for pos_variants with length of 3 38 | if len(pos_variants[0]) == 3: 39 | return filter_pos_variants_l3(pos_variants) 40 | 41 | 42 | 43 | filtered_pos_variants = [] 44 | for pos_variant in pos_variants: 45 | 46 | _p = sorted(pos_variant) 47 | 48 | if _p[-1] - _p[0] >= 6: 49 | continue 50 | 51 | filtered_pos_variants.append(_p) 52 | 53 | _2_filtered_pos_variants = [] 54 | 55 | 56 | for pos_variant in filtered_pos_variants: 57 | 58 | #for i in range(1, len(pos_variant) - 1): 59 | # if (pos_variant[i] - pos_variant[i-1] > 1) and (pos_variant[i+1] - pos_variant[i] > 1): 60 | # continue 61 | # 62 | if tuple(pos_variant) in _2_filtered_pos_variants: 63 | continue 64 | 65 | #if pos_variant[1] - pos_variant[0] > 1 or 
pos_variant[-1] - pos_variant[-2] > 1: 66 | # continue 67 | 68 | _2_filtered_pos_variants.append(tuple(pos_variant)) 69 | 70 | 71 | return _2_filtered_pos_variants 72 | 73 | 74 | 75 | def filter_motifs(motif_variants): 76 | filtered_motifs = [] 77 | 78 | for motif in motif_variants: 79 | if 'C' not in motif and 'A' not in motif: 80 | continue 81 | filtered_motifs.append(motif) 82 | 83 | return filtered_motifs 84 | 85 | 86 | 87 | 88 | def extract_template_subset(pos_variant, motif_variant, seq_array): 89 | subseq = seq_array 90 | for i in range(len(pos_variant)): 91 | if motif_variant[i] == '.': 92 | continue 93 | 94 | subseq = subseq[subseq[:,pos_variant[i]] == motif_variant[i]] 95 | 96 | return subseq 97 | 98 | 99 | def extract_template_count(pos_variant, motif_variant, seq_array): 100 | 101 | subseq = seq_array 102 | for i in range(len(pos_variant)): 103 | subseq = subseq[subseq[:,pos_variant[i]] == motif_variant[i]] 104 | 105 | return len(subseq) 106 | 107 | 108 | def gen_regexp_template(motif_variant, pos_variant, length=6): 109 | 110 | template = ['.',]*length 111 | 112 | base_pos = pos_variant[0] 113 | for i, pos in enumerate(pos_variant): 114 | template[pos-base_pos] = motif_variant[i] 115 | 116 | return ''.join(template) 117 | 118 | def normalized_variation(array): 119 | return np.std(array)/np.mean(array) 120 | 121 | 122 | def local_filter_seqs(seqs, pos_variant, motif_variant): 123 | 124 | new_seqs = [] 125 | template = ''.join(motif_variant) 126 | template = template.replace('.', 'N') 127 | 128 | template_subvariants = gen_variants(template) 129 | 130 | for s in seqs: 131 | 132 | str_vec = ''.join([s[i] for i in pos_variant]) 133 | if str_vec in template_subvariants: 134 | continue 135 | 136 | new_seqs.append(s) 137 | 138 | return new_seqs 139 | 140 | 141 | def modify_seq(seq, pos, target_letter): 142 | 143 | newseq = list(seq) 144 | newseq[pos] = target_letter 145 | 146 | return ''.join(newseq) 147 | 148 | 149 | def generate_reference_freqs_parallel(seq_array, batch, dict_per_length): 150 | 151 | for pos_variant, motif_variant in batch: 152 | variant_count = extract_template_count(pos_variant, motif_variant, seq_array) 153 | dict_per_length[(motif_variant, pos_variant)] = variant_count 154 | 155 | 156 | def generate_reference_freqs(reference, length, threads, lengths=(4,5,6)): 157 | 158 | variants_counter = {} 159 | 160 | seqs = list(set([ 161 | reference[i:i+length] for i in range(len(reference) - length) 162 | ])) 163 | 164 | seq_array = np.array([list(s) for s in seqs]) 165 | 166 | print(len(seq_array)) 167 | for LENGTH in lengths: 168 | 169 | print('Reference indexing with length of {}...'.format(LENGTH)) 170 | 171 | manager = Manager() 172 | dict_per_length = manager.dict() 173 | 174 | pos_variants = list(combinations(range(0,length), r=LENGTH)) 175 | pos_variants = filter_pos_variants(pos_variants) 176 | 177 | motif_variants = list(product(regular_letters, repeat=LENGTH)) 178 | motif_variants = filter_motifs(motif_variants) 179 | 180 | batch_len = len(pos_variants)*len(motif_variants)//threads 181 | 182 | processes = [] #all processes 183 | for i in range(threads+1): 184 | try: 185 | batch = list(product(pos_variants, motif_variants))[(i)*batch_len:(i+1)*batch_len] 186 | except IndexError: 187 | batch = list(product(pos_variants, motif_variants))[(i)*batch_len:] 188 | p = Process(target=generate_reference_freqs_parallel, 189 | args = (seq_array, batch, dict_per_length,)) 190 | 191 | processes.append(p) 192 | p.start() 193 | 194 | #join processes 195 | [p.join() for p in 
processes] 196 | 197 | variants_counter[LENGTH] = dict(dict_per_length) 198 | 199 | return variants_counter, len(seq_array) 200 | 201 | 202 | 203 | 204 | def add_N(motif): 205 | 206 | if motif[0] != 'N': 207 | motif = 'N' + motif 208 | 209 | if motif[-1] != 'N': 210 | motif += 'N' 211 | 212 | return motif 213 | 214 | 215 | def is_superset(motif1, motif2, edgelength=2): 216 | 217 | motif1 = add_N(motif1) 218 | motif2 = add_N(motif2) 219 | 220 | if len(motif2) <= len(motif1): 221 | extended_motif1 = motif1 222 | 223 | else: 224 | extended_motif1 = 'N' * edgelength + motif1 + 'N' * edgelength 225 | 226 | motif1_variants = gen_variants(extended_motif1) 227 | motif2_variatns = gen_variants(motif2) 228 | 229 | global_match = True 230 | for variant2 in motif2_variatns: 231 | match = False 232 | for variant1 in motif1_variants: 233 | if variant2 in variant1: 234 | match = True 235 | break 236 | if match == False: 237 | global_match = False 238 | break 239 | 240 | 241 | 242 | return global_match 243 | 244 | 245 | def get_alternate_variants(motif_variant, lenmotif=11, range_of_filtering=5): 246 | 247 | seq_variant, pos_variant = motif_variant[1], motif_variant[2] 248 | 249 | while seq_variant[0] == 'N': 250 | seq_variant = seq_variant[1:] 251 | pos_variant = pos_variant[1:] 252 | 253 | while seq_variant[-1] == 'N': 254 | seq_variant = seq_variant[:-1] 255 | pos_variant = pos_variant[:-1] 256 | 257 | 258 | alternate_variants = [] 259 | 260 | 261 | for i in range( 262 | max(0, pos_variant[0] - range_of_filtering), 263 | min(lenmotif, pos_variant[-1] + range_of_filtering) 264 | ): 265 | 266 | shift = i - pos_variant[0] 267 | 268 | pos_alternate = tuple(j+shift for j in pos_variant) 269 | if pos_alternate[-1] >= lenmotif: 270 | break 271 | 272 | alternate_variants.append((motif_variant[0], seq_variant, pos_alternate)) 273 | 274 | 275 | return alternate_variants 276 | 277 | 278 | 279 | 280 | 281 | def is_subset(motif1, motif2, edgelength=2): 282 | return is_superset(motif2, motif1, edgelength=edgelength) 283 | 284 | 285 | def variant_counts_parallel(seq_array, ref_motifs_counter, N_REF, batch, LENGTH, total_variants_counter_list): 286 | variants_counter_list = [] 287 | N_VARIANT = len(seq_array) 288 | for pos_variant, motif_variant in batch: 289 | try: 290 | reference_count = ref_motifs_counter[LENGTH][(motif_variant, pos_variant)] 291 | 292 | except KeyError: 293 | variants_counter_list.append((0, motif_variant, pos_variant)) 294 | 295 | else: 296 | variant_count = extract_template_count(pos_variant, motif_variant, seq_array) 297 | 298 | 299 | if variant_count == 0 and reference_count == 0: 300 | variants_counter_list.append((0, motif_variant, pos_variant)) 301 | 302 | else: 303 | chi2_result = chi2_contingency( 304 | [ 305 | [variant_count, N_VARIANT-variant_count], 306 | [reference_count, N_REF-reference_count], 307 | ] 308 | ) 309 | 310 | # chi2_log_pval = -np.log10(chi2_result[1]) 311 | chi2_statistic = chi2_result[0] 312 | 313 | variants_counter_list.append((chi2_statistic, motif_variant, pos_variant)) 314 | 315 | total_variants_counter_list+=variants_counter_list 316 | 317 | 318 | def collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads, lengths=(4,5,6), lenmotif=11): 319 | merged_variants_counter_list = [] 320 | 321 | for LENGTH in lengths: 322 | 323 | print('\tOBSERVING ANCHOR MOTIFS WITH LENGTH OF', LENGTH) 324 | 325 | pos_variants = list(combinations(range(0,lenmotif), r=LENGTH)) 326 | pos_variants = filter_pos_variants(pos_variants) 327 | 328 | motif_variants = 
list(product(regular_letters, repeat=LENGTH)) 329 | motif_variants = filter_motifs(motif_variants) 330 | 331 | #create batch 332 | batch_len = len(pos_variants)*len(motif_variants)//threads 333 | total_variants_counter_list = Manager().list() #for all outputs 334 | 335 | processes = [] #all processes 336 | args_list = list(product(pos_variants, motif_variants)) 337 | 338 | 339 | for i in range(threads+1): 340 | try: 341 | batch = args_list[i*batch_len:(i+1)*batch_len] 342 | except IndexError: 343 | batch = args_list[i*batch_len:] 344 | 345 | p = Process(target=variant_counts_parallel, 346 | args = (seq_array, ref_motifs_counter, N_REF, batch, LENGTH, total_variants_counter_list)) 347 | 348 | processes.append(p) 349 | p.start() 350 | 351 | [p.join() for p in processes] 352 | 353 | merged_variants_counter_list+=list(total_variants_counter_list) # add to 354 | 355 | merged_variants_counter_list.sort(reverse=True) 356 | 357 | 358 | return merged_variants_counter_list 359 | 360 | 361 | def get_significant_letters(sub_seq_array, top_variant, pos, reference, threshold_ratio): 362 | 363 | print('\tLocal motif adjustment...') 364 | 365 | reference_letter_freqs = {'A':0, 'G':0, 'T':0, 'C':0} 366 | variant_subset_letter_freqs = {'A':0, 'G':0, 'T':0, 'C':0} 367 | ref_vs_variant_ratios = {'A':0, 'G':0, 'T':0, 'C':0} 368 | 369 | variant_length = (top_variant[2][-1] - top_variant[2][0] + 1) 370 | re_variant = gen_regexp_template(top_variant[1], top_variant[2], length=variant_length) 371 | 372 | pos_letters = sub_seq_array[:,pos] 373 | 374 | for letter in reference_letter_freqs: 375 | re_variant_mod = modify_seq(re_variant, pos-top_variant[2][0], letter) 376 | ref_letter_count = len(re.findall(re_variant_mod, reference)) 377 | 378 | variant_subset_letter_count = len(pos_letters[pos_letters == letter]) 379 | 380 | reference_letter_freqs[letter] += ref_letter_count 381 | variant_subset_letter_freqs[letter] += variant_subset_letter_count 382 | 383 | 384 | list_variant_letter_freqs = [ 385 | (variant_subset_letter_freqs[k], k) for k in variant_subset_letter_freqs 386 | ] 387 | 388 | list_variant_letter_freqs.sort(reverse=True) 389 | 390 | # consider the first letter to be presented apriori 391 | the_first_letter = list_variant_letter_freqs[0][1] 392 | 393 | ref_vs_variant_ratios[the_first_letter] = 1 394 | 395 | significant_letters = set([the_first_letter]) 396 | 397 | for record in list_variant_letter_freqs[1:]: 398 | 399 | try: 400 | ref_letter_ratio = reference_letter_freqs[the_first_letter]/reference_letter_freqs[record[1]] 401 | except ZeroDivisionError: 402 | ref_letter_ratio = np.inf 403 | 404 | try: 405 | variant_subset_letter_ratio = variant_subset_letter_freqs[the_first_letter]/variant_subset_letter_freqs[record[1]] 406 | except ZeroDivisionError: 407 | variant_subset_letter_ratio = np.inf 408 | 409 | ref_vs_variant_ratio = variant_subset_letter_ratio/ref_letter_ratio 410 | 411 | ref_vs_variant_ratios[record[1]] = round(ref_vs_variant_ratio, 4) 412 | 413 | if ref_vs_variant_ratio > threshold_ratio: 414 | break 415 | 416 | significant_letters.add(record[1]) 417 | 418 | 419 | return tuple(sorted(list(significant_letters))) 420 | 421 | 422 | def adjust_letter(seq_array, top_variant, pos, reference, threshold_ratio=5): 423 | 424 | sub_seq_array = extract_template_subset(top_variant[2], top_variant[1], seq_array) 425 | 426 | pos_letters = get_significant_letters(sub_seq_array, top_variant, pos, reference, threshold_ratio=threshold_ratio) 427 | 428 | return letter_codes_rev[pos_letters] 429 | 430 | 431 | 
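# change_subset_motif rewrites a motif record (confidence, letters, positions) so that
# it carries the letters of a supermotif it matches as a subset: the most frequent
# alignment offset between the expanded (non-degenerate) variants of the two motifs is
# used to place the supermotif inside the k-mer window, the positions are clipped to
# the hard-coded 11-mer window (indices 0..10), and the submotif's confidence is kept.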
def change_subset_motif(supermotif, submotif, edgelength=2): 432 | 433 | extended_supermotif = 'N'*edgelength + ''.join(supermotif[1]) + 'N'*edgelength 434 | 435 | super_variants = gen_variants(extended_supermotif) 436 | sub_variants = gen_variants(''.join(submotif[1])) 437 | 438 | shifts = [] 439 | for subvariant in sub_variants: 440 | for supervariant in super_variants: 441 | if subvariant in supervariant: 442 | shift = edgelength - supervariant.find(subvariant) 443 | shifts.append(shift) 444 | 445 | shift = mode(shifts).mode[0] 446 | 447 | left_pos = max(0, submotif[2][0] + shift) 448 | right_pos = min(11, submotif[2][0] + shift + len(supermotif[2])) 449 | 450 | 451 | # check left edge case 452 | if shift < 0: 453 | adjusted_subvariant = ( 454 | submotif[0], 455 | supermotif[1][-shift:], 456 | tuple(range(submotif[2][0], submotif[2][0] + len(supermotif[1][-shift:]))) 457 | ) 458 | 459 | 460 | # check rigth edge case 461 | elif submotif[1][-1] in regular_letters and submotif[2][-1] == 10 and supermotif[1][-1] == 'N': 462 | adjusted_subvariant = ( 463 | submotif[0], 464 | supermotif[1][:-1], 465 | tuple(range(left_pos, 11)) 466 | ) 467 | 468 | # common case 469 | else: 470 | adjusted_subvariant = ( 471 | submotif[0], 472 | supermotif[1], 473 | tuple(range(left_pos, right_pos)) 474 | ) 475 | 476 | 477 | # just a patch, must be formalized!! 478 | if len(adjusted_subvariant[1]) != len(adjusted_subvariant[2]): 479 | adjusted_subvariant = [ 480 | submotif[0], 481 | supermotif[1], 482 | tuple(range(left_pos, left_pos + len(supermotif[1]))) 483 | ] 484 | while adjusted_subvariant[2][-1] > 10: 485 | adjusted_subvariant[1] = adjusted_subvariant[1][:-1] 486 | adjusted_subvariant[2] = adjusted_subvariant[2][:-1] 487 | 488 | return tuple(adjusted_subvariant) 489 | 490 | 491 | 492 | def extend_template(top_variant, maxlength=11): 493 | 494 | extended_top_variant = [ top_variant[0], list(top_variant[1]), list(top_variant[2])] 495 | 496 | if top_variant[2][0] != 0: 497 | extended_top_variant[2] = [extended_top_variant[2][0] - 1] + extended_top_variant[2] 498 | extended_top_variant[1] = ['.'] + extended_top_variant[1] 499 | 500 | if top_variant[2][-1] != maxlength-1: 501 | extended_top_variant[2] = extended_top_variant[2] + [extended_top_variant[2][-1] + 1] 502 | extended_top_variant[1] = extended_top_variant[1] + ['.'] 503 | 504 | 505 | variant_length = (extended_top_variant[2][-1] - extended_top_variant[2][0] + 1) 506 | re_variant = gen_regexp_template(extended_top_variant[1], extended_top_variant[2], length=variant_length) 507 | 508 | extended_top_variant = ( 509 | top_variant[0], 510 | tuple(re_variant), 511 | list(range(extended_top_variant[2][0], extended_top_variant[2][-1] + 1)) 512 | ) 513 | 514 | return extended_top_variant 515 | 516 | 517 | 518 | def save_results (motifs, out_fasta): 519 | 520 | with open(out_fasta, 'w') as f: 521 | 522 | cnt = 1 523 | 524 | for m in motifs: 525 | f.write('>MOTIF_{} conflevel={}\n{}\n'.format(cnt, m[0], ''.join(m[1]))) 526 | cnt += 1 527 | 528 | 529 | def save_k_mers (motifs, out_fasta): 530 | with open(out_fasta, 'w') as f: 531 | 532 | cnt = 1 533 | 534 | for m in motifs: 535 | f.write('>MOTIF_{}\n{}\n'.format(cnt, m)) 536 | cnt += 1 537 | -------------------------------------------------------------------------------- /snapper/src/motif_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from Bio.SeqIO import parse 4 | from pickle import dump, load 5 | from 
snapper.src.methods import collect_variant_counts, is_superset, is_subset, local_filter_seqs, adjust_letter, extend_template, generate_reference_freqs, change_subset_motif 6 | from snapper.src.methods import get_alternate_variants 7 | from snapper.src.type_I_RM_system import check_for_completeness 8 | 9 | 10 | def extract_motifs( 11 | seqs, 12 | reference, 13 | savepath, 14 | max_motifs, 15 | min_conf, 16 | contig_name, 17 | 18 | 19 | sample_motifs, 20 | control_motifs, 21 | sample_long_motifs, 22 | control_long_motifs, 23 | k_size, 24 | long_k_size, 25 | ks_t, 26 | 27 | threads=10, 28 | lenmotif=11, 29 | 30 | ): 31 | 32 | 33 | print() 34 | print('Motif enrichment') 35 | print() 36 | 37 | 38 | N_REF = len(set( 39 | [reference[i:i+lenmotif] for i in range(len(reference) - lenmotif)] 40 | )) 41 | 42 | lengths = [3,4,5,6] 43 | 44 | print('Reference indexing...') 45 | ref_motifs_counter, N_REF = generate_reference_freqs(reference, lenmotif, threads, lengths=lengths) 46 | 47 | 48 | 49 | 50 | 51 | ITERATION = 1 52 | 53 | 54 | new_seqs = seqs.copy() 55 | try: 56 | os.mkdir(savepath + '/seq_iter') 57 | os.mkdir(savepath + '/motif_refine') 58 | 59 | except: 60 | pass 61 | 62 | 63 | os.mkdir(savepath + '/seq_iter/{}/'.format(contig_name)) 64 | 65 | with open(savepath + '/seq_iter/{}/seqs_iter_{}.fasta'.format(contig_name, ITERATION), 'w') as fseqiter: 66 | 67 | for seq in new_seqs: 68 | fseqiter.write('>') 69 | fseqiter.write(seq) 70 | fseqiter.write('\n') 71 | fseqiter.write(seq) 72 | fseqiter.write('\n') 73 | 74 | seq_array = np.array([list(s) for s in new_seqs]) 75 | 76 | initial_seq_array = seq_array.copy() 77 | 78 | 79 | MOTIFS_SET = [] 80 | DETAILED_MOTIF_SET = [] 81 | 82 | print(f'ITERATION 1 ({len(seq_array)} unexplained {lenmotif}-mers):') 83 | 84 | 85 | 86 | 87 | variants_counter_list = collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads=threads, lengths=lengths, lenmotif=lenmotif) 88 | 89 | 90 | ITERATION = 2 91 | while variants_counter_list[0][0] > min_conf and len(seq_array) > 0: 92 | 93 | for v in variants_counter_list[:15]: 94 | print('\t', v) 95 | 96 | top_variant = variants_counter_list[0] 97 | 98 | extended_top_variant = extend_template(top_variant, maxlength=lenmotif) 99 | 100 | positions_to_adjust = [] 101 | 102 | for i, pos in enumerate(extended_top_variant[2]): 103 | if extended_top_variant[1][i] == '.': 104 | positions_to_adjust.append((pos, i)) 105 | 106 | modifiable_extended_top_variant = [ 107 | extended_top_variant[0], 108 | list(extended_top_variant[1]), 109 | list(extended_top_variant[2]) 110 | ] 111 | 112 | 113 | for pos in positions_to_adjust: 114 | 115 | adjusted_pos_letter = adjust_letter(initial_seq_array, extended_top_variant, pos[0], reference) 116 | modifiable_extended_top_variant[1][pos[1]] = adjusted_pos_letter 117 | 118 | extended_top_variant = ( 119 | extended_top_variant[0], 120 | tuple(modifiable_extended_top_variant[1]), 121 | tuple(modifiable_extended_top_variant[2]), 122 | ) 123 | 124 | print(extended_top_variant) 125 | 126 | is_superset_check = False 127 | is_subset_check = False 128 | 129 | for i, motif in enumerate(MOTIFS_SET): 130 | is_superset_check = is_superset(motif, ''.join(extended_top_variant[1])) 131 | is_subset_check = is_subset(motif, ''.join(extended_top_variant[1])) 132 | 133 | if is_subset_check: 134 | break 135 | 136 | if is_superset_check: 137 | break 138 | 139 | refine_outdir = f'{savepath}/motif_refine/{contig_name}/{"".join(extended_top_variant[1])}' 140 | 141 | complete_motif = check_for_completeness( 142 | 
extended_top_variant, 143 | sample_motifs, 144 | control_motifs, 145 | sample_long_motifs, 146 | control_long_motifs, 147 | k_size, 148 | long_k_size, 149 | reference, 150 | outputdir=refine_outdir, 151 | log_threshold=ks_t 152 | ) 153 | 154 | 155 | alternate_variants = get_alternate_variants(extended_top_variant, lenmotif=lenmotif) 156 | 157 | print('Filtering seq_set...') 158 | 159 | n_seqs = len(new_seqs) 160 | 161 | for variant in alternate_variants: 162 | if variant[0] > min_conf: 163 | 164 | new_seqs = local_filter_seqs(new_seqs, variant[2], variant[1]) 165 | 166 | 167 | # filter seq_set by top_variant to prevent infinite loop 168 | if len(new_seqs) == n_seqs: 169 | alternate_variants = get_alternate_variants(top_variant) 170 | for variant in alternate_variants: 171 | if variant[0] > min_conf: 172 | 173 | new_seqs = local_filter_seqs(new_seqs, variant[2], variant[1]) 174 | 175 | 176 | MOTIFS_SET.append(''.join(extended_top_variant[1])) 177 | DETAILED_MOTIF_SET.append(extended_top_variant) 178 | 179 | 180 | 181 | print(MOTIFS_SET) 182 | 183 | with open(savepath + '/seq_iter/{}/seqs_iter_{}.fasta'.format(contig_name, ITERATION), 'w') as fseqiter: 184 | 185 | for seq in new_seqs: 186 | fseqiter.write('>') 187 | fseqiter.write(seq) 188 | fseqiter.write('\n') 189 | fseqiter.write(seq) 190 | fseqiter.write('\n') 191 | 192 | 193 | if len(MOTIFS_SET) == max_motifs: 194 | break 195 | 196 | seq_array = np.array([list(s) for s in new_seqs]) 197 | 198 | print(f'ITERATION {ITERATION} ({len(seq_array)} unexplained {lenmotif}-mers):') 199 | ITERATION += 1 200 | 201 | variants_counter_list = collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads=threads, lengths=lengths) 202 | 203 | 204 | return DETAILED_MOTIF_SET -------------------------------------------------------------------------------- /snapper/src/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from snapper.src.seq_processing import letter_codes, gen_variants 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | from random import sample 9 | 10 | 11 | def cohend(d1, d2): 12 | n1, n2 = len(d1), len(d2) 13 | s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1) 14 | s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)) 15 | u1, u2 = np.mean(d1), np.mean(d2) 16 | return (u1 - u2) / s 17 | 18 | 19 | regular_letters = ['A','G','C','T'] 20 | 21 | 22 | 23 | def gen_template(motif_variant, pos_variant, lenmotif): 24 | 25 | template = ['N',]*lenmotif 26 | 27 | for i, pos in enumerate(pos_variant): 28 | template[pos] = motif_variant[i] 29 | 30 | return ''.join(template) 31 | 32 | 33 | 34 | def get_anc_variants(ancMOTIF, lenmotif=15): 35 | 36 | ancMOTIF_c = ancMOTIF 37 | while ancMOTIF_c[0] == 'N': 38 | ancMOTIF_c = ancMOTIF_c[1:] 39 | 40 | while ancMOTIF_c[-1] == 'N': 41 | ancMOTIF_c = ancMOTIF_c[:-1] 42 | 43 | ext_length = lenmotif - len(ancMOTIF_c) 44 | anc_variants = [] 45 | print(ancMOTIF_c) 46 | for i in range(ext_length + 1): 47 | anc_variants.append('N'*i + ancMOTIF_c + 'N'*(ext_length - i)) 48 | 49 | return anc_variants 50 | 51 | def plot_dist(motif, native_motifs, wga_motifs, savepath, lenmotif=15, MAXSAMPLESIZE = 2000): 52 | 53 | print(f'Rendering {"".join(motif[1])}...') 54 | 55 | ancMOTIF_init = gen_template(motif[1], motif[2], lenmotif) 56 | 57 | anc_variants = get_anc_variants(ancMOTIF_init, lenmotif=lenmotif) 58 | 59 | N = len(anc_variants) 60 | fig, axs = plt.subplots(N, 2, 
figsize=(14, 4*N)) 61 | 62 | motif_cnt = 0 63 | for ancMOTIF in anc_variants: 64 | 65 | _wga = [] 66 | _native = [] 67 | 68 | lens = [] 69 | effect_size_dist = [] 70 | 71 | cnt = 0 72 | for MOTIF in gen_variants(ancMOTIF): 73 | 74 | 75 | if MOTIF not in wga_motifs or MOTIF not in native_motifs: 76 | continue 77 | 78 | _wga += wga_motifs[MOTIF] 79 | _native += native_motifs[MOTIF] 80 | 81 | effect_size_dist.append(np.abs(cohend(native_motifs[MOTIF], wga_motifs[MOTIF]))) 82 | 83 | #print(len(_wga), len(_native)) 84 | 85 | if len(effect_size_dist) == 0: 86 | continue 87 | if len(_native) > MAXSAMPLESIZE: 88 | _native = sample(_native, MAXSAMPLESIZE) 89 | 90 | if len(_wga) > MAXSAMPLESIZE: 91 | _wga = sample(_wga, MAXSAMPLESIZE) 92 | 93 | 94 | sns.distplot(x = _wga, hist = False, label='WGA', color='red', ax=axs[motif_cnt][0]) 95 | sns.distplot(x = _native, hist = False, label='native', color='green', ax=axs[motif_cnt][0]) 96 | axs[motif_cnt][0].grid() 97 | 98 | 99 | axs[motif_cnt][0].set_title('{}, confidence = {}\nmed effsize = {}'.format(ancMOTIF, motif[0], np.median(effect_size_dist))) 100 | axs[motif_cnt][0].set_xlabel('Normalized signal') 101 | 102 | axs[motif_cnt][1].hist(effect_size_dist, bins=50, density=True, rwidth=0.8, color='black') 103 | axs[motif_cnt][1].set_xlabel('eff size') 104 | 105 | title = np.round([ 106 | np.percentile(effect_size_dist, 10), 107 | np.percentile(effect_size_dist, 25), 108 | np.percentile(effect_size_dist, 50), 109 | np.percentile(effect_size_dist, 75), 110 | np.percentile(effect_size_dist, 90) 111 | ], 2) 112 | 113 | title = list(map(str, title)) 114 | axs[motif_cnt][1].set_title( 115 | f'{" ".join(title)}' 116 | ) 117 | 118 | axs[motif_cnt][0].legend() 119 | 120 | motif_cnt += 1 121 | 122 | plt.tight_layout() 123 | 124 | plt.savefig(savepath + '/{}.png'.format("".join(motif[1])), format='png', dpi=400) 125 | plt.show() 126 | 127 | 128 | 129 | 130 | def plot_motif(motif, sample_motifs, control_motifs, savepath, lenmotif=11): 131 | 132 | _sample = [] 133 | _control = [] 134 | 135 | 136 | ancMOTIF = gen_template(motif[1], motif[2], lenmotif) 137 | 138 | 139 | effect_size_dist = [] 140 | 141 | for MOTIF in gen_variants(ancMOTIF): 142 | 143 | 144 | if MOTIF not in sample_motifs or MOTIF not in control_motifs: 145 | continue 146 | 147 | 148 | 149 | _sample += sample_motifs[MOTIF] 150 | _control += control_motifs[MOTIF] 151 | 152 | effect_size_dist.append(np.abs(cohend(sample_motifs[MOTIF], control_motifs[MOTIF]))) 153 | 154 | 155 | 156 | plt.figure(figsize=(8,5)) 157 | 158 | plt.grid() 159 | 160 | sns.distplot(x = _control, hist=False, label='Control', color='red') 161 | sns.distplot(x = _sample, hist=False, label='Sample', color='green') 162 | #plt.savefig('tnp/check3.png', dpi=400) 163 | 164 | plt.title('{}, confidence = {}\nmed effsize = {}'.format(ancMOTIF, motif[0], np.median(effect_size_dist))) 165 | 166 | plt.xlim(-5,5) 167 | plt.xlabel('Normalized signal') 168 | plt.legend() 169 | 170 | plt.tight_layout() 171 | plt.savefig(savepath + '/{}.png'.format(ancMOTIF), format='png', dpi=400) 172 | 173 | plt.close() 174 | 175 | 176 | 177 | def plot_coverage(sample_coverage, control_coverage, chrom, output): 178 | 179 | plt.figure(figsize=(25,7)) 180 | 181 | plt.xlabel('chrom position') 182 | plt.ylabel('depth') 183 | 184 | sample_mean_cov = np.round(np.mean(sample_coverage), 0) 185 | control_mean_cov = np.round(np.mean(control_coverage), 0) 186 | 187 | 188 | plt.title(f'{chrom}\nNative mean cov = {sample_mean_cov}X\nControl mean cov = {control_mean_cov}X') 189 
| 190 | plt.plot(sample_coverage, label='Native') 191 | plt.plot(control_coverage, label='Control') 192 | 193 | plt.legend() 194 | 195 | plt.tight_layout() 196 | 197 | plt.savefig(output, format='pdf') -------------------------------------------------------------------------------- /snapper/src/seq_processing.py: -------------------------------------------------------------------------------- 1 | letter_codes = { 2 | 'A': ['A'], 3 | 'C': ['C'], 4 | 'G': ['G'], 5 | 'T': ['T'], 6 | 'M': ['A','C'], 7 | 'R': ['A','G'], 8 | 'W': ['A','T'], 9 | 'S': ['C','G'], 10 | 'Y': ['C','T'], 11 | 'K': ['G','T'], 12 | 'V': ['A','C','G'], 13 | 'H': ['A','C','T'], 14 | 'D': ['A','G','T'], 15 | 'B': ['C','G','T'], 16 | 'N': ['A','C','G','T'] 17 | } 18 | 19 | 20 | letter_anticodes = { 21 | 'A': set(['C', 'G', 'T']), 22 | 'C': set(['A', 'G', 'T']), 23 | 'G': set(['A', 'C', 'T']), 24 | 'T': set(['A', 'C', 'G']), 25 | 'M': set(['G','T']), 26 | 'R': set(['C','T']), 27 | 'W': set(['C','G']), 28 | 'S': set(['A','T']), 29 | 'Y': set(['C','T']), 30 | 'K': set(['A','C']), 31 | 'V': set(['T']), 32 | 'H': set(['G']), 33 | 'D': set(['C']), 34 | 'B': set(['A']), 35 | 'N': set([]) 36 | } 37 | 38 | letter_codes_rev = { 39 | ('A',): 'A', 40 | ('C',): 'C', 41 | ('G',): 'G', 42 | ('T',): 'T', 43 | ('A', 'C'): 'M', 44 | ('A', 'G'): 'R', 45 | ('A', 'T'): 'W', 46 | ('C', 'G'): 'S', 47 | ('C', 'T'): 'Y', 48 | ('G', 'T'): 'K', 49 | ('A', 'C', 'G'): 'V', 50 | ('A', 'C', 'T'): 'H', 51 | ('A', 'G', 'T'): 'D', 52 | ('C', 'G', 'T'): 'B', 53 | ('A', 'C', 'G', 'T'): 'N' 54 | } 55 | 56 | 57 | 58 | 59 | 60 | def gen_variants(seq): 61 | variants = [''] 62 | 63 | for i in range(len(seq)): 64 | new_variants = [] 65 | for l in letter_codes[seq[i]]: 66 | 67 | for v in variants: 68 | new_variants.append(v + l) 69 | 70 | variants = new_variants 71 | 72 | return variants -------------------------------------------------------------------------------- /snapper/src/statistics_methods.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import ks_2samp 2 | from random import sample 3 | import numpy as np 4 | import os 5 | from multiprocessing import Process, Manager 6 | from tqdm import tqdm 7 | 8 | SAMPLESIZE = 200 9 | 10 | # just a patch 11 | MINSAMPLESIZE = 10 12 | 13 | LOG10_PVAL_TRH = 5 14 | 15 | def get_ks_test( 16 | motif_subset, 17 | ks_stat_line, 18 | motifs_line, 19 | native_motifs, 20 | wga_motifs, 21 | contig, 22 | minsamplesize, 23 | maxsamplesize): 24 | 25 | ks_test_batch = [] 26 | motif_batch = [] 27 | 28 | for MOTIF in motif_subset: 29 | if MOTIF not in wga_motifs[contig].keys(): 30 | continue 31 | 32 | try: 33 | s1 = sample(native_motifs[contig][MOTIF], maxsamplesize) 34 | except ValueError: 35 | s1 = native_motifs[contig][MOTIF] 36 | 37 | try: 38 | s2 = sample(wga_motifs[contig][MOTIF], maxsamplesize) 39 | except ValueError: 40 | s2 = wga_motifs[contig][MOTIF] 41 | 42 | if len(s1) < minsamplesize or len(s2) < minsamplesize: 43 | continue 44 | 45 | ks_test_batch.append(ks_2samp(s1,s2, mode='asymp')[1]) 46 | motif_batch.append(MOTIF) 47 | #return_data[os.getpid()] = {} 48 | ks_stat_line += ks_test_batch 49 | motifs_line += motif_batch 50 | 51 | def get_statistics( 52 | native_motifs, 53 | wga_motifs, 54 | maxsamplesize=SAMPLESIZE, 55 | minsamplesize=MINSAMPLESIZE, 56 | threads=1, 57 | ): 58 | 59 | print('Getting difsignals...') 60 | contigs = list(native_motifs.keys()) 61 | motifs_lines = {} 62 | ks_stat_lines = {} 63 | 64 | for contig in contigs: 65 | 66 | # ks_stat_line = [] 67 | # 
motifs_line = [] 68 | procs = [] 69 | KS_manager = Manager() 70 | ks_stat_line = KS_manager.list() 71 | 72 | motif_manager = Manager() 73 | motifs_line = motif_manager.list() 74 | 75 | interval_coordinates = np.linspace(0, len(native_motifs[contig].keys()), threads+1) 76 | intervals = [(interval_coordinates[idx], interval_coordinates[idx+1]) for idx,_ in list(enumerate(interval_coordinates))[: -1]] 77 | motifs = list(native_motifs[contig].keys()) 78 | 79 | # Thread filtering 80 | threads_limit = len(motifs) 81 | if threads > threads_limit: 82 | threads = threads_limit 83 | 84 | for thread_process in tqdm(intervals): 85 | 86 | motif_subset = motifs[int(thread_process[0]): int(thread_process[1])] 87 | #print(motif_subset) 88 | proc = Process(target=get_ks_test, args=(motif_subset, ks_stat_line, motifs_line, native_motifs, wga_motifs, contig, minsamplesize, maxsamplesize)) 89 | procs.append(proc) 90 | proc.start() 91 | 92 | for proc in procs: 93 | proc.join() 94 | 95 | print() 96 | ks_stat_lines[contig] = list(ks_stat_line) 97 | motifs_lines[contig] = list(motifs_line) 98 | 99 | return motifs_lines, ks_stat_lines 100 | 101 | 102 | def get_difsignals( 103 | motifs_line, 104 | ks_stats, 105 | log10_pval_thr = LOG10_PVAL_TRH, 106 | 107 | ): 108 | 109 | 110 | passed_motifs = [] 111 | 112 | for i in range(len(ks_stats)): 113 | if -np.log10(ks_stats[i]) >= log10_pval_thr: 114 | passed_motifs.append(motifs_line[i]) 115 | return passed_motifs -------------------------------------------------------------------------------- /snapper/src/type_I_RM_system.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import product 3 | from snapper.src.seq_processing import letter_codes, gen_variants 4 | from scipy.stats import ks_2samp 5 | from tqdm import tqdm 6 | from random import sample 7 | from scipy.stats import chi2_contingency 8 | from tqdm import tqdm 9 | 10 | bases = ['A', 'G', 'T', 'C', 'N'] 11 | 12 | 13 | MAXSAMPLESIZE = 200 14 | 15 | 16 | def gen_template(motif_variant, pos_variant, lenmotif): 17 | 18 | template = ['N',]*lenmotif 19 | 20 | for i, pos in enumerate(pos_variant): 21 | template[pos] = motif_variant[i] 22 | 23 | template = ''.join(template) 24 | 25 | return template 26 | 27 | 28 | def get_delta(k_size, long_k_size): 29 | return int(long_k_size/2) - int(k_size/2) 30 | 31 | 32 | 33 | 34 | 35 | #### THE PROBLEM IS HERERERERERE!!!! 
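# filter_long_kmers: given the detected short-motif record `target` (confidence,
# letters, positions within the short k-mer) and the long k-mers observed in the
# control run, keep only those long k-mers that contain one of the non-degenerate
# expansions of the target at the corresponding offset. The short-motif positions
# are shifted by delta = long_k_size//2 - k_size//2, i.e. the offset between the
# centres of the short and long windows; an empty list is returned if the shifted
# motif would run past the end of the long k-mer.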
36 | 37 | def filter_long_kmers(target, long_kmers, lenmotif, long_k_size): 38 | 39 | 40 | 41 | delta = get_delta(lenmotif, long_k_size) 42 | 43 | 44 | 45 | if target[2][-1] + delta >= long_k_size: 46 | return [] 47 | 48 | long_kmers = np.array([list(l) for l in long_kmers]) 49 | 50 | 51 | target_seq = ''.join(target[1]) 52 | 53 | target_variants = gen_variants(target_seq) 54 | 55 | 56 | 57 | 58 | filtered_long_kmers = [] 59 | 60 | for target_variant in target_variants: 61 | 62 | current_long_kmers = long_kmers.copy() 63 | 64 | for i in range(len(target[1])): 65 | try: 66 | current_long_kmers = current_long_kmers[current_long_kmers[:, target[2][i] + delta] == target_variant[i]] 67 | except IndexError: 68 | continue 69 | 70 | filtered_long_kmers += [''.join(l) for l in current_long_kmers] 71 | 72 | return filtered_long_kmers 73 | 74 | 75 | 76 | 77 | 78 | 79 | def check_for_completeness( 80 | motif, 81 | sample_motifs, 82 | control_motifs, 83 | long_sample_motifs, 84 | long_control_motifs, 85 | lenmotif, 86 | long_k_size, 87 | reference, 88 | outputdir, 89 | log_threshold=2, 90 | long_motif_confidence=100_000, 91 | ): 92 | 93 | 94 | #return motif 95 | 96 | print(f'Checking for {"".join(motif[1])} motif completeness...') 97 | 98 | _sample = [] 99 | _control = [] 100 | 101 | 102 | ancMOTIF = gen_template(motif[1], motif[2], lenmotif) 103 | 104 | 105 | for MOTIF in gen_variants(ancMOTIF): 106 | 107 | 108 | if MOTIF not in sample_motifs or MOTIF not in control_motifs: 109 | continue 110 | 111 | 112 | _sample += sample_motifs[MOTIF] 113 | _control += control_motifs[MOTIF] 114 | 115 | # THINKKKKK!!!! 116 | 117 | if len(_sample) > MAXSAMPLESIZE: 118 | _sample = sample(_sample, MAXSAMPLESIZE) 119 | 120 | if len(_control) > MAXSAMPLESIZE: 121 | _control = sample(_control, MAXSAMPLESIZE) 122 | 123 | global_ks_results = ks_2samp(_sample, _control)[1] 124 | 125 | if -np.log10(global_ks_results) > log_threshold: 126 | print(f'Motif {"".join(motif[1])} seems complete (p-val = {global_ks_results})') 127 | print() 128 | return motif 129 | 130 | 131 | print(f'Motif {"".join(motif[1])} is probably incomplete (p-val = {global_ks_results}). 
132 | 
133 |     _sample = []
134 |     _control = []
135 | 
136 |     significant_contexts = []
137 | 
138 |     filtered_long_motifs = filter_long_kmers(motif, list(long_control_motifs.keys()), lenmotif, long_k_size)
139 | 
140 |     print('Collecting extended contexts...')
141 |     for MOTIF in tqdm(filtered_long_motifs):
142 | 
143 |         # skip contexts missing from either the sample or the control data
144 |         if MOTIF not in long_sample_motifs or MOTIF not in long_control_motifs:
145 |             continue
146 |         _sample += long_sample_motifs[MOTIF]
147 |         _control += long_control_motifs[MOTIF]
148 | 
149 |         if -np.log10(ks_2samp(long_sample_motifs[MOTIF], long_control_motifs[MOTIF])[1]) > log_threshold:
150 |             significant_contexts.append(MOTIF)
151 | 
152 |     # save the significant extended contexts for downstream inspection
153 | 
154 |     os.makedirs(outputdir)
155 | 
156 |     context_cnt = 1
157 |     with open(outputdir + '/long_contexts.fasta', 'w') as f_contexts:
158 |         for context in significant_contexts:
159 |             f_contexts.write(f'>long_context_{context_cnt}\n{context}\n')
160 |             context_cnt += 1
161 | 
162 | 
163 |     delta = get_delta(lenmotif, long_k_size)
164 |     trd1_pos = motif[2][0] + delta
165 | 
166 | 
167 |     long_motif_variants = find_possible_trd2(motif, significant_contexts, reference, trd1_pos, long_k_size)
168 | 
169 |     with open(outputdir + '/long_motif_variants.tsv', 'w') as f_variants:
170 |         for variant in long_motif_variants:
171 |             confidence, possible_long_motif = variant
172 | 
173 |             f_variants.write(f'{possible_long_motif}\t{confidence}\n')
174 | 
175 | 
176 |     print(f'{long_motif_variants[0][1]} has shown the best statistics. All data have been saved to {outputdir}.')
177 |     print()
178 | 
179 |     return motif
180 | 
181 | 
182 | 
183 | 
184 | 
185 | def create_long_motif_template(long_motif, trd1_pos, confidence):
186 |     """Trim flanking 'N's and return (confidence, letters, absolute positions) for a long motif."""
187 |     motif_template = []
188 |     pos_template = []
189 |     for i in range(len(long_motif)):
190 | 
191 |         pos_template.append(i + trd1_pos)
192 |         motif_template.append(long_motif[i])
193 | 
194 | 
195 |     while motif_template[0] == 'N':
196 |         motif_template = motif_template[1:]
197 |         pos_template = pos_template[1:]
198 | 
199 |     while motif_template[-1] == 'N':
200 |         motif_template = motif_template[:-1]
201 |         pos_template = pos_template[:-1]
202 | 
203 | 
204 |     return (
205 |         confidence, tuple(motif_template), tuple(pos_template),
206 |     )
207 | 
208 | 
209 | 
210 | 
211 | def filter_trd2(trd2_variants, min_nondegenerate_letters=2, max_N_letters=2):
212 |     """Drop TRD2 candidates that start or end with 'N', are too degenerate, or too short."""
213 |     filtered_trd2_variants = []
214 | 
215 |     for v in trd2_variants:
216 | 
217 |         Ncnt = v.count('N')
218 | 
219 |         if v[0] == 'N':
220 |             continue
221 | 
222 |         if v[-1] == 'N':
223 |             continue
224 | 
225 |         if Ncnt > max_N_letters:
226 |             continue
227 | 
228 |         if len(v) - Ncnt < min_nondegenerate_letters:
229 |             continue
230 | 
231 |         filtered_trd2_variants.append(v)
232 |     return filtered_trd2_variants
233 | 
234 | 
235 | 
236 | 
237 | def generate_RM_type_I_templates(trd1, N_lens=(5,6,7,8), trd2_lens=(2,3,4,5,6)):
238 |     """Enumerate bipartite TRD1-N(5..8)-TRD2 templates typical of type I RM systems."""
239 | 
240 |     trd2_variants = []
241 | 
242 |     for trd2_len in trd2_lens:
243 |         trd2_variants += [''.join(v) for v in list(product(bases, repeat=trd2_len))]
244 | 
245 |     trd2_variants = filter_trd2(trd2_variants)
246 | 
247 | 
248 |     N_variants = ['N'*N_len for N_len in N_lens]
249 | 
250 |     while trd1[0] == 'N':
251 |         trd1 = trd1[1:]
252 | 
253 |     while trd1[-1] == 'N':
254 |         trd1 = trd1[:-1]
255 | 
256 |     templates = list(product(gen_variants(trd1), N_variants, trd2_variants))
257 | 
258 |     templates = [''.join(t) for t in templates]
259 |     return templates
260 | 
261 | def find_possible_trd2(trd1, significant_contexts, reference, trd1_pos, lenmotif, N_lens=(5,6,7,8), trd2_lens=(2,3,4,5,6)):
262 |     """Rank candidate TRD2 extensions by chi-square enrichment of matching contexts over the reference background."""
263 | 
264 | 
265 | 
266 |     trd1 = ''.join(trd1[1])
267 | 
268 |     seq_array = np.array([list(l) for l in significant_contexts])
269 |     ref_array = list(set([reference[i:i + lenmotif] for i in range(len(reference)-lenmotif)]))
270 | 
271 |     ref_array = np.array([list(l) for l in ref_array])
272 | 
273 |     N_ref = len(ref_array)
274 |     filtered_ref_array = None
275 |     for short_motif in gen_variants(trd1):
276 |         tmp_ref_array = ref_array.copy()
277 | 
278 |         for pos in range(trd1_pos, trd1_pos + len(trd1)):
279 |             # keep only reference k-mers that carry this TRD1 variant at the expected positions
280 |             tmp_ref_array = tmp_ref_array[tmp_ref_array[:,pos] == short_motif[pos-trd1_pos]]
281 |         if filtered_ref_array is None:
282 | 
283 |             filtered_ref_array = tmp_ref_array
284 |         else:
285 |             filtered_ref_array = np.concatenate((filtered_ref_array, tmp_ref_array))
286 | 
287 |     ref_array = filtered_ref_array
288 | 
289 |     N_seqset = len(seq_array)
290 | 
291 |     templates = generate_RM_type_I_templates(trd1, N_lens=N_lens, trd2_lens=trd2_lens)
292 | 
293 |     trd2_testing_results = []
294 | 
295 |     # score each bipartite template by how strongly it is enriched among the significant contexts
296 |     print('TRD2 sequence optimization...')
297 | 
298 |     for template in tqdm(templates):
299 | 
300 |         if trd1_pos + len(template) >= lenmotif:
301 |             continue
302 | 
303 |         subseq = seq_array.copy()
304 |         ref_subseq = ref_array.copy()
305 | 
306 |         for pos in range(trd1_pos, trd1_pos + len(template)):
307 |             if template[pos - trd1_pos] == 'N':
308 |                 continue
309 | 
310 |             if len(subseq) != 0:
311 |                 subseq = subseq[subseq[:, pos+1] == template[pos - trd1_pos]]
312 | 
313 |             if len(ref_subseq) != 0:
314 |                 ref_subseq = ref_subseq[ref_subseq[:, pos+1] == template[pos - trd1_pos]]
315 | 
316 |         subseq_N = len(subseq)
317 |         ref_subseq_N = len(ref_subseq)
318 | 
319 |         try:
320 |             chi_res = chi2_contingency(
321 |                 [
322 |                     [subseq_N, N_seqset-subseq_N],
323 |                     [ref_subseq_N, N_ref-ref_subseq_N],
324 |                 ]
325 |             )[0]
326 |         except ValueError:
327 |             chi_res = 0
328 | 
329 |         trd2_testing_results.append(
330 |             (
331 |                 chi_res, template
332 |             )
333 |         )
334 | 
335 | 
336 |     trd2_testing_results.sort(reverse=True)
337 | 
338 |     print('Best results:')
339 |     for t in trd2_testing_results[:10]:
340 |         print(t[0], t[1])
341 | 
342 | 
343 |     return trd2_testing_results[:20]
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
--------------------------------------------------------------------------------
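
Two statistics drive `type_I_RM_system.py`: a two-sample KS test (is the pooled signal of a motif shifted between the native sample and the control?) and a 2×2 chi-square test (is a candidate TRD1-N(x)-TRD2 template over-represented among the significant long contexts compared with all reference k-mers?). The standalone sketch below reproduces both calls on synthetic numbers; all counts and samples are made up for illustration and are not part of the package.

```
# Illustrative sketch only: synthetic inputs, not taken from the Snapper repository.
import numpy as np
from scipy.stats import chi2_contingency, ks_2samp

# Hypothetical candidate-template counts (assumed numbers):
# 120 of 300 KS-significant contexts match the template,
# but only 800 of 40_000 reference k-mers do.
subseq_N, N_seqset = 120, 300
ref_subseq_N, N_ref = 800, 40_000

chi_res = chi2_contingency(
    [
        [subseq_N, N_seqset - subseq_N],       # matches / non-matches among significant contexts
        [ref_subseq_N, N_ref - ref_subseq_N],  # matches / non-matches in the reference background
    ]
)[0]
print(f'chi-square enrichment score: {chi_res:.1f}')

# The completeness check is a two-sample KS test on per-site signal levels
# (here replaced by synthetic normal samples): a shifted sample distribution
# gives a small p-value, i.e. a large -log10(p).
rng = np.random.default_rng(0)
native = rng.normal(0.5, 1.0, 200)  # assumed stand-in for sample signal levels
wga = rng.normal(0.0, 1.0, 200)     # assumed stand-in for control signal levels
pval = ks_2samp(native, wga)[1]
print(f'-log10 KS p-value: {-np.log10(pval):.2f}')
```

Ranking by the chi-square statistic rather than its p-value keeps strongly enriched templates distinguishable even when their p-values all underflow to zero, which is consistent with how `find_possible_trd2` sorts its candidates.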