├── .gitignore ├── LICENSE ├── README.md ├── logo.png ├── setup.py └── snapper ├── __init__.py ├── snapper.py └── src ├── data_processing.py ├── methods.py ├── motif_extraction.py ├── plotting.py ├── seq_processing.py ├── statistics_methods.py └── type_I_RM_system.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | 131 | # Snapper Results 132 | Results_* 133 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 Dmitry N. 
Konanov 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Snapper: nanopore-based modification motifs caller 4 | 5 | This tool is designed to efficiently detect methylation sites using ONT sequencing data. 6 | Snapper uses balanced approach to compute statistics for each k-mer which is likely to be modified. 7 | The core feature of Snapper in comparison with other tools is a new high-sensitive greedy algorithm that is used 8 | for position-specific motif enrichment. This repository contains not the Snapper tool itself but its pip distribution. 9 | 10 | ## Dependencies 11 | - python 3.7 (later versions might be incompatible because of inner biopython dependencies) 12 | - ont-tombo 13 | - h5py 14 | - biopython 15 | - matplotlib 16 | - scipy 17 | - seaborn 18 | 19 | ## Installation 20 | 21 | ``` 22 | (base) $ conda create -n snapper python=3.7 23 | (base) $ conda activate snapper 24 | (snapper) $ conda install -c bioconda ont-fast5-api ont-tombo 25 | (snapper) $ pip install snapper-ont 26 | ``` 27 | 28 | ## Usage 29 | 30 | Firstly, fast5 files should be resquiggled using [Tombo](https://github.com/nanoporetech/tombo) software. 31 | After resquiggling, fast5 files should be converted to the multi-fast5 format using [ont_fast5_api](https://github.com/nanoporetech/ont_fast5_api). 32 | 33 | A more detailed usage guideline and few usercases are available in [Snapper's documentation](https://snapper-tutorial.readthedocs.io/en/latest/index.html) 34 | 35 | ``` 36 | usage: snapper [-h] -sample_fast5dir SAMPLE_FAST5DIR -control_fast5dir 37 | CONTROL_FAST5DIR -reference REFERENCE [-ks_t KS_T] 38 | [-outdir OUTDIR] [-coverage COVERAGE] [-threads THREADS] 39 | [-k_size K_SIZE] [-long_k_size LONG_K_SIZE] 40 | [-max_motifs MAX_MOTIFS] [-min_conf MIN_CONF] 41 | [-target_chr TARGET_CHR] 42 | 43 | optional arguments: 44 | -h, --help show this help message and exit 45 | -sample_fast5dir SAMPLE_FAST5DIR 46 | sample multi fast5 dir 47 | -control_fast5dir CONTROL_FAST5DIR 48 | control multi fast5 dir 49 | -reference REFERENCE reference genome in the fasta format 50 | -ks_t KS_T -log ks_test p-value (default 3). 
51 | -outdir OUTDIR output directory name 52 | -coverage COVERAGE minimal genome coverage depth (default 40) 53 | -threads THREADS number of threads used (default 8) 54 | -k_size K_SIZE k-mer size, must be odd, 55 | should not be less than 11 (default 15) 56 | -long_k_size LONG_K_SIZE 57 | k-mer size, must be odd, 58 | should not be less than 21 (default 29) 59 | -max_motifs MAX_MOTIFS 60 | the maximum expected number of motifs extracted 61 | -min_conf MIN_CONF the minimal confidence value (default is 100) 62 | -target_chr TARGET_CHR 63 | target chromosome name (by default all 64 | contigs/replicons are considered) 65 | 66 | 67 | ``` 68 | 69 | 70 | Typical run command: 71 | ``` 72 | snapper -sample_fast5dir ../HelicobacterMod/fast5/J99_multi/ -control_fast5dir ../HelicobacterMod/fast5/J99_wga_multi/ -reference ../HelicobacterMod/genome/J99.fasta 73 | ``` 74 | 75 | ## Output explanation 76 | 77 | The output directory contains the following files: 78 | - `passed_motifs_[strand]_[contig_name].fasta` - all k-mers that most likely bring a modified base 79 | - `final_motifs_[strand]_[contig_name].fasta` - optimal set of motifs generated from the passed motifs by the Snapper greedy algorithm 80 | - `plots_[strand]_[contig_name]` - signal distribution plots for each extracted motif 81 | 82 | ## Citation 83 | 84 | Dmitry N Konanov, Vladislav V Babenko, Aleksandra M Belova, Arina G Madan, Daria I Boldyreva, Oksana E Glushenko, Ivan O Butenko, Dmitry E Fedorov, Alexander I Manolov, Danil V Krivonos, Vassilii N Lazarev, Vadim M Govorun, Elena N Ilina, [Snapper: high-sensitive detection of methylation motifs based on Oxford Nanopore reads](https://doi.org/10.1093/bioinformatics/btad702), Bioinformatics, 2023 85 | -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DNKonanov/Snapper/29e659247091ff41e74a1cae7380356445da4020/logo.png -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import setuptools 2 | 3 | with open("README.md", "r", encoding="utf-8") as fh: 4 | long_description = fh.read() 5 | 6 | setuptools.setup( 7 | name="snapper-ont", 8 | version="0.4.5", 9 | author="D.N. 
Konanov", 10 | author_email="konanovdmitriy@gmail.com", 11 | description="Nanopore-based methylation sites caller", 12 | long_description="snapper", 13 | long_description_content_type="", 14 | url="https://github.com/DNKonanov/Snapper", 15 | project_urls={ 16 | "Bug Tracker": "https://github.com/DNKonanov/Snapper", 17 | }, 18 | classifiers=[ 19 | "Programming Language :: Python :: 3", 20 | "License :: OSI Approved :: MIT License", 21 | "Operating System :: OS Independent", 22 | ], 23 | python_requires=">=3.7", 24 | include_package_data=True, 25 | packages=['snapper', 'snapper.src'], 26 | install_requires=[ 27 | 'h5py', 28 | 'biopython', 29 | 'matplotlib', 30 | 'scipy', 31 | 'seaborn', 32 | 'tqdm' 33 | ], 34 | entry_points={ 35 | 'console_scripts': [ 36 | 'snapper=snapper.snapper:main' 37 | ] 38 | } 39 | ) 40 | -------------------------------------------------------------------------------- /snapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DNKonanov/Snapper/29e659247091ff41e74a1cae7380356445da4020/snapper/__init__.py -------------------------------------------------------------------------------- /snapper/snapper.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import os 3 | import sys 4 | 5 | import warnings 6 | warnings.filterwarnings("ignore") 7 | 8 | 9 | 10 | 11 | def main(): 12 | 13 | parser = ArgumentParser() 14 | 15 | parser.add_argument('-sample_fast5dir', type=str, help='sample multi fast5 dir', required=True) 16 | parser.add_argument('-control_fast5dir', type=str, help='control multi fast5 dir', required=True) 17 | parser.add_argument('-reference', type=str, help='reference genome in the fasta format', required=True) 18 | parser.add_argument('-ks_t', type=int, default=3, help='-log ks_test p-value (default 3).') 19 | parser.add_argument('-outdir', type=str, default='default', help='output directory name') 20 | parser.add_argument('-coverage', type=float, help='minimal genome coverage depth (default 40)', default=40) 21 | parser.add_argument('-threads', type=int, default=8, help='number of threads used (default 8)') 22 | parser.add_argument('-k_size', type=int, default=15, help='k-mer size, must be odd, should not be less than 11 (default 15)') 23 | parser.add_argument('-long_k_size', type=int, default=29, help='long k-mer size, must be odd, should not be less than 21 (default 29)') 24 | parser.add_argument('-max_motifs', help='the maximum expected number of motifs extracted (default 20)', default=20, type=int) 25 | parser.add_argument('-min_conf', help='the minimal confidence value (default is 100)', type=float, default=100) 26 | parser.add_argument('-target_chr', help='target chromosome name (by default all contigs/replicons are considered)', type=str, default='all') 27 | 28 | 29 | 30 | from snapper.src.motif_extraction import extract_motifs 31 | from snapper.src.plotting import plot_motif, plot_coverage, plot_dist 32 | from snapper.src.data_processing import get_reference, parse_data 33 | from snapper.src.statistics_methods import get_difsignals, get_statistics 34 | from snapper.src.methods import save_results, save_k_mers 35 | from snapper.src.statistics_methods import SAMPLESIZE, MINSAMPLESIZE 36 | 37 | if len(sys.argv)==1: 38 | parser.print_help(sys.stderr) 39 | sys.exit(1) 40 | 41 | 42 | 43 | 44 | args = parser.parse_args() 45 | 46 | if args.k_size%2 == 0 or args.long_k_size%2 == 0: 47 | raise ValueError('Both 
-k_size and -long_k_size must be odd numbers') 48 | 49 | if args.k_size < 11: 50 | raise ValueError('-k_size parameter should not be less than 11') 51 | 52 | if args.long_k_size < 21: 53 | raise ValueError('-long_k_size parameter should not be less than 21') 54 | 55 | if args.k_size >= args.long_k_size: 56 | raise ValueError('K_SIZE should be less than LONG_K_SIZE') 57 | 58 | 59 | 60 | 61 | if args.outdir == 'default': 62 | import datetime 63 | 64 | sp = str(datetime.datetime.now() 65 | ).replace(' ', '_').replace(':', '').replace('-', '_').split('.')[0] 66 | outdir = 'Results_' + sp 67 | 68 | else: 69 | outdir = args.outdir 70 | 71 | try: 72 | os.mkdir(outdir) 73 | except: 74 | raise FileExistsError('The specified output dir already exists!') 75 | 76 | print('\nSample data collecting...') 77 | 78 | sample_motifs, sample_reverse_motifs, sample_long_motifs, sample_long_reverse_motifs, sample_coverages, sample_rev_coverages = parse_data( 79 | args.sample_fast5dir, 80 | args.reference, 81 | target_chr=args.target_chr, 82 | required_coverage=args.coverage, 83 | MOTIF_LEN=args.k_size, 84 | LONG_MOTIF_LEN=args.long_k_size, 85 | ) 86 | 87 | 88 | 89 | print('\nControl data collecting...') 90 | control_motifs, control_reverse_motifs, control_long_motifs, control_long_reverse_motifs, control_coverages, control_rev_coverages = parse_data( 91 | args.control_fast5dir, 92 | args.reference, 93 | target_chr=args.target_chr, 94 | required_coverage=args.coverage, 95 | MOTIF_LEN=args.k_size, 96 | LONG_MOTIF_LEN=args.long_k_size, 97 | ) 98 | 99 | 100 | refs, reverse_refs = get_reference( 101 | args.reference, 102 | target_chr=args.target_chr 103 | ) 104 | 105 | for contig in refs: 106 | print(contig, len(refs[contig])) 107 | 108 | 109 | print('\nForward strand signals processing...') 110 | motifs_lines, ks_stat_lines = get_statistics( 111 | sample_motifs, 112 | control_motifs, 113 | maxsamplesize=SAMPLESIZE, 114 | minsamplesize=MINSAMPLESIZE, 115 | threads=args.threads 116 | ) 117 | 118 | 119 | print('\nReverse strand signals processing...') 120 | reverse_motifs_lines, reverse_ks_stat_lines = get_statistics( 121 | sample_reverse_motifs, 122 | control_reverse_motifs, 123 | maxsamplesize=SAMPLESIZE, 124 | minsamplesize=MINSAMPLESIZE, 125 | threads=args.threads 126 | ) 127 | 128 | 129 | 130 | 131 | # MOTIFS EXTRACTION 132 | 133 | for contig in motifs_lines: 134 | 135 | 136 | print('Processing forward motifs {}...'.format(contig)) 137 | 138 | 139 | 140 | contig_passed_motifs = get_difsignals( 141 | motifs_lines[contig], 142 | ks_stat_lines[contig], 143 | log10_pval_thr = args.ks_t, 144 | ) 145 | 146 | if len(contig_passed_motifs) < 100: 147 | print('---The number of k-mers is insufficient for the enrichment process. 
{} is skipped.---'.format(contig)) 148 | continue 149 | 150 | 151 | 152 | 153 | 154 | 155 | plotdir = outdir + '/plots_forward_{}'.format(contig) 156 | os.mkdir(plotdir) 157 | 158 | save_k_mers(contig_passed_motifs, outdir + '/passed_motifs_forward_{}.fasta'.format(contig)) 159 | motifs = extract_motifs(contig_passed_motifs, 160 | refs[contig], 161 | outdir, 162 | args.max_motifs, 163 | args.min_conf, 164 | 'forward_' + contig, 165 | 166 | sample_motifs[contig], 167 | control_motifs[contig], 168 | sample_long_motifs[contig], 169 | control_long_motifs[contig], 170 | args.k_size, 171 | args.long_k_size, 172 | args.ks_t, 173 | 174 | threads=args.threads, 175 | lenmotif=args.k_size 176 | ) 177 | 178 | 179 | for motif in motifs: 180 | plot_dist(motif, sample_motifs[contig], control_motifs[contig], plotdir, lenmotif=args.k_size) 181 | 182 | 183 | 184 | save_results(motifs, outdir + '/final_motifs_forward_{}.fasta'.format(contig)) 185 | plot_coverage(sample_coverages[contig], control_coverages[contig], contig, f'{outdir}/coverage_forward_{contig}.pdf') 186 | 187 | 188 | for contig in reverse_motifs_lines: 189 | 190 | print(contig, len(reverse_refs[contig])) 191 | print('Processing reversed motifs {}...'.format(contig)) 192 | 193 | 194 | contig_passed_motifs = get_difsignals( 195 | reverse_motifs_lines[contig], 196 | reverse_ks_stat_lines[contig], 197 | log10_pval_thr = args.ks_t, 198 | ) 199 | 200 | 201 | if len(contig_passed_motifs) < 100: 202 | print('---The number of k-mers is insufficient for the enrichment process. {}(reverse) is skipped.---'.format(contig)) 203 | continue 204 | 205 | 206 | plotdir = outdir + '/plots_reverse_{}'.format(contig) 207 | os.mkdir(plotdir) 208 | 209 | save_k_mers(contig_passed_motifs, outdir + '/passed_motifs_reverse_{}.fasta'.format(contig)) 210 | motifs = extract_motifs(contig_passed_motifs, 211 | reverse_refs[contig], 212 | outdir, 213 | args.max_motifs, 214 | args.min_conf, 215 | 'reverse_' + contig, 216 | 217 | sample_reverse_motifs[contig], 218 | control_reverse_motifs[contig], 219 | sample_long_reverse_motifs[contig], 220 | control_long_reverse_motifs[contig], 221 | args.k_size, 222 | args.long_k_size, 223 | args.ks_t, 224 | 225 | threads=args.threads, 226 | lenmotif=args.k_size 227 | ) 228 | 229 | 230 | 231 | 232 | for motif in motifs: 233 | plot_dist(motif, sample_reverse_motifs[contig], control_reverse_motifs[contig], plotdir, lenmotif=args.k_size) 234 | 235 | 236 | 237 | save_results(motifs, outdir + '/final_motifs_reverse_{}.fasta'.format(contig)) 238 | plot_coverage(sample_rev_coverages[contig], control_rev_coverages[contig], contig, f'{outdir}/coverage_reverse_{contig}.pdf') 239 | 240 | 241 | print('Done!') 242 | 243 | 244 | if __name__ == '__main__': 245 | main() -------------------------------------------------------------------------------- /snapper/src/data_processing.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Pool 2 | from unittest import result 3 | from webbrowser import get 4 | import numpy as np 5 | import h5py 6 | from itertools import product 7 | from Bio.SeqIO import parse 8 | from Bio.Seq import reverse_complement 9 | import os 10 | from tqdm import tqdm 11 | 12 | letters = ['A','G','C','T'] 13 | 14 | 15 | def get_reference(reference_file, target_chr='all'): 16 | 17 | ref_file = parse(reference_file, format='fasta') 18 | 19 | 20 | if target_chr == 'all': 21 | 22 | refs = {} 23 | reverse_refs = {} 24 | for rec in ref_file: 25 | 26 | seq = str(rec.seq) 27 | contig = 
str(rec.description).split(' ')[0] 28 | refs[contig] = seq 29 | reverse_refs[contig] = reverse_complement(seq) 30 | return refs, reverse_refs 31 | 32 | if target_chr == 'longest': 33 | length = 0 34 | 35 | refs = {} 36 | reverse_refs = {} 37 | 38 | for rec in ref_file: 39 | 40 | contig = str(rec.description).split(' ')[0] 41 | if len(rec.seq) > length: 42 | seq = str(rec.seq) 43 | 44 | refs = {contig : seq} 45 | reverse_refs = {contig : reverse_complement(seq)} 46 | length = str(rec.seq) 47 | 48 | return refs, reverse_refs 49 | 50 | else: 51 | refs = {} 52 | reverse_refs = {} 53 | for rec in ref_file: 54 | contig = str(rec.description).split(' ')[0] 55 | if contig == target_chr: 56 | seq = str(rec.seq) 57 | 58 | refs[target_chr] = seq 59 | reverse_refs[target_chr] = reverse_complement(seq) 60 | 61 | if len(refs) == 0: 62 | raise KeyError('{} contig does not exist!'.format(target_chr)) 63 | 64 | return refs, reverse_refs 65 | 66 | 67 | 68 | def get_max_replicon(refs): 69 | 70 | length = 0 71 | max_chrom = 0 72 | for chrom in refs: 73 | if len(refs[chrom]) > length: 74 | length = len(refs[chrom]) 75 | max_chrom = chrom 76 | 77 | return max_chrom 78 | 79 | 80 | 81 | def _get_shifts(MOTIF_LEN): 82 | 83 | left_shift = int(np.floor(MOTIF_LEN/2)) 84 | right_shift = int(np.ceil(MOTIF_LEN/2)) 85 | 86 | return left_shift, right_shift 87 | 88 | 89 | def parse_data(fast5dir, reference_file, target_chr='all', required_coverage=30, MOTIF_LEN=11, LONG_MOTIF_LEN=29): 90 | 91 | refs, reverse_refs = get_reference(reference_file, target_chr) 92 | 93 | coverages = { 94 | ref: np.zeros(len(refs[ref])) for ref in refs 95 | } 96 | 97 | rev_coverages = { 98 | ref: np.zeros(len(refs[ref])) for ref in reverse_refs 99 | } 100 | 101 | l, r = _get_shifts(MOTIF_LEN) 102 | 103 | long_l, long_r = _get_shifts(LONG_MOTIF_LEN) 104 | 105 | motifs = {} 106 | reverse_motifs = {} 107 | 108 | long_motifs = {} 109 | long_reverse_motifs = {} 110 | 111 | 112 | 113 | 114 | for ref in refs: 115 | motifs[ref] = {} 116 | reverse_motifs[ref] = {} 117 | 118 | long_motifs[ref] = {} 119 | long_reverse_motifs[ref] = {} 120 | 121 | 122 | files = [file for file in os.listdir(fast5dir) if '.fast5' in file] 123 | 124 | batch = 1 125 | 126 | max_chrom = get_max_replicon(refs) 127 | 128 | 129 | for f in files: 130 | print('Batch {} out of {}...'.format(batch, len(files))) 131 | 132 | batch += 1 133 | 134 | 135 | try: 136 | 137 | with h5py.File('{}/{}'.format(fast5dir, f), 'r', rdcc_nbytes=1024**3) as file: 138 | 139 | for i in tqdm(list(file.items()), leave=False, ncols=75): 140 | 141 | 142 | readname = i[0] 143 | try: 144 | trace = file['/{}/Analyses/RawGenomeCorrected_000/BaseCalled_template/Events'.format(readname)][:] 145 | 146 | except KeyError: 147 | continue 148 | 149 | chrom = file['/{}/Analyses/RawGenomeCorrected_000/BaseCalled_template/Alignment'.format(readname)].attrs['mapped_chrom'] 150 | 151 | if chrom not in motifs: 152 | continue 153 | seq = [t[4].decode() for t in trace] 154 | 155 | str_seq = ''.join(seq).upper() 156 | 157 | 158 | f = refs[chrom].find(str_seq) 159 | 160 | 161 | if f != -1: 162 | 163 | for i in range(l, len(seq)-r): 164 | context = str_seq[i-l:i+r] 165 | 166 | if context not in motifs[chrom]: 167 | motifs[chrom][context] = [] 168 | 169 | motifs[chrom][context].append(trace[i][0]) 170 | 171 | 172 | for i in range(long_l, len(seq) - long_r): 173 | long_context = str_seq[i-long_l:i+long_r] 174 | 175 | if long_context not in long_motifs[chrom]: 176 | long_motifs[chrom][long_context] = [] 177 | 178 | 
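                            # trace[i][0] is the per-base normalized signal level taken from the
                            # Tombo resquiggle Events table (its first column, norm_mean in Tombo's
                            # output); it is collected under the LONG_MOTIF_LEN-sized sequence
                            # context centred on position i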
long_motifs[chrom][long_context].append(trace[i][0]) 179 | 180 | if chrom == max_chrom: 181 | coverages[chrom][f:f+len(seq)] += 1 182 | 183 | continue 184 | 185 | 186 | 187 | 188 | f_reverse = reverse_refs[chrom].find(str_seq) 189 | if f_reverse != -1: 190 | 191 | 192 | for i in range(l, len(seq)-r): 193 | context = str_seq[i-l:i+r] 194 | 195 | if context not in reverse_motifs[chrom]: 196 | reverse_motifs[chrom][context] = [] 197 | 198 | reverse_motifs[chrom][context].append(trace[i][0]) 199 | 200 | 201 | for i in range(long_l, len(seq) - long_r): 202 | long_context = str_seq[i-long_l:i+long_r] 203 | 204 | if long_context not in long_reverse_motifs[chrom]: 205 | long_reverse_motifs[chrom][long_context] = [] 206 | 207 | long_reverse_motifs[chrom][long_context].append(trace[i][0]) 208 | 209 | 210 | 211 | if chrom == max_chrom: 212 | rev_coverages[chrom][f_reverse:f_reverse+len(seq)] += 1 213 | continue 214 | except KeyboardInterrupt: 215 | import sys 216 | sys.exit() 217 | pass 218 | except: 219 | print('Invalid batch!') 220 | continue 221 | 222 | 223 | current_forward_coverage = np.round(np.mean(coverages[max_chrom]), 2) 224 | current_reverse_coverage = np.round(np.mean(rev_coverages[max_chrom]), 2) 225 | 226 | 227 | if min(current_forward_coverage, current_reverse_coverage) > required_coverage: 228 | break 229 | 230 | print(f'Current forward coverage {current_forward_coverage}X ; reverse coverage {current_reverse_coverage}X') 231 | 232 | print(f'Final coverage depth: forward {current_forward_coverage}X ; reverse {current_reverse_coverage}X (with {required_coverage}X threshold)') 233 | return motifs, reverse_motifs, long_motifs, long_reverse_motifs, coverages, rev_coverages 234 | 235 | 236 | 237 | 238 | 239 | -------------------------------------------------------------------------------- /snapper/src/methods.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import combinations, product 3 | import re 4 | from snapper.src.seq_processing import gen_variants, letter_codes_rev, letter_anticodes 5 | from scipy.stats import chi2_contingency, mode 6 | from tqdm import tqdm 7 | 8 | 9 | from multiprocessing import Process, Manager 10 | 11 | 12 | 13 | regular_letters = ['A','G','C','T'] 14 | non_regular_letters = ['M', 'R', 'W', 'S', 'Y','K', 'V', 'H', 'D','B'] 15 | 16 | 17 | def filter_pos_variants_l3(pos_variants): 18 | 19 | filtered_pos_variants = [] 20 | 21 | for pos_variant in pos_variants: 22 | _p = sorted(pos_variant) 23 | 24 | if _p[-1] - _p[0] >= 6: 25 | continue 26 | 27 | if tuple(pos_variant) not in filtered_pos_variants: 28 | filtered_pos_variants.append(tuple(_p)) 29 | 30 | return filtered_pos_variants 31 | 32 | 33 | 34 | def filter_pos_variants(pos_variants): 35 | 36 | 37 | # custom filtering for pos_variants with length of 3 38 | if len(pos_variants[0]) == 3: 39 | return filter_pos_variants_l3(pos_variants) 40 | 41 | 42 | 43 | filtered_pos_variants = [] 44 | for pos_variant in pos_variants: 45 | 46 | _p = sorted(pos_variant) 47 | 48 | if _p[-1] - _p[0] >= 6: 49 | continue 50 | 51 | filtered_pos_variants.append(_p) 52 | 53 | _2_filtered_pos_variants = [] 54 | 55 | 56 | for pos_variant in filtered_pos_variants: 57 | 58 | #for i in range(1, len(pos_variant) - 1): 59 | # if (pos_variant[i] - pos_variant[i-1] > 1) and (pos_variant[i+1] - pos_variant[i] > 1): 60 | # continue 61 | # 62 | if tuple(pos_variant) in _2_filtered_pos_variants: 63 | continue 64 | 65 | #if pos_variant[1] - pos_variant[0] > 1 or 
pos_variant[-1] - pos_variant[-2] > 1: 66 | # continue 67 | 68 | _2_filtered_pos_variants.append(tuple(pos_variant)) 69 | 70 | 71 | return _2_filtered_pos_variants 72 | 73 | 74 | 75 | def filter_motifs(motif_variants): 76 | filtered_motifs = [] 77 | 78 | for motif in motif_variants: 79 | if 'C' not in motif and 'A' not in motif: 80 | continue 81 | filtered_motifs.append(motif) 82 | 83 | return filtered_motifs 84 | 85 | 86 | 87 | 88 | def extract_template_subset(pos_variant, motif_variant, seq_array): 89 | subseq = seq_array 90 | for i in range(len(pos_variant)): 91 | if motif_variant[i] == '.': 92 | continue 93 | 94 | subseq = subseq[subseq[:,pos_variant[i]] == motif_variant[i]] 95 | 96 | return subseq 97 | 98 | 99 | def extract_template_count(pos_variant, motif_variant, seq_array): 100 | 101 | subseq = seq_array 102 | for i in range(len(pos_variant)): 103 | subseq = subseq[subseq[:,pos_variant[i]] == motif_variant[i]] 104 | 105 | return len(subseq) 106 | 107 | 108 | def gen_regexp_template(motif_variant, pos_variant, length=6): 109 | 110 | template = ['.',]*length 111 | 112 | base_pos = pos_variant[0] 113 | for i, pos in enumerate(pos_variant): 114 | template[pos-base_pos] = motif_variant[i] 115 | 116 | return ''.join(template) 117 | 118 | def normalized_variation(array): 119 | return np.std(array)/np.mean(array) 120 | 121 | 122 | def local_filter_seqs(seqs, pos_variant, motif_variant): 123 | 124 | new_seqs = [] 125 | template = ''.join(motif_variant) 126 | template = template.replace('.', 'N') 127 | 128 | template_subvariants = gen_variants(template) 129 | 130 | for s in seqs: 131 | 132 | str_vec = ''.join([s[i] for i in pos_variant]) 133 | if str_vec in template_subvariants: 134 | continue 135 | 136 | new_seqs.append(s) 137 | 138 | return new_seqs 139 | 140 | 141 | def modify_seq(seq, pos, target_letter): 142 | 143 | newseq = list(seq) 144 | newseq[pos] = target_letter 145 | 146 | return ''.join(newseq) 147 | 148 | 149 | def generate_reference_freqs_parallel(seq_array, batch, dict_per_length): 150 | 151 | for pos_variant, motif_variant in batch: 152 | variant_count = extract_template_count(pos_variant, motif_variant, seq_array) 153 | dict_per_length[(motif_variant, pos_variant)] = variant_count 154 | 155 | 156 | def generate_reference_freqs(reference, length, threads, lengths=(4,5,6)): 157 | 158 | variants_counter = {} 159 | 160 | seqs = list(set([ 161 | reference[i:i+length] for i in range(len(reference) - length) 162 | ])) 163 | 164 | seq_array = np.array([list(s) for s in seqs]) 165 | 166 | print(len(seq_array)) 167 | for LENGTH in lengths: 168 | 169 | print('Reference indexing with length of {}...'.format(LENGTH)) 170 | 171 | manager = Manager() 172 | dict_per_length = manager.dict() 173 | 174 | pos_variants = list(combinations(range(0,length), r=LENGTH)) 175 | pos_variants = filter_pos_variants(pos_variants) 176 | 177 | motif_variants = list(product(regular_letters, repeat=LENGTH)) 178 | motif_variants = filter_motifs(motif_variants) 179 | 180 | batch_len = len(pos_variants)*len(motif_variants)//threads 181 | 182 | processes = [] #all processes 183 | for i in range(threads+1): 184 | try: 185 | batch = list(product(pos_variants, motif_variants))[(i)*batch_len:(i+1)*batch_len] 186 | except IndexError: 187 | batch = list(product(pos_variants, motif_variants))[(i)*batch_len:] 188 | p = Process(target=generate_reference_freqs_parallel, 189 | args = (seq_array, batch, dict_per_length,)) 190 | 191 | processes.append(p) 192 | p.start() 193 | 194 | #join processes 195 | [p.join() for p in 
processes] 196 | 197 | variants_counter[LENGTH] = dict(dict_per_length) 198 | 199 | return variants_counter, len(seq_array) 200 | 201 | 202 | 203 | 204 | def add_N(motif): 205 | 206 | if motif[0] != 'N': 207 | motif = 'N' + motif 208 | 209 | if motif[-1] != 'N': 210 | motif += 'N' 211 | 212 | return motif 213 | 214 | 215 | def is_superset(motif1, motif2, edgelength=2): 216 | 217 | motif1 = add_N(motif1) 218 | motif2 = add_N(motif2) 219 | 220 | if len(motif2) <= len(motif1): 221 | extended_motif1 = motif1 222 | 223 | else: 224 | extended_motif1 = 'N' * edgelength + motif1 + 'N' * edgelength 225 | 226 | motif1_variants = gen_variants(extended_motif1) 227 | motif2_variatns = gen_variants(motif2) 228 | 229 | global_match = True 230 | for variant2 in motif2_variatns: 231 | match = False 232 | for variant1 in motif1_variants: 233 | if variant2 in variant1: 234 | match = True 235 | break 236 | if match == False: 237 | global_match = False 238 | break 239 | 240 | 241 | 242 | return global_match 243 | 244 | 245 | def get_alternate_variants(motif_variant, lenmotif=11, range_of_filtering=5): 246 | 247 | seq_variant, pos_variant = motif_variant[1], motif_variant[2] 248 | 249 | while seq_variant[0] == 'N': 250 | seq_variant = seq_variant[1:] 251 | pos_variant = pos_variant[1:] 252 | 253 | while seq_variant[-1] == 'N': 254 | seq_variant = seq_variant[:-1] 255 | pos_variant = pos_variant[:-1] 256 | 257 | 258 | alternate_variants = [] 259 | 260 | 261 | for i in range( 262 | max(0, pos_variant[0] - range_of_filtering), 263 | min(lenmotif, pos_variant[-1] + range_of_filtering) 264 | ): 265 | 266 | shift = i - pos_variant[0] 267 | 268 | pos_alternate = tuple(j+shift for j in pos_variant) 269 | if pos_alternate[-1] >= lenmotif: 270 | break 271 | 272 | alternate_variants.append((motif_variant[0], seq_variant, pos_alternate)) 273 | 274 | 275 | return alternate_variants 276 | 277 | 278 | 279 | 280 | 281 | def is_subset(motif1, motif2, edgelength=2): 282 | return is_superset(motif2, motif1, edgelength=edgelength) 283 | 284 | 285 | def variant_counts_parallel(seq_array, ref_motifs_counter, N_REF, batch, LENGTH, total_variants_counter_list): 286 | variants_counter_list = [] 287 | N_VARIANT = len(seq_array) 288 | for pos_variant, motif_variant in batch: 289 | try: 290 | reference_count = ref_motifs_counter[LENGTH][(motif_variant, pos_variant)] 291 | 292 | except KeyError: 293 | variants_counter_list.append((0, motif_variant, pos_variant)) 294 | 295 | else: 296 | variant_count = extract_template_count(pos_variant, motif_variant, seq_array) 297 | 298 | 299 | if variant_count == 0 and reference_count == 0: 300 | variants_counter_list.append((0, motif_variant, pos_variant)) 301 | 302 | else: 303 | chi2_result = chi2_contingency( 304 | [ 305 | [variant_count, N_VARIANT-variant_count], 306 | [reference_count, N_REF-reference_count], 307 | ] 308 | ) 309 | 310 | # chi2_log_pval = -np.log10(chi2_result[1]) 311 | chi2_statistic = chi2_result[0] 312 | 313 | variants_counter_list.append((chi2_statistic, motif_variant, pos_variant)) 314 | 315 | total_variants_counter_list+=variants_counter_list 316 | 317 | 318 | def collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads, lengths=(4,5,6), lenmotif=11): 319 | merged_variants_counter_list = [] 320 | 321 | for LENGTH in lengths: 322 | 323 | print('\tOBSERVING ANCHOR MOTIFS WITH LENGTH OF', LENGTH) 324 | 325 | pos_variants = list(combinations(range(0,lenmotif), r=LENGTH)) 326 | pos_variants = filter_pos_variants(pos_variants) 327 | 328 | motif_variants = 
list(product(regular_letters, repeat=LENGTH)) 329 | motif_variants = filter_motifs(motif_variants) 330 | 331 | #create batch 332 | batch_len = len(pos_variants)*len(motif_variants)//threads 333 | total_variants_counter_list = Manager().list() #for all outputs 334 | 335 | processes = [] #all processes 336 | args_list = list(product(pos_variants, motif_variants)) 337 | 338 | 339 | for i in range(threads+1): 340 | try: 341 | batch = args_list[i*batch_len:(i+1)*batch_len] 342 | except IndexError: 343 | batch = args_list[i*batch_len:] 344 | 345 | p = Process(target=variant_counts_parallel, 346 | args = (seq_array, ref_motifs_counter, N_REF, batch, LENGTH, total_variants_counter_list)) 347 | 348 | processes.append(p) 349 | p.start() 350 | 351 | [p.join() for p in processes] 352 | 353 | merged_variants_counter_list+=list(total_variants_counter_list) # add to 354 | 355 | merged_variants_counter_list.sort(reverse=True) 356 | 357 | 358 | return merged_variants_counter_list 359 | 360 | 361 | def get_significant_letters(sub_seq_array, top_variant, pos, reference, threshold_ratio): 362 | 363 | print('\tLocal motif adjustment...') 364 | 365 | reference_letter_freqs = {'A':0, 'G':0, 'T':0, 'C':0} 366 | variant_subset_letter_freqs = {'A':0, 'G':0, 'T':0, 'C':0} 367 | ref_vs_variant_ratios = {'A':0, 'G':0, 'T':0, 'C':0} 368 | 369 | variant_length = (top_variant[2][-1] - top_variant[2][0] + 1) 370 | re_variant = gen_regexp_template(top_variant[1], top_variant[2], length=variant_length) 371 | 372 | pos_letters = sub_seq_array[:,pos] 373 | 374 | for letter in reference_letter_freqs: 375 | re_variant_mod = modify_seq(re_variant, pos-top_variant[2][0], letter) 376 | ref_letter_count = len(re.findall(re_variant_mod, reference)) 377 | 378 | variant_subset_letter_count = len(pos_letters[pos_letters == letter]) 379 | 380 | reference_letter_freqs[letter] += ref_letter_count 381 | variant_subset_letter_freqs[letter] += variant_subset_letter_count 382 | 383 | 384 | list_variant_letter_freqs = [ 385 | (variant_subset_letter_freqs[k], k) for k in variant_subset_letter_freqs 386 | ] 387 | 388 | list_variant_letter_freqs.sort(reverse=True) 389 | 390 | # consider the first letter to be presented apriori 391 | the_first_letter = list_variant_letter_freqs[0][1] 392 | 393 | ref_vs_variant_ratios[the_first_letter] = 1 394 | 395 | significant_letters = set([the_first_letter]) 396 | 397 | for record in list_variant_letter_freqs[1:]: 398 | 399 | try: 400 | ref_letter_ratio = reference_letter_freqs[the_first_letter]/reference_letter_freqs[record[1]] 401 | except ZeroDivisionError: 402 | ref_letter_ratio = np.inf 403 | 404 | try: 405 | variant_subset_letter_ratio = variant_subset_letter_freqs[the_first_letter]/variant_subset_letter_freqs[record[1]] 406 | except ZeroDivisionError: 407 | variant_subset_letter_ratio = np.inf 408 | 409 | ref_vs_variant_ratio = variant_subset_letter_ratio/ref_letter_ratio 410 | 411 | ref_vs_variant_ratios[record[1]] = round(ref_vs_variant_ratio, 4) 412 | 413 | if ref_vs_variant_ratio > threshold_ratio: 414 | break 415 | 416 | significant_letters.add(record[1]) 417 | 418 | 419 | return tuple(sorted(list(significant_letters))) 420 | 421 | 422 | def adjust_letter(seq_array, top_variant, pos, reference, threshold_ratio=5): 423 | 424 | sub_seq_array = extract_template_subset(top_variant[2], top_variant[1], seq_array) 425 | 426 | pos_letters = get_significant_letters(sub_seq_array, top_variant, pos, reference, threshold_ratio=threshold_ratio) 427 | 428 | return letter_codes_rev[pos_letters] 429 | 430 | 431 | 
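# change_subset_motif rewrites a motif record (confidence, letters, positions) so that
# it carries the letters of a supermotif it matches as a subset: the most frequent
# alignment offset between the expanded (non-degenerate) variants of the two motifs is
# used to place the supermotif inside the k-mer window, the positions are clipped to
# the hard-coded 11-mer window (indices 0..10), and the submotif's confidence is kept.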
def change_subset_motif(supermotif, submotif, edgelength=2): 432 | 433 | extended_supermotif = 'N'*edgelength + ''.join(supermotif[1]) + 'N'*edgelength 434 | 435 | super_variants = gen_variants(extended_supermotif) 436 | sub_variants = gen_variants(''.join(submotif[1])) 437 | 438 | shifts = [] 439 | for subvariant in sub_variants: 440 | for supervariant in super_variants: 441 | if subvariant in supervariant: 442 | shift = edgelength - supervariant.find(subvariant) 443 | shifts.append(shift) 444 | 445 | shift = mode(shifts).mode[0] 446 | 447 | left_pos = max(0, submotif[2][0] + shift) 448 | right_pos = min(11, submotif[2][0] + shift + len(supermotif[2])) 449 | 450 | 451 | # check left edge case 452 | if shift < 0: 453 | adjusted_subvariant = ( 454 | submotif[0], 455 | supermotif[1][-shift:], 456 | tuple(range(submotif[2][0], submotif[2][0] + len(supermotif[1][-shift:]))) 457 | ) 458 | 459 | 460 | # check rigth edge case 461 | elif submotif[1][-1] in regular_letters and submotif[2][-1] == 10 and supermotif[1][-1] == 'N': 462 | adjusted_subvariant = ( 463 | submotif[0], 464 | supermotif[1][:-1], 465 | tuple(range(left_pos, 11)) 466 | ) 467 | 468 | # common case 469 | else: 470 | adjusted_subvariant = ( 471 | submotif[0], 472 | supermotif[1], 473 | tuple(range(left_pos, right_pos)) 474 | ) 475 | 476 | 477 | # just a patch, must be formalized!! 478 | if len(adjusted_subvariant[1]) != len(adjusted_subvariant[2]): 479 | adjusted_subvariant = [ 480 | submotif[0], 481 | supermotif[1], 482 | tuple(range(left_pos, left_pos + len(supermotif[1]))) 483 | ] 484 | while adjusted_subvariant[2][-1] > 10: 485 | adjusted_subvariant[1] = adjusted_subvariant[1][:-1] 486 | adjusted_subvariant[2] = adjusted_subvariant[2][:-1] 487 | 488 | return tuple(adjusted_subvariant) 489 | 490 | 491 | 492 | def extend_template(top_variant, maxlength=11): 493 | 494 | extended_top_variant = [ top_variant[0], list(top_variant[1]), list(top_variant[2])] 495 | 496 | if top_variant[2][0] != 0: 497 | extended_top_variant[2] = [extended_top_variant[2][0] - 1] + extended_top_variant[2] 498 | extended_top_variant[1] = ['.'] + extended_top_variant[1] 499 | 500 | if top_variant[2][-1] != maxlength-1: 501 | extended_top_variant[2] = extended_top_variant[2] + [extended_top_variant[2][-1] + 1] 502 | extended_top_variant[1] = extended_top_variant[1] + ['.'] 503 | 504 | 505 | variant_length = (extended_top_variant[2][-1] - extended_top_variant[2][0] + 1) 506 | re_variant = gen_regexp_template(extended_top_variant[1], extended_top_variant[2], length=variant_length) 507 | 508 | extended_top_variant = ( 509 | top_variant[0], 510 | tuple(re_variant), 511 | list(range(extended_top_variant[2][0], extended_top_variant[2][-1] + 1)) 512 | ) 513 | 514 | return extended_top_variant 515 | 516 | 517 | 518 | def save_results (motifs, out_fasta): 519 | 520 | with open(out_fasta, 'w') as f: 521 | 522 | cnt = 1 523 | 524 | for m in motifs: 525 | f.write('>MOTIF_{} conflevel={}\n{}\n'.format(cnt, m[0], ''.join(m[1]))) 526 | cnt += 1 527 | 528 | 529 | def save_k_mers (motifs, out_fasta): 530 | with open(out_fasta, 'w') as f: 531 | 532 | cnt = 1 533 | 534 | for m in motifs: 535 | f.write('>MOTIF_{}\n{}\n'.format(cnt, m)) 536 | cnt += 1 537 | -------------------------------------------------------------------------------- /snapper/src/motif_extraction.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | from Bio.SeqIO import parse 4 | from pickle import dump, load 5 | from 
snapper.src.methods import collect_variant_counts, is_superset, is_subset, local_filter_seqs, adjust_letter, extend_template, generate_reference_freqs, change_subset_motif 6 | from snapper.src.methods import get_alternate_variants 7 | from snapper.src.type_I_RM_system import check_for_completeness 8 | 9 | 10 | def extract_motifs( 11 | seqs, 12 | reference, 13 | savepath, 14 | max_motifs, 15 | min_conf, 16 | contig_name, 17 | 18 | 19 | sample_motifs, 20 | control_motifs, 21 | sample_long_motifs, 22 | control_long_motifs, 23 | k_size, 24 | long_k_size, 25 | ks_t, 26 | 27 | threads=10, 28 | lenmotif=11, 29 | 30 | ): 31 | 32 | 33 | print() 34 | print('Motif enrichment') 35 | print() 36 | 37 | 38 | N_REF = len(set( 39 | [reference[i:i+lenmotif] for i in range(len(reference) - lenmotif)] 40 | )) 41 | 42 | lengths = [3,4,5,6] 43 | 44 | print('Reference indexing...') 45 | ref_motifs_counter, N_REF = generate_reference_freqs(reference, lenmotif, threads, lengths=lengths) 46 | 47 | 48 | 49 | 50 | 51 | ITERATION = 1 52 | 53 | 54 | new_seqs = seqs.copy() 55 | try: 56 | os.mkdir(savepath + '/seq_iter') 57 | os.mkdir(savepath + '/motif_refine') 58 | 59 | except: 60 | pass 61 | 62 | 63 | os.mkdir(savepath + '/seq_iter/{}/'.format(contig_name)) 64 | 65 | with open(savepath + '/seq_iter/{}/seqs_iter_{}.fasta'.format(contig_name, ITERATION), 'w') as fseqiter: 66 | 67 | for seq in new_seqs: 68 | fseqiter.write('>') 69 | fseqiter.write(seq) 70 | fseqiter.write('\n') 71 | fseqiter.write(seq) 72 | fseqiter.write('\n') 73 | 74 | seq_array = np.array([list(s) for s in new_seqs]) 75 | 76 | initial_seq_array = seq_array.copy() 77 | 78 | 79 | MOTIFS_SET = [] 80 | DETAILED_MOTIF_SET = [] 81 | 82 | print(f'ITERATION 1 ({len(seq_array)} unexplained {lenmotif}-mers):') 83 | 84 | 85 | 86 | 87 | variants_counter_list = collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads=threads, lengths=lengths, lenmotif=lenmotif) 88 | 89 | 90 | ITERATION = 2 91 | while variants_counter_list[0][0] > min_conf and len(seq_array) > 0: 92 | 93 | for v in variants_counter_list[:15]: 94 | print('\t', v) 95 | 96 | top_variant = variants_counter_list[0] 97 | 98 | extended_top_variant = extend_template(top_variant, maxlength=lenmotif) 99 | 100 | positions_to_adjust = [] 101 | 102 | for i, pos in enumerate(extended_top_variant[2]): 103 | if extended_top_variant[1][i] == '.': 104 | positions_to_adjust.append((pos, i)) 105 | 106 | modifiable_extended_top_variant = [ 107 | extended_top_variant[0], 108 | list(extended_top_variant[1]), 109 | list(extended_top_variant[2]) 110 | ] 111 | 112 | 113 | for pos in positions_to_adjust: 114 | 115 | adjusted_pos_letter = adjust_letter(initial_seq_array, extended_top_variant, pos[0], reference) 116 | modifiable_extended_top_variant[1][pos[1]] = adjusted_pos_letter 117 | 118 | extended_top_variant = ( 119 | extended_top_variant[0], 120 | tuple(modifiable_extended_top_variant[1]), 121 | tuple(modifiable_extended_top_variant[2]), 122 | ) 123 | 124 | print(extended_top_variant) 125 | 126 | is_superset_check = False 127 | is_subset_check = False 128 | 129 | for i, motif in enumerate(MOTIFS_SET): 130 | is_superset_check = is_superset(motif, ''.join(extended_top_variant[1])) 131 | is_subset_check = is_subset(motif, ''.join(extended_top_variant[1])) 132 | 133 | if is_subset_check: 134 | break 135 | 136 | if is_superset_check: 137 | break 138 | 139 | refine_outdir = f'{savepath}/motif_refine/{contig_name}/{"".join(extended_top_variant[1])}' 140 | 141 | complete_motif = check_for_completeness( 142 | 
extended_top_variant, 143 | sample_motifs, 144 | control_motifs, 145 | sample_long_motifs, 146 | control_long_motifs, 147 | k_size, 148 | long_k_size, 149 | reference, 150 | outputdir=refine_outdir, 151 | log_threshold=ks_t 152 | ) 153 | 154 | 155 | alternate_variants = get_alternate_variants(extended_top_variant, lenmotif=lenmotif) 156 | 157 | print('Filtering seq_set...') 158 | 159 | n_seqs = len(new_seqs) 160 | 161 | for variant in alternate_variants: 162 | if variant[0] > min_conf: 163 | 164 | new_seqs = local_filter_seqs(new_seqs, variant[2], variant[1]) 165 | 166 | 167 | # filter seq_set by top_variant to prevent infinite loop 168 | if len(new_seqs) == n_seqs: 169 | alternate_variants = get_alternate_variants(top_variant) 170 | for variant in alternate_variants: 171 | if variant[0] > min_conf: 172 | 173 | new_seqs = local_filter_seqs(new_seqs, variant[2], variant[1]) 174 | 175 | 176 | MOTIFS_SET.append(''.join(extended_top_variant[1])) 177 | DETAILED_MOTIF_SET.append(extended_top_variant) 178 | 179 | 180 | 181 | print(MOTIFS_SET) 182 | 183 | with open(savepath + '/seq_iter/{}/seqs_iter_{}.fasta'.format(contig_name, ITERATION), 'w') as fseqiter: 184 | 185 | for seq in new_seqs: 186 | fseqiter.write('>') 187 | fseqiter.write(seq) 188 | fseqiter.write('\n') 189 | fseqiter.write(seq) 190 | fseqiter.write('\n') 191 | 192 | 193 | if len(MOTIFS_SET) == max_motifs: 194 | break 195 | 196 | seq_array = np.array([list(s) for s in new_seqs]) 197 | 198 | print(f'ITERATION {ITERATION} ({len(seq_array)} unexplained {lenmotif}-mers):') 199 | ITERATION += 1 200 | 201 | variants_counter_list = collect_variant_counts(seq_array, ref_motifs_counter, N_REF, threads=threads, lengths=lengths) 202 | 203 | 204 | return DETAILED_MOTIF_SET -------------------------------------------------------------------------------- /snapper/src/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from snapper.src.seq_processing import letter_codes, gen_variants 3 | import warnings 4 | warnings.filterwarnings("ignore") 5 | import seaborn as sns 6 | import numpy as np 7 | 8 | from random import sample 9 | 10 | 11 | def cohend(d1, d2): 12 | n1, n2 = len(d1), len(d2) 13 | s1, s2 = np.var(d1, ddof=1), np.var(d2, ddof=1) 14 | s = np.sqrt(((n1 - 1) * s1 + (n2 - 1) * s2) / (n1 + n2 - 2)) 15 | u1, u2 = np.mean(d1), np.mean(d2) 16 | return (u1 - u2) / s 17 | 18 | 19 | regular_letters = ['A','G','C','T'] 20 | 21 | 22 | 23 | def gen_template(motif_variant, pos_variant, lenmotif): 24 | 25 | template = ['N',]*lenmotif 26 | 27 | for i, pos in enumerate(pos_variant): 28 | template[pos] = motif_variant[i] 29 | 30 | return ''.join(template) 31 | 32 | 33 | 34 | def get_anc_variants(ancMOTIF, lenmotif=15): 35 | 36 | ancMOTIF_c = ancMOTIF 37 | while ancMOTIF_c[0] == 'N': 38 | ancMOTIF_c = ancMOTIF_c[1:] 39 | 40 | while ancMOTIF_c[-1] == 'N': 41 | ancMOTIF_c = ancMOTIF_c[:-1] 42 | 43 | ext_length = lenmotif - len(ancMOTIF_c) 44 | anc_variants = [] 45 | print(ancMOTIF_c) 46 | for i in range(ext_length + 1): 47 | anc_variants.append('N'*i + ancMOTIF_c + 'N'*(ext_length - i)) 48 | 49 | return anc_variants 50 | 51 | def plot_dist(motif, native_motifs, wga_motifs, savepath, lenmotif=15, MAXSAMPLESIZE = 2000): 52 | 53 | print(f'Rendering {"".join(motif[1])}...') 54 | 55 | ancMOTIF_init = gen_template(motif[1], motif[2], lenmotif) 56 | 57 | anc_variants = get_anc_variants(ancMOTIF_init, lenmotif=lenmotif) 58 | 59 | N = len(anc_variants) 60 | fig, axs = plt.subplots(N, 2, 
figsize=(14, 4*N)) 61 | 62 | motif_cnt = 0 63 | for ancMOTIF in anc_variants: 64 | 65 | _wga = [] 66 | _native = [] 67 | 68 | lens = [] 69 | effect_size_dist = [] 70 | 71 | cnt = 0 72 | for MOTIF in gen_variants(ancMOTIF): 73 | 74 | 75 | if MOTIF not in wga_motifs or MOTIF not in native_motifs: 76 | continue 77 | 78 | _wga += wga_motifs[MOTIF] 79 | _native += native_motifs[MOTIF] 80 | 81 | effect_size_dist.append(np.abs(cohend(native_motifs[MOTIF], wga_motifs[MOTIF]))) 82 | 83 | #print(len(_wga), len(_native)) 84 | 85 | if len(effect_size_dist) == 0: 86 | continue 87 | if len(_native) > MAXSAMPLESIZE: 88 | _native = sample(_native, MAXSAMPLESIZE) 89 | 90 | if len(_wga) > MAXSAMPLESIZE: 91 | _wga = sample(_wga, MAXSAMPLESIZE) 92 | 93 | 94 | sns.distplot(x = _wga, hist = False, label='WGA', color='red', ax=axs[motif_cnt][0]) 95 | sns.distplot(x = _native, hist = False, label='native', color='green', ax=axs[motif_cnt][0]) 96 | axs[motif_cnt][0].grid() 97 | 98 | 99 | axs[motif_cnt][0].set_title('{}, confidence = {}\nmed effsize = {}'.format(ancMOTIF, motif[0], np.median(effect_size_dist))) 100 | axs[motif_cnt][0].set_xlabel('Normalized signal') 101 | 102 | axs[motif_cnt][1].hist(effect_size_dist, bins=50, density=True, rwidth=0.8, color='black') 103 | axs[motif_cnt][1].set_xlabel('eff size') 104 | 105 | title = np.round([ 106 | np.percentile(effect_size_dist, 10), 107 | np.percentile(effect_size_dist, 25), 108 | np.percentile(effect_size_dist, 50), 109 | np.percentile(effect_size_dist, 75), 110 | np.percentile(effect_size_dist, 90) 111 | ], 2) 112 | 113 | title = list(map(str, title)) 114 | axs[motif_cnt][1].set_title( 115 | f'{" ".join(title)}' 116 | ) 117 | 118 | axs[motif_cnt][0].legend() 119 | 120 | motif_cnt += 1 121 | 122 | plt.tight_layout() 123 | 124 | plt.savefig(savepath + '/{}.png'.format("".join(motif[1])), format='png', dpi=400) 125 | plt.show() 126 | 127 | 128 | 129 | 130 | def plot_motif(motif, sample_motifs, control_motifs, savepath, lenmotif=11): 131 | 132 | _sample = [] 133 | _control = [] 134 | 135 | 136 | ancMOTIF = gen_template(motif[1], motif[2], lenmotif) 137 | 138 | 139 | effect_size_dist = [] 140 | 141 | for MOTIF in gen_variants(ancMOTIF): 142 | 143 | 144 | if MOTIF not in sample_motifs or MOTIF not in control_motifs: 145 | continue 146 | 147 | 148 | 149 | _sample += sample_motifs[MOTIF] 150 | _control += control_motifs[MOTIF] 151 | 152 | effect_size_dist.append(np.abs(cohend(sample_motifs[MOTIF], control_motifs[MOTIF]))) 153 | 154 | 155 | 156 | plt.figure(figsize=(8,5)) 157 | 158 | plt.grid() 159 | 160 | sns.distplot(x = _control, hist=False, label='Control', color='red') 161 | sns.distplot(x = _sample, hist=False, label='Sample', color='green') 162 | #plt.savefig('tnp/check3.png', dpi=400) 163 | 164 | plt.title('{}, confidence = {}\nmed effsize = {}'.format(ancMOTIF, motif[0], np.median(effect_size_dist))) 165 | 166 | plt.xlim(-5,5) 167 | plt.xlabel('Normalized signal') 168 | plt.legend() 169 | 170 | plt.tight_layout() 171 | plt.savefig(savepath + '/{}.png'.format(ancMOTIF), format='png', dpi=400) 172 | 173 | plt.close() 174 | 175 | 176 | 177 | def plot_coverage(sample_coverage, control_coverage, chrom, output): 178 | 179 | plt.figure(figsize=(25,7)) 180 | 181 | plt.xlabel('chrom position') 182 | plt.ylabel('depth') 183 | 184 | sample_mean_cov = np.round(np.mean(sample_coverage), 0) 185 | control_mean_cov = np.round(np.mean(control_coverage), 0) 186 | 187 | 188 | plt.title(f'{chrom}\nNative mean cov = {sample_mean_cov}X\nControl mean cov = {control_mean_cov}X') 189 
| 190 | plt.plot(sample_coverage, label='Native') 191 | plt.plot(control_coverage, label='Control') 192 | 193 | plt.legend() 194 | 195 | plt.tight_layout() 196 | 197 | plt.savefig(output, format='pdf') -------------------------------------------------------------------------------- /snapper/src/seq_processing.py: -------------------------------------------------------------------------------- 1 | letter_codes = { 2 | 'A': ['A'], 3 | 'C': ['C'], 4 | 'G': ['G'], 5 | 'T': ['T'], 6 | 'M': ['A','C'], 7 | 'R': ['A','G'], 8 | 'W': ['A','T'], 9 | 'S': ['C','G'], 10 | 'Y': ['C','T'], 11 | 'K': ['G','T'], 12 | 'V': ['A','C','G'], 13 | 'H': ['A','C','T'], 14 | 'D': ['A','G','T'], 15 | 'B': ['C','G','T'], 16 | 'N': ['A','C','G','T'] 17 | } 18 | 19 | 20 | letter_anticodes = { 21 | 'A': set(['C', 'G', 'T']), 22 | 'C': set(['A', 'G', 'T']), 23 | 'G': set(['A', 'C', 'T']), 24 | 'T': set(['A', 'C', 'G']), 25 | 'M': set(['G','T']), 26 | 'R': set(['C','T']), 27 | 'W': set(['C','G']), 28 | 'S': set(['A','T']), 29 | 'Y': set(['C','T']), 30 | 'K': set(['A','C']), 31 | 'V': set(['T']), 32 | 'H': set(['G']), 33 | 'D': set(['C']), 34 | 'B': set(['A']), 35 | 'N': set([]) 36 | } 37 | 38 | letter_codes_rev = { 39 | ('A',): 'A', 40 | ('C',): 'C', 41 | ('G',): 'G', 42 | ('T',): 'T', 43 | ('A', 'C'): 'M', 44 | ('A', 'G'): 'R', 45 | ('A', 'T'): 'W', 46 | ('C', 'G'): 'S', 47 | ('C', 'T'): 'Y', 48 | ('G', 'T'): 'K', 49 | ('A', 'C', 'G'): 'V', 50 | ('A', 'C', 'T'): 'H', 51 | ('A', 'G', 'T'): 'D', 52 | ('C', 'G', 'T'): 'B', 53 | ('A', 'C', 'G', 'T'): 'N' 54 | } 55 | 56 | 57 | 58 | 59 | 60 | def gen_variants(seq): 61 | variants = [''] 62 | 63 | for i in range(len(seq)): 64 | new_variants = [] 65 | for l in letter_codes[seq[i]]: 66 | 67 | for v in variants: 68 | new_variants.append(v + l) 69 | 70 | variants = new_variants 71 | 72 | return variants -------------------------------------------------------------------------------- /snapper/src/statistics_methods.py: -------------------------------------------------------------------------------- 1 | from scipy.stats import ks_2samp 2 | from random import sample 3 | import numpy as np 4 | import os 5 | from multiprocessing import Process, Manager 6 | from tqdm import tqdm 7 | 8 | SAMPLESIZE = 200 9 | 10 | # just a patch 11 | MINSAMPLESIZE = 10 12 | 13 | LOG10_PVAL_TRH = 5 14 | 15 | def get_ks_test( 16 | motif_subset, 17 | ks_stat_line, 18 | motifs_line, 19 | native_motifs, 20 | wga_motifs, 21 | contig, 22 | minsamplesize, 23 | maxsamplesize): 24 | 25 | ks_test_batch = [] 26 | motif_batch = [] 27 | 28 | for MOTIF in motif_subset: 29 | if MOTIF not in wga_motifs[contig].keys(): 30 | continue 31 | 32 | try: 33 | s1 = sample(native_motifs[contig][MOTIF], maxsamplesize) 34 | except ValueError: 35 | s1 = native_motifs[contig][MOTIF] 36 | 37 | try: 38 | s2 = sample(wga_motifs[contig][MOTIF], maxsamplesize) 39 | except ValueError: 40 | s2 = wga_motifs[contig][MOTIF] 41 | 42 | if len(s1) < minsamplesize or len(s2) < minsamplesize: 43 | continue 44 | 45 | ks_test_batch.append(ks_2samp(s1,s2, mode='asymp')[1]) 46 | motif_batch.append(MOTIF) 47 | #return_data[os.getpid()] = {} 48 | ks_stat_line += ks_test_batch 49 | motifs_line += motif_batch 50 | 51 | def get_statistics( 52 | native_motifs, 53 | wga_motifs, 54 | maxsamplesize=SAMPLESIZE, 55 | minsamplesize=MINSAMPLESIZE, 56 | threads=1, 57 | ): 58 | 59 | print('Getting difsignals...') 60 | contigs = list(native_motifs.keys()) 61 | motifs_lines = {} 62 | ks_stat_lines = {} 63 | 64 | for contig in contigs: 65 | 66 | # ks_stat_line = [] 67 | # 
motifs_line = [] 68 | procs = [] 69 | KS_manager = Manager() 70 | ks_stat_line = KS_manager.list() 71 | 72 | motif_manager = Manager() 73 | motifs_line = motif_manager.list() 74 | 75 | interval_coordinates = np.linspace(0, len(native_motifs[contig].keys()), threads+1) 76 | intervals = [(interval_coordinates[idx], interval_coordinates[idx+1]) for idx,_ in list(enumerate(interval_coordinates))[: -1]] 77 | motifs = list(native_motifs[contig].keys()) 78 | 79 | # Thread filtering 80 | threads_limit = len(motifs) 81 | if threads > threads_limit: 82 | threads = threads_limit 83 | 84 | for thread_process in tqdm(intervals): 85 | 86 | motif_subset = motifs[int(thread_process[0]): int(thread_process[1])] 87 | #print(motif_subset) 88 | proc = Process(target=get_ks_test, args=(motif_subset, ks_stat_line, motifs_line, native_motifs, wga_motifs, contig, minsamplesize, maxsamplesize)) 89 | procs.append(proc) 90 | proc.start() 91 | 92 | for proc in procs: 93 | proc.join() 94 | 95 | print() 96 | ks_stat_lines[contig] = list(ks_stat_line) 97 | motifs_lines[contig] = list(motifs_line) 98 | 99 | return motifs_lines, ks_stat_lines 100 | 101 | 102 | def get_difsignals( 103 | motifs_line, 104 | ks_stats, 105 | log10_pval_thr = LOG10_PVAL_TRH, 106 | 107 | ): 108 | 109 | 110 | passed_motifs = [] 111 | 112 | for i in range(len(ks_stats)): 113 | if -np.log10(ks_stats[i]) >= log10_pval_thr: 114 | passed_motifs.append(motifs_line[i]) 115 | return passed_motifs -------------------------------------------------------------------------------- /snapper/src/type_I_RM_system.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from itertools import product 3 | from snapper.src.seq_processing import letter_codes, gen_variants 4 | from scipy.stats import ks_2samp 5 | from tqdm import tqdm 6 | from random import sample 7 | from scipy.stats import chi2_contingency 8 | from tqdm import tqdm 9 | 10 | bases = ['A', 'G', 'T', 'C', 'N'] 11 | 12 | 13 | MAXSAMPLESIZE = 200 14 | 15 | 16 | def gen_template(motif_variant, pos_variant, lenmotif): 17 | 18 | template = ['N',]*lenmotif 19 | 20 | for i, pos in enumerate(pos_variant): 21 | template[pos] = motif_variant[i] 22 | 23 | template = ''.join(template) 24 | 25 | return template 26 | 27 | 28 | def get_delta(k_size, long_k_size): 29 | return int(long_k_size/2) - int(k_size/2) 30 | 31 | 32 | 33 | 34 | 35 | #### THE PROBLEM IS HERERERERERE!!!! 
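# filter_long_kmers: given the detected short-motif record `target` (confidence,
# letters, positions within the short k-mer) and the long k-mers observed in the
# control run, keep only those long k-mers that contain one of the non-degenerate
# expansions of the target at the corresponding offset. The short-motif positions
# are shifted by delta = long_k_size//2 - k_size//2, i.e. the offset between the
# centres of the short and long windows; an empty list is returned if the shifted
# motif would run past the end of the long k-mer.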
36 | 37 | def filter_long_kmers(target, long_kmers, lenmotif, long_k_size): 38 | 39 | 40 | 41 | delta = get_delta(lenmotif, long_k_size) 42 | 43 | 44 | 45 | if target[2][-1] + delta >= long_k_size: 46 | return [] 47 | 48 | long_kmers = np.array([list(l) for l in long_kmers]) 49 | 50 | 51 | target_seq = ''.join(target[1]) 52 | 53 | target_variants = gen_variants(target_seq) 54 | 55 | 56 | 57 | 58 | filtered_long_kmers = [] 59 | 60 | for target_variant in target_variants: 61 | 62 | current_long_kmers = long_kmers.copy() 63 | 64 | for i in range(len(target[1])): 65 | try: 66 | current_long_kmers = current_long_kmers[current_long_kmers[:, target[2][i] + delta] == target_variant[i]] 67 | except IndexError: 68 | continue 69 | 70 | filtered_long_kmers += [''.join(l) for l in current_long_kmers] 71 | 72 | return filtered_long_kmers 73 | 74 | 75 | 76 | 77 | 78 | 79 | def check_for_completeness( 80 | motif, 81 | sample_motifs, 82 | control_motifs, 83 | long_sample_motifs, 84 | long_control_motifs, 85 | lenmotif, 86 | long_k_size, 87 | reference, 88 | outputdir, 89 | log_threshold=2, 90 | long_motif_confidence=100_000, 91 | ): 92 | 93 | 94 | #return motif 95 | 96 | print(f'Checking for {"".join(motif[1])} motif completeness...') 97 | 98 | _sample = [] 99 | _control = [] 100 | 101 | 102 | ancMOTIF = gen_template(motif[1], motif[2], lenmotif) 103 | 104 | 105 | for MOTIF in gen_variants(ancMOTIF): 106 | 107 | 108 | if MOTIF not in sample_motifs or MOTIF not in control_motifs: 109 | continue 110 | 111 | 112 | _sample += sample_motifs[MOTIF] 113 | _control += control_motifs[MOTIF] 114 | 115 | # THINKKKKK!!!! 116 | 117 | if len(_sample) > MAXSAMPLESIZE: 118 | _sample = sample(_sample, MAXSAMPLESIZE) 119 | 120 | if len(_control) > MAXSAMPLESIZE: 121 | _control = sample(_control, MAXSAMPLESIZE) 122 | 123 | global_ks_results = ks_2samp(_sample, _control)[1] 124 | 125 | if -np.log10(global_ks_results) > log_threshold: 126 | print(f'Motif {"".join(motif[1])} seems complete (p-val = {global_ks_results})') 127 | print() 128 | return motif 129 | 130 | 131 | print(f'Motif {"".join(motif[1])} is probably incomplete (p-val = {global_ks_results}). 
132 | 
133 |     _sample = []
134 |     _control = []
135 | 
136 |     significant_contexts = []
137 | 
138 |     filtered_long_motifs = filter_long_kmers(motif, list(long_control_motifs.keys()), lenmotif, long_k_size)
139 | 
140 |     print('Collecting extended contexts...')
141 |     for MOTIF in tqdm(filtered_long_motifs):
142 | 
143 |         # skip contexts missing from either the sample or the control data
144 |         if MOTIF not in long_sample_motifs or MOTIF not in long_control_motifs:
145 |             continue
146 |         _sample += long_sample_motifs[MOTIF]
147 |         _control += long_control_motifs[MOTIF]
148 | 
149 |         if -np.log10(ks_2samp(long_sample_motifs[MOTIF], long_control_motifs[MOTIF])[1]) > log_threshold:
150 |             significant_contexts.append(MOTIF)
151 | 
152 |     # save the significant extended contexts for downstream inspection
153 | 
154 |     os.makedirs(outputdir)
155 | 
156 |     context_cnt = 1
157 |     with open(outputdir + '/long_contexts.fasta', 'w') as f_contexts:
158 |         for context in significant_contexts:
159 |             f_contexts.write(f'>long_context_{context_cnt}\n{context}\n')
160 |             context_cnt += 1
161 | 
162 | 
163 |     delta = get_delta(lenmotif, long_k_size)
164 |     trd1_pos = motif[2][0] + delta
165 | 
166 | 
167 |     long_motif_variants = find_possible_trd2(motif, significant_contexts, reference, trd1_pos, long_k_size)
168 | 
169 |     with open(outputdir + '/long_motif_variants.tsv', 'w') as f_variants:
170 |         for variant in long_motif_variants:
171 |             confidence, possible_long_motif = variant
172 | 
173 |             f_variants.write(f'{possible_long_motif}\t{confidence}\n')
174 | 
175 | 
176 |     print(f'{long_motif_variants[0][1]} has shown the best statistics. All data have been saved to {outputdir}.')
177 |     print()
178 | 
179 |     return motif
180 | 
181 | 
182 | 
183 | 
184 | 
185 | def create_long_motif_template(long_motif, trd1_pos, confidence):
186 |     """Trim flanking 'N's and return (confidence, letters, absolute positions) for a long motif."""
187 |     motif_template = []
188 |     pos_template = []
189 |     for i in range(len(long_motif)):
190 | 
191 |         pos_template.append(i + trd1_pos)
192 |         motif_template.append(long_motif[i])
193 | 
194 | 
195 |     while motif_template[0] == 'N':
196 |         motif_template = motif_template[1:]
197 |         pos_template = pos_template[1:]
198 | 
199 |     while motif_template[-1] == 'N':
200 |         motif_template = motif_template[:-1]
201 |         pos_template = pos_template[:-1]
202 | 
203 | 
204 |     return (
205 |         confidence, tuple(motif_template), tuple(pos_template),
206 |     )
207 | 
208 | 
209 | 
210 | 
211 | def filter_trd2(trd2_variants, min_nondegenerate_letters=2, max_N_letters=2):
212 |     """Drop TRD2 candidates that start or end with 'N', are too degenerate, or too short."""
213 |     filtered_trd2_variants = []
214 | 
215 |     for v in trd2_variants:
216 | 
217 |         Ncnt = v.count('N')
218 | 
219 |         if v[0] == 'N':
220 |             continue
221 | 
222 |         if v[-1] == 'N':
223 |             continue
224 | 
225 |         if Ncnt > max_N_letters:
226 |             continue
227 | 
228 |         if len(v) - Ncnt < min_nondegenerate_letters:
229 |             continue
230 | 
231 |         filtered_trd2_variants.append(v)
232 |     return filtered_trd2_variants
233 | 
234 | 
235 | 
236 | 
237 | def generate_RM_type_I_templates(trd1, N_lens=(5,6,7,8), trd2_lens=(2,3,4,5,6)):
238 |     """Enumerate bipartite TRD1-N(5..8)-TRD2 templates typical of type I RM systems."""
239 | 
240 |     trd2_variants = []
241 | 
242 |     for trd2_len in trd2_lens:
243 |         trd2_variants += [''.join(v) for v in list(product(bases, repeat=trd2_len))]
244 | 
245 |     trd2_variants = filter_trd2(trd2_variants)
246 | 
247 | 
248 |     N_variants = ['N'*N_len for N_len in N_lens]
249 | 
250 |     while trd1[0] == 'N':
251 |         trd1 = trd1[1:]
252 | 
253 |     while trd1[-1] == 'N':
254 |         trd1 = trd1[:-1]
255 | 
256 |     templates = list(product(gen_variants(trd1), N_variants, trd2_variants))
257 | 
258 |     templates = [''.join(t) for t in templates]
259 |     return templates
260 | 
261 | def find_possible_trd2(trd1, significant_contexts, reference, trd1_pos, lenmotif, N_lens=(5,6,7,8), trd2_lens=(2,3,4,5,6)):
262 |     """Rank candidate TRD2 extensions by chi-square enrichment of matching contexts over the reference background."""
263 | 
264 | 
265 | 
266 |     trd1 = ''.join(trd1[1])
267 | 
268 |     seq_array = np.array([list(l) for l in significant_contexts])
269 |     ref_array = list(set([reference[i:i + lenmotif] for i in range(len(reference)-lenmotif)]))
270 | 
271 |     ref_array = np.array([list(l) for l in ref_array])
272 | 
273 |     N_ref = len(ref_array)
274 |     filtered_ref_array = None
275 |     for short_motif in gen_variants(trd1):
276 |         tmp_ref_array = ref_array.copy()
277 | 
278 |         for pos in range(trd1_pos, trd1_pos + len(trd1)):
279 |             # keep only reference k-mers that carry this TRD1 variant at the expected positions
280 |             tmp_ref_array = tmp_ref_array[tmp_ref_array[:,pos] == short_motif[pos-trd1_pos]]
281 |         if filtered_ref_array is None:
282 | 
283 |             filtered_ref_array = tmp_ref_array
284 |         else:
285 |             filtered_ref_array = np.concatenate((filtered_ref_array, tmp_ref_array))
286 | 
287 |     ref_array = filtered_ref_array
288 | 
289 |     N_seqset = len(seq_array)
290 | 
291 |     templates = generate_RM_type_I_templates(trd1, N_lens=N_lens, trd2_lens=trd2_lens)
292 | 
293 |     trd2_testing_results = []
294 | 
295 |     # score each bipartite template by how strongly it is enriched among the significant contexts
296 |     print('TRD2 sequence optimization...')
297 | 
298 |     for template in tqdm(templates):
299 | 
300 |         if trd1_pos + len(template) >= lenmotif:
301 |             continue
302 | 
303 |         subseq = seq_array.copy()
304 |         ref_subseq = ref_array.copy()
305 | 
306 |         for pos in range(trd1_pos, trd1_pos + len(template)):
307 |             if template[pos - trd1_pos] == 'N':
308 |                 continue
309 | 
310 |             if len(subseq) != 0:
311 |                 subseq = subseq[subseq[:, pos+1] == template[pos - trd1_pos]]
312 | 
313 |             if len(ref_subseq) != 0:
314 |                 ref_subseq = ref_subseq[ref_subseq[:, pos+1] == template[pos - trd1_pos]]
315 | 
316 |         subseq_N = len(subseq)
317 |         ref_subseq_N = len(ref_subseq)
318 | 
319 |         try:
320 |             chi_res = chi2_contingency(
321 |                 [
322 |                     [subseq_N, N_seqset-subseq_N],
323 |                     [ref_subseq_N, N_ref-ref_subseq_N],
324 |                 ]
325 |             )[0]
326 |         except ValueError:
327 |             chi_res = 0
328 | 
329 |         trd2_testing_results.append(
330 |             (
331 |                 chi_res, template
332 |             )
333 |         )
334 | 
335 | 
336 |     trd2_testing_results.sort(reverse=True)
337 | 
338 |     print('Best results:')
339 |     for t in trd2_testing_results[:10]:
340 |         print(t[0], t[1])
341 | 
342 | 
343 |     return trd2_testing_results[:20]
344 | 
345 | 
346 | 
347 | 
348 | 
349 | 
--------------------------------------------------------------------------------
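
Two statistics drive `type_I_RM_system.py`: a two-sample KS test (is the pooled signal of a motif shifted between the native sample and the control?) and a 2×2 chi-square test (is a candidate TRD1-N(x)-TRD2 template over-represented among the significant long contexts compared with all reference k-mers?). The standalone sketch below reproduces both calls on synthetic numbers; all counts and samples are made up for illustration and are not part of the package.

```
# Illustrative sketch only: synthetic inputs, not taken from the Snapper repository.
import numpy as np
from scipy.stats import chi2_contingency, ks_2samp

# Hypothetical candidate-template counts (assumed numbers):
# 120 of 300 KS-significant contexts match the template,
# but only 800 of 40_000 reference k-mers do.
subseq_N, N_seqset = 120, 300
ref_subseq_N, N_ref = 800, 40_000

chi_res = chi2_contingency(
    [
        [subseq_N, N_seqset - subseq_N],       # matches / non-matches among significant contexts
        [ref_subseq_N, N_ref - ref_subseq_N],  # matches / non-matches in the reference background
    ]
)[0]
print(f'chi-square enrichment score: {chi_res:.1f}')

# The completeness check is a two-sample KS test on per-site signal levels
# (here replaced by synthetic normal samples): a shifted sample distribution
# gives a small p-value, i.e. a large -log10(p).
rng = np.random.default_rng(0)
native = rng.normal(0.5, 1.0, 200)  # assumed stand-in for sample signal levels
wga = rng.normal(0.0, 1.0, 200)     # assumed stand-in for control signal levels
pval = ks_2samp(native, wga)[1]
print(f'-log10 KS p-value: {-np.log10(pval):.2f}')
```

Ranking by the chi-square statistic rather than its p-value keeps strongly enriched templates distinguishable even when their p-values all underflow to zero, which is consistent with how `find_possible_trd2` sorts its candidates.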