433 |
434 |
437 |
440 |
443 |
446 |
449 |
452 |
455 |
458 |
--------------------------------------------------------------------------------
/PERF/rep_utils.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # pylint: disable=C0111, C0301
3 |
4 | from __future__ import print_function, division
5 | from itertools import product
6 | from Bio import SeqIO
7 | from tqdm import tqdm
8 | import sys, gzip, os
9 | from os import remove as del_file
10 | import multiprocessing as multi
11 |
12 | if sys.version_info.major == 2:
13 | from utils import rev_comp, rawcharCount, getGC, get_targetids
14 | from analyse import analyse_fasta
15 | from annotation import annotate
16 | elif sys.version_info.major == 3:
17 | from .utils import rev_comp, rawcharCount, getGC, get_targetids
18 | from .analyse import analyse_fasta
19 | from .annotation import annotate
20 |
def num_factors(num):
    """Return the proper divisors of ``num`` in ascending order.

    Every integer from 1 up to (but not including) ``num`` that divides
    ``num`` evenly is returned, so ``num`` itself is never in the result.
    """
    return [divisor for divisor in range(1, num) if num % divisor == 0]
26 |
def expand_repeat(string, size):
    """Expands a motif to highest motif size, used for checking duplicates

    The characters of ``string`` are emitted cyclically until ``size``
    characters have been produced; the result is truncated mid-motif when
    ``size`` is not a multiple of the motif length.
    """
    motif_length = len(string)
    letters = []
    position = 0
    while len(letters) < size:
        letters.append(string[position])
        position += 1
        # Wrap around to the start of the motif once its end is reached.
        if position >= motif_length:
            position = 0
    return ''.join(letters)
37 |
38 |
def get_cycles(string):
    """Return every cyclic rotation of ``string``, in rotation order.

    Rotation ``i`` starts at character ``i`` of the input. Duplicates are
    kept, so the result always has ``len(string)`` entries.
    """
    return [string[shift:] + string[:shift] for shift in range(len(string))]
44 |
45 |
def generate_repeats(sizes, atomic):
    """Generates all possible motifs for repeats in a given length range

    ``sizes`` is the list of motif lengths to generate; it is sorted in place.
    When ``atomic`` is true, a motif is skipped if its expansion equals the
    expansion of a shorter motif whose length divides the motif length and is
    not itself listed in ``sizes``.

    Returns a list of tab-joined records: cyclic variant, class motif,
    motif length and strand ('+' for rotations of the class motif, '-' for
    rotations of its reverse complement).
    """
    alphabet = ['A', 'C', 'G', 'T']
    sizes.sort()
    min_size, max_size = sizes[0], sizes[-1]

    # Expansions of motifs whose size is below the maximum and not requested.
    excluded_expansions = dict()
    for size in range(1, max_size):
        if size in sizes:
            continue
        expansions = set()
        if atomic:
            for letters in product(alphabet, repeat=size):
                expansions.add(expand_repeat(''.join(letters), max_size))
        excluded_expansions[size] = expansions

    generated_repeats = []
    expanded_set = set()   # expansions of every cycle emitted so far
    repeat_set = set()     # cycles already written to the output
    for motif_size in sizes:
        unlisted_factors = [f for f in num_factors(motif_size) if f not in sizes]
        for letters in product(alphabet, repeat=motif_size):
            repeat = ''.join(letters)
            expanded = expand_repeat(repeat, max_size)
            if expanded in expanded_set:
                continue
            if atomic and any(expanded in excluded_expansions[f] for f in unlisted_factors):
                continue

            for strand, source in (('+', repeat), ('-', rev_comp(repeat))):
                # A motif that is its own reverse complement has no '-' records.
                if strand == '-' and source == repeat:
                    break
                for cycle in get_cycles(source):
                    expanded_set.add(expand_repeat(cycle, max_size))
                    if cycle in repeat_set:
                        continue
                    repeat_set.add(cycle)
                    if len(cycle) >= min_size:
                        generated_repeats.append('\t'.join([cycle, repeat, str(len(cycle)), strand]))
    return generated_repeats
101 |
102 |
def build_rep_set(repeat_file, length_cutoff=None, unit_cutoff=None):
    """
    Outputs the repeats info dictionary used by the get_ssrs function.
    Takes list of repeat motifs from repeats file(output by generate_repeats function) as input.
    Creates a dictionary with expanded repeat as the key and (class, motif_length, strand) as values.
    Works either by "length_cutoff=" or by "unit_cutoff=" arguments.

    ``repeat_file`` is an iterable of tab-separated lines with the columns
    motif, class, motif length and strand. Blank lines are ignored.
    ``length_cutoff`` takes precedence when both cutoffs are given; with
    neither, an empty dictionary is returned without reading the input.
    ``unit_cutoff`` maps a motif length to the minimum number of units.

    The returned dictionary also carries a ``'cutoff'`` entry: the list of
    window lengths that get_ssrs scans with.
    """
    repeats_out = dict()
    if length_cutoff is None and unit_cutoff is None:
        return repeats_out

    use_length = length_cutoff is not None
    cutoffs = set()
    for line in repeat_file:
        line = line.strip()
        if not line:
            # A trailing or stray blank line previously crashed the parser.
            continue
        fields = line.split('\t')
        motif = fields[0]
        motif_length = int(fields[2])
        if use_length:
            # Every key has the same length: the motif cycled up to the cutoff.
            key = expand_repeat(motif, length_cutoff)
        else:
            # Key length depends on the motif size: motif times minimum units.
            key = motif * unit_cutoff[motif_length]
            cutoffs.add(len(key))
        repeats_out[key] = {
            'class': fields[1],
            'motif_length': motif_length,
            'strand': fields[3],
        }

    repeats_out['cutoff'] = [length_cutoff] if use_length else sorted(cutoffs)
    return repeats_out
143 |
144 |
145 |
def get_ssrs(seq_record, repeats_info, out):
    """Native function that identifies repeats in fasta files.

    ``seq_record`` must expose ``.seq`` (converted with ``str`` and
    upper-cased) and ``.id``. ``repeats_info`` is the dictionary built by
    build_rep_set: seed strings mapped to ``class``/``motif_length``/``strand``
    plus a ``'cutoff'`` list of window lengths. ``out`` is either a file name,
    which is opened for writing and closed here, or an already open
    file-like object, which is left open.

    One tab-separated line is written per repeat: record id, start, stop,
    class, repeat length, strand, number of whole units, and the first
    ``motif_length`` characters of the matched window.
    """
    owns_handle = isinstance(out, str)
    out_file = open(out, 'w') if owns_handle else out
    try:
        input_seq = str(seq_record.seq).upper()
        input_seq_length = len(input_seq)
        for length_cutoff in repeats_info['cutoff']:
            fallback = length_cutoff - 1
            sub_start = 0  # substring start
            # Only full-length windows are tested. The loop used to compare a
            # stale stop position, letting one window overrun the sequence end
            # by a base; with adjacent cutoffs that short window could match a
            # seed of the other cutoff and report a stop beyond the sequence.
            while sub_start + length_cutoff <= input_seq_length:
                sub_stop = sub_start + length_cutoff  # substring stop
                sub_seq = input_seq[sub_start:sub_stop]
                if sub_seq not in repeats_info:
                    sub_start += 1
                    continue

                repeat_data = repeats_info[sub_seq]
                motif_length = repeat_data['motif_length']
                # Phase of the motif at the end of the seed window, so that
                # extension continues with the correct next character.
                offset = length_cutoff % motif_length
                repeat_seq = input_seq[sub_start+offset:sub_start+offset+motif_length]
                i = 0
                # Extend the repeat one base at a time while it stays perfect.
                while sub_stop < input_seq_length and input_seq[sub_stop] == repeat_seq[i]:
                    sub_stop += 1
                    i += 1
                    if i >= motif_length:
                        i = 0

                match_length = sub_stop - sub_start
                num_units = match_length // motif_length
                print(seq_record.id, sub_start, sub_stop, repeat_data['class'], match_length,
                      repeat_data['strand'], num_units, sub_seq[:motif_length],
                      sep="\t", file=out_file)
                # Step back so a different repeat overlapping the tail of this
                # one is still found, without re-reporting this repeat.
                sub_start = sub_stop - fallback
    finally:
        # Close the handle even if the scan raises, but only if opened here.
        if owns_handle:
            out_file.close()
194 |
195 |
def fasta_ssrs(args, repeats_info):
    """Find repeats in the records of a FASTA file and write them to ``args.output``.

    Reads ``args.input`` (gzip-compressed when the name ends in 'gz'), scans
    each record whose length lies within ``args.min_seq_length`` and
    ``args.max_seq_length`` and whose id is accepted by the filter/target id
    lists, and writes the repeats found by get_ssrs to ``args.output``.
    With ``args.threads`` > 1 the records are scanned in a process pool, each
    into its own temporary file, and the files are concatenated afterwards.

    When ``args.info`` or ``args.analyse`` is set, base counts are collected
    over all records and a summary is appended to the output. The output is
    then closed, and annotation and the HTML report are run if requested.
    """
    if args.input.endswith('gz'):
        handle = gzip.open(args.input, 'rt')
    else:
        handle = open(args.input, 'r')

    collect_info = bool(args.info or args.analyse)
    seq_nucleotide_info = dict()

    def tally_bases(record):
        # Base composition is gathered for every record, selected or not.
        for base in record.seq.upper():
            seq_nucleotide_info[base] = seq_nucleotide_info.get(base, 0) + 1

    try:
        # Number of '>' characters in the input, used as the record count.
        num_records = rawcharCount(args.input, '>')
        records = SeqIO.parse(handle, 'fasta')
        target_ids = get_targetids(args.filter_seq_ids, args.target_seq_ids)

        def is_selected(record):
            return (args.min_seq_length <= len(record.seq) <= args.max_seq_length
                    and record.id in target_ids)

        if args.threads > 1:
            # Remember which temporary files were actually requested: records
            # rejected by the filters never get one, so concatenating over
            # every record index failed on the first missing file.
            submitted = []
            pool = multi.Pool(processes=args.threads)
            for index, record in enumerate(records):
                if collect_info:
                    tally_bases(record)
                if is_selected(record):
                    out_name = './temp_%s.tsv' %(index)
                    submitted.append((index, out_name))
                    pool.apply_async(get_ssrs, (record, repeats_info, out_name,))

            pool.close()
            pool.join()

            # Concat all the output files into one.
            temp_outs = tqdm(submitted, total=len(submitted))
            for index, name in temp_outs:
                temp_outs.set_description("Concatenating file: %d " %(index))
                with open(name, 'r') as fh:
                    for line in fh:
                        print(line.strip(), file=args.output)
                del_file(name)

        elif args.threads == 1:
            records = tqdm(records, total=num_records)
            for record in records:
                records.set_description("Processing %s" %(record.id))
                if collect_info:
                    tally_bases(record)
                if is_selected(record):
                    get_ssrs(record, repeats_info, args.output)
    finally:
        # The input handle used to be left open for the life of the process.
        handle.close()

    if collect_info:
        line = "#File_name: %s\n#Total_sequences: %d\n#Total_bases: %d\n#GC: %f"\
        %(os.path.basename(args.input), num_records, sum(seq_nucleotide_info.values()),\
        round(getGC(seq_nucleotide_info), 2))
        print(line, file=args.output)
    # Close (and thereby flush) the output before the later steps run.
    args.output.close()

    if args.annotate is not None:
        annotate(args)

    # Specifies to generate a HTML report
    if args.analyse:
        analyse_fasta(args)
258 |
--------------------------------------------------------------------------------
/PERF/utils.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | # pylint: disable=C0111, C0301
3 |
4 | from __future__ import print_function, division
5 | import sys, gzip
6 | from itertools import takewhile, repeat, islice
7 | from tqdm import tqdm
8 | from collections import Counter, defaultdict
9 |
10 |
# Name of the repeat class for each motif length, 1 through 50.
kmers = dict(enumerate((
    'Monomer', 'Dimer', 'Trimer', 'Tetramer', 'Pentamer',
    'Hexamer', 'Heptamer', 'Octamer', 'Nonamer', 'Decamer',
    'Undecamer', 'Dodecamer', 'Tridecamer', 'Tetradecamer', 'Pentadecamer',
    'Hexadecamer', 'Heptadecamer', 'Octadecamer', 'Nonadecamer', 'Icosamer',
    'Uncosamer', 'Docosamer', 'Tricosamer', 'Tetracosamer', 'Pentacosamer',
    'Hexacosamer', 'Heptacosamer', 'Octacosamer', 'Nonacosamer', 'Triacontamer',
    'Untriacontamer', 'Dotriacontamer', 'Tritriacontamer', 'Tetratriacontamer', 'Pentatriacontamer',
    'Hexatriacontamer', 'Heptatriacontamer', 'Octatriacontamer', 'Nonatriacontamer', 'Tetracontamer',
    'Untetracontamer', 'Dotetracontamer', 'Tritetracontamer', 'Tetratetracontamer', 'Pentatetracontamer',
    'Hexatetracontamer', 'Heptatetracontamer', 'Octatetracontamer', 'Nonatetracontamer', 'Pentacontamer',
), 1))
23 |
24 |
def get_cycles(string):
    """Return the distinct cyclic rotations of ``string`` as a sorted list."""
    rotations = {string[shift:] + string[:shift] for shift in range(len(string))}
    return sorted(rotations)
31 |
32 |
def build_cycVariations(string):
    """Return the cyclic variations of a motif on both strands.

    The list starts with the rotations of ``string`` as returned by
    get_cycles, followed by those rotations of its reverse complement that
    are not already present.
    """
    variations = get_cycles(string)
    seen = set(variations)
    for candidate in get_cycles(rev_comp(string)):
        if candidate not in seen:
            seen.add(candidate)
            variations.append(candidate)
    return variations
39 |
40 |
def getGC(basesCounter):
    """Return the GC content, in percent, of a base-count mapping.

    ``basesCounter`` maps a base character to its count. The percentage is
    computed over all counted bases except ``'N'``. Bases missing from the
    mapping count as zero, and 0.0 is returned when no non-N base is present.
    """
    totalBases = sum(basesCounter.values())
    # .get() instead of indexing: a plain dict without a 'G' or 'C' entry used
    # to raise KeyError a second time from inside the KeyError handler.
    gc_bases = basesCounter.get('G', 0) + basesCounter.get('C', 0)
    called_bases = totalBases - basesCounter.get('N', 0)
    if called_bases <= 0:
        # Empty input or N-only sequence: nothing to compute a ratio over.
        return 0.0
    return (float(gc_bases) / called_bases) * 100
48 |
49 |
def rev_comp(string):
    """Outputs reverse complement of a nucleotide sequence"""
    # The translation-table factory lives in a different place on Python 2.
    if sys.version_info.major == 2:
        from string import maketrans as make_table
    else:
        make_table = str.maketrans
    complement_table = make_table('ACGT', 'TGCA')
    # Characters other than upper-case A/C/G/T pass through unchanged.
    return string.translate(complement_table)[::-1]
58 |
59 |
def rawcharCount(filename, char):
    """Count the occurrences of ``char`` in a file.

    The file is read in binary mode in 1 MiB chunks; names ending in 'gz'
    are read through gzip. ``char`` is encoded as ASCII before counting.
    """
    needle = char.encode('ASCII')
    opener = gzip.open if filename.endswith('gz') else open
    # The context manager closes the handle, which was previously leaked.
    with opener(filename, 'rb') as handle:
        # iter() with a sentinel stops at the first empty read, i.e. at EOF.
        chunks = iter(lambda: handle.read(1024*1024), b'')
        return sum(chunk.count(needle) for chunk in chunks)
67 |
68 |
def get_targetids(filter_seq_ids, target_seq_ids):
    """
    The function returns the set of desired sequence ids
    across which repeats will be identified.

    ``filter_seq_ids`` and ``target_seq_ids`` are paths to id list files (or
    falsy when not given). With a filter file the result is a univset that
    excludes the listed ids; otherwise, with a target file, it is a plain
    set of the listed ids; with neither it is an unrestricted univset.
    """
    def read_ids(path):
        # One id per line: drop surrounding whitespace and leading '>'
        # characters, then keep the text before the first space.
        with open(path) as fh:
            return [line.strip().lstrip('>').split(' ')[0] for line in fh]

    if filter_seq_ids:
        return univset() - set(read_ids(filter_seq_ids))
    if target_seq_ids:
        return set(read_ids(target_seq_ids))
    return univset()
97 |
98 |
class univset(object):
    """A "universal" set: it contains every object except those in ``_diff``.

    A fresh instance contains everything. Subtracting a collection yields a
    univset that additionally excludes its members, and membership tests are
    answered by checking the exclusion set. Operations that combine a univset
    with a finite collection return a univset when the result is still
    co-finite and a plain set when the result is finite.
    """

    def __init__(self):
        # Elements that are NOT members of this set.
        self._diff = set()

    def __sub__(self, other):
        if isinstance(other, univset):
            # (U - a) - (U - b) is the finite set b - a, not a co-finite set.
            return other._diff - self._diff
        S = univset()
        S._diff = self._diff | set(other)
        return S

    def __rsub__(self, other):
        # other - (U - a): only the excluded elements are removed from self,
        # so what survives of `other` is its overlap with the exclusions.
        return other & self._diff

    def __contains__(self, obj):
        return obj not in self._diff

    def __and__(self, other):
        return other - self._diff

    def __rand__(self, other):
        return other - self._diff

    def __repr__(self):
        if self._diff == set():
            return "ANY"
        else:
            return "ANY - %s"%self._diff

    def __or__(self, other):
        S = univset()
        if isinstance(other, univset):
            # Only elements excluded from both operands stay excluded.
            S._diff = self._diff & other._diff
        else:
            S._diff = self._diff - set(other)
        return S

    def __ror__(self, other):
        return self.__or__(other)

    def __xor__(self, other):
        return (self - other) | (other - self)

    def add(self, elem):
        """Make ``elem`` a member again if it was excluded."""
        self._diff.discard(elem)

    def update(self, elem):
        """Make every member of the collection ``elem`` a member of this set."""
        # The original body referenced an undefined name and always raised.
        self._diff = self.__or__(elem)._diff

    def union(self, other):
        return self.__or__(other)

    def difference(self, other):
        return self.__sub__(other)

    def intersection(self, other):
        return self.__and__(other)

    def symmetric_difference(self, other):
        return self.__xor__(other)

    def issubset(self, other):
        """Return True if every member of this set is also in ``other``."""
        if isinstance(other, univset):
            # U - a is inside U - b exactly when b excludes nothing extra.
            return other._diff <= self._diff
        # A co-finite set can never fit inside a finite collection.
        return False

    def issuperset(self, other):
        """Return True if every member of ``other`` is also in this set."""
        if isinstance(other, univset):
            return self._diff <= other._diff
        return self._diff.isdisjoint(other)

    def __eq__(self, other):
        if type(other) == set:
            return False
        try:
            return self._diff == other._diff
        except AttributeError:
            return False

    def __ne__(self, other):
        return not self.__eq__(other)

    def __le__(self, other):
        return self.issubset(other)

    def __lt__(self, other):
        return self.issubset(other) and self != other

    def __ge__(self, other):
        return self.issuperset(other)

    def __gt__(self, other):
        return self.issuperset(other) and self != other
182 |
183 |
class dotDict(dict):
    """
    Dictionary whose items can also be read, written and deleted as attributes.

    Example:
    m = dotDict({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])

    Reading an attribute that is not a key returns None instead of raising.
    """
    def __init__(self, *args, **kwargs):
        super(dotDict, self).__init__(*args, **kwargs)
        # Re-assign every item through __setitem__ so that it is mirrored
        # into the instance __dict__; only dict arguments are mirrored.
        sources = [arg for arg in args if isinstance(arg, dict)]
        sources.append(kwargs)
        for source in sources:
            for key, value in source.items():
                self[key] = value

    def __getattr__(self, attr):
        # Only reached when normal attribute lookup fails.
        return self.get(attr)

    def __setattr__(self, key, value):
        self[key] = value

    def __setitem__(self, key, value):
        super(dotDict, self).__setitem__(key, value)
        self.__dict__[key] = value

    def __delattr__(self, item):
        del self[item]

    def __delitem__(self, key):
        super(dotDict, self).__delitem__(key)
        self.__dict__.pop(key)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PERF
2 | []()
3 | []()
4 | []()
5 | ## Introduction
6 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness, accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems.
7 |
8 | PERF is a recursive acronym that stands for "PERF is an Exhaustive Repeat Finder". It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are:
9 | - Fast run time. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. The speed can be further improved ~3 to 4 fold using [PyPy](https://pypy.org/) (human genome finishes in less than 2 minutes using PyPy v5.8.0)
10 | - Linear time and space complexity (O(n))
11 | - Identifies perfect SSRs
12 | - 100% accurate and comprehensive - Does not miss any repeats and does not pick any incorrect ones
13 | - Easy to use - The only required argument is the input DNA sequence in FASTA format
14 | - Flexible - Most of the parameters are customizable by the user at runtime
15 | - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units
16 | - TSV output and HTML report. The default output is an easily parseable and exportable tab-separated format. Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables
17 |
18 | ## Change log
19 |
20 | ## [0.4.6] - 2021-04-22
21 | ### Fixes
22 | - Fixed usage of unit options file input for fastq input.
23 | - Fixed usage of repeats input file.
24 |
25 | ## [0.4.5] - 2020-05-07
26 | ### Added
27 | - Annotation of repeats w.r.t. genomic context using a GFF or GTF file (option -g).
28 | - Multi-threading. Parallel identification of repeats in different sequences.
29 | - Identification of perfect repeats in fastq files.
30 | - Analysis report for repeats in fastq files.
31 | - Option to identify atomic repeats.
32 |
33 | ### Changed
34 | - Analysis report rebuilt with Semantic ui and Apex Charts.
35 | - Visualisation of repeat annotation data in analysis report.
36 |
37 | ### Fixes
38 | - Python2 compatibility fixed.
39 | - Bug fixes for PyPI compatibility.
40 | - Import error issues.
41 |
42 | ## Installation
43 | PERF can be directly installed using pip with the package name `perf_ssr`.
44 | ```bash
45 | $ pip install perf_ssr
46 | ```
47 |
48 | This name was chosen for the package so as not to clash with the existing `perf` package.
49 |
50 | Alternatively, it can be installed from the source code:
51 | ```bash
52 | # Download the git repo
53 | $ git clone https://github.com/RKMlab/perf.git
54 |
55 | # Install
56 | $ cd perf
57 | $ python setup.py install
58 | ```
59 | Both of the methods add a console command `PERF`, which can be executed from any directory. It can also be used without installation by running the `core.py` file in the `PERF` subfolder:
60 |
61 | ```bash
62 | $ git clone https://github.com/RKMlab/perf.git
63 | $ cd perf/PERF
64 | $ python core.py -h # Print the help message of PERF (see below)
65 | ```
66 |
67 | ## Usage
68 | The help message and available options can be accessed using
69 | ```bash
70 | $ PERF -h # Short option
71 | $ PERF --help # Long option
72 | ```
73 | which gives the following output
74 | ```
75 | usage: core.py [-h] -i [-o ] [--format ] [--version]
76 | [-rep ] [-m ] [-M ] [-s ] [-S ]
77 | [--include-atomic] [-l | -u INT or FILE] [-a] [--info]
78 | [-g ] [--anno-format ] [--gene-key ]
79 | [--up-promoter ] [--down-promoter ]
80 | [-f | -F ] [-t ]
81 |
82 | Required arguments:
83 | -i , --input
84 | Input sequence file.
85 |
86 | Optional arguments:
87 | -o , --output
88 | Output file name. Default: Input file name + _perf.tsv
89 | --format Input file format. Default: fasta, Permissible: fasta,
90 | fastq
91 | --version show program's version number and exit
92 | -rep , --repeats
93 | File with list of repeats (Not allowed with -m and/or
94 | -M)
95 | -m , --min-motif-size
96 | Minimum size of a repeat motif in bp (Not allowed with
97 | -rep)
98 | -M , --max-motif-size
99 | Maximum size of a repeat motif in bp (Not allowed with
100 | -rep)
101 | -s , --min-seq-length
102 | Minimum size of sequence length for consideration (in
103 | bp)
104 | -S , --max-seq-length
105 | Maximum size of sequence length for consideration (in
106 | bp)
107 | --include-atomic An option to include factor atomic repeats for minimum
108 | motif sizes greater than 1.
109 | -l , --min-length
110 | Minimum length cutoff of repeat
111 | -u INT or FILE, --min-units INT or FILE
112 | Minimum number of repeating units to be considered.
113 | Can be an integer or a file specifying cutoffs for
114 | different motif sizes.
115 | -a, --analyse Generate a summary HTML report.
116 | --info Sequence file info recorded in the output.
117 | -f , --filter-seq-ids
118 | List of sequence ids in fasta file which will be
119 | ignored.
120 | -F , --target-seq-ids
121 | List of sequence ids in fasta file which will be used.
122 | -t , --threads
123 | Number of threads to run the process on. Default is 1.
124 |
125 | Annotation arguments:
126 | -g , --annotate
127 | Genic annotation input file for annotation, Both GFF
128 | and GTF can be processed. Use --anno-format to specify
129 | format.
130 | --anno-format Format of genic annotation file. Valid inputs: GFF,
131 | GTF. Default: GFF
132 | --gene-key Attribute key for geneId. The default identifier is
133 | "gene". Please check the annotation file and pick a
134 | robust gene identifier from the attribute column.
135 | --up-promoter Upstream distance(bp) from TSS to be considered as
136 | promoter region. Default 1000
137 | --down-promoter
138 | Downstream distance(bp) from TSS to be considered as
139 | promoter region. Default 1000
140 | ```
141 | The details of each option are given below:
142 |
143 | ### `-i or --input`
144 | **Expects:** *FILE*
145 | **Default:** *None*
146 | This is the only required argument for the program. The input file must be a valid FASTA/FASTQ file. PERF uses [Biopython's](http://biopython.org/wiki/SeqIO) FASTA parser to read the input fasta files. It accepts both single-line and multi-line sequences. Files with multiple sequences are also valid. To see more details about the FASTA format, see [this page](http://bioperl.org/formats/sequence_formats/FASTA_sequence_format).
147 |
148 | ### `-o or --output`
149 | **Expects:** *STRING (to be used as filename)*
150 | **Default:** *Input Filename + _perf.tsv (see below)*
151 | If this option is not provided, the default output filename will be the same as the input filename, with its extension replaced with '_perf.tsv'. For example, if the input filename is `my_seq.fa`, the default output filename will be `my_seq_perf.tsv`. If the input filename does not have any extension, `_perf.tsv` will be appended to the filename. Please note that even in the case of no identified SSRs, the output file is still created (therefore overwriting any previous file of the same name) but with no content in the file.
152 | #### Output for fasta
153 | The output is a tab-delimited file, with one SSR record per line.
154 | The output columns follow the [BED](https://genome.ucsc.edu/FAQ/FAQformat.html) format. The details of the columns are given below:
155 |
156 | | S.No | Column | Description |
157 | |:----:| ------ | ----------- |
158 | | 1 | Chromosome | Chromosome or Sequence Name as specified by the first word in the FASTA header |
159 | | 2 | Repeat Start | 0-based start position of SSR in the Chromosome |
160 | | 3 | Repeat Stop | End position of SSR in the Chromosome |
161 | | 4 | Repeat Class | Class of repeat as grouped by their cyclical variations |
162 | | 5 | Repeat Length | Total length of identified repeat in nt |
163 | | 6 | Repeat Strand | Strand of SSR based on their cyclical variation |
164 | | 7 | Motif Number | Number of times the base motif is repeated |
165 | | 8 | Actual Repeat | Starting sequence of the SSR irrespective of Repeat class and strand|
166 |
167 | An example output showing some of the largest repeats from *Drosophila melanogaster* is given below
168 | ```
169 | X 22012826 22014795 ACTGGG 1969 - 328 TCCCAG
170 | 2RHet 591337 591966 AATACT 629 - 104 ATTAGT
171 | 4 1042143 1042690 AAATAT 547 + 91 AAATAT
172 | 2RHet 598244 598789 AATACT 545 - 90 AGTATT
173 | XHet 122 663 AGAT 541 + 135 GATA
174 | X 22422335 22422827 AGAT 492 + 123 GATA
175 | 3R 975265 975710 AAAT 445 - 111 TTAT
176 | X 15442288 15442724 ACAGAT 436 + 72 ACAGAT
177 | 2L 22086818 22087152 AATACT 334 - 55 TATTAG
178 | YHet 137144 137466 AAGAC 322 - 64 CTTGT
179 | ```
180 |
181 | #### Output for fastq
182 | The output is a tab-delimited file, with data on each repeat class per line.
183 | | S.No | Column | Description |
184 | |:----:| ------ | ----------- |
185 | | 1 | Repeat Class | Class of repeat as grouped by their cyclical variations |
186 | | 2 | Number of reads | Number of reads having an instance of the repeat |
187 | | 3 | Frequency | Total number of instances of the repeat |
188 | | 4 | Bases | Total number of bases covered by the repeat |
189 | | 5 | Repeat reads per million reads | Number of reads having an instance of the repeat, per million reads |
190 | | 6 | Instances per million reads | Number of instances of the repeat, per million reads |
191 | | 7 | Repeat Bases per MB of sequence | Number of bases covered by the repeat, per MB of sequence |
192 | | 8 | Length distribution | Distribution of the lengths of the instances of the repeat |
193 | | 9 | Motif distribution | Distribution of the actual motifs observed for the repeat class |
194 |
195 |
196 | ### `--format`
197 | **Expects:** *STRING (specifying format of the file)*
198 | **Default:** *fasta*
199 | PERF was originally developed to identify repeats in FASTA files. Since version 0.4.5 (see the change log above), PERF can identify repeats in FASTQ sequence files as well. The default format the program expects is fasta. Specify the input format as 'fasta' for FASTA files and 'fastq' for FASTQ files.
200 |
201 | ### `-a or --analyse`
202 | **Expects:** *None*
203 | **Default:** *False*
204 | In addition to the default tab-separated output, PERF can also generate a fully interactive HTML report for easy downstream analysis of the repeat data. The filename will be the same prefix as that of the main output. For example, if the input filename was `my_seq.fa`, the analysis report will be `my_seq_perf.html`. An example HTML report, generated from the repeat data of *Homo sapiens* (build hg19), can be accessed [here](https://raw.githubusercontent.com/RKMlab/perf/html-report/test_data/Human_hg19_perf.html) (Right click -> Save As).
205 |
206 | ### `-l or --min-length`
207 | **Expects:** *INTEGER*
208 | **Default:** *12*
209 | Minimum length cut-off to be considered when finding an SSR. The same cut-off will apply for SSRs of all motif lengths, even if the motif length is not a divisor of this value. In such cases, SSRs that end with a partial motif are also picked if they pass the length cut-off.
210 |
211 | ### `-u or --min-units`
212 | **Expects:** *INTEGER* OR *FILE*
213 | **Default:** *None*
214 | This option finds SSRs with a minimum number of repeating motifs. The argument accepts either an integer or a file. If an integer is specified, the same value is used for all motif lengths. Alternatively, a specific value for each motif length can be given using a two-column tab-separated file, as demonstrated below:
215 |
216 | ```bash
217 | $ cat repeat_units.txt
218 | 1 10
219 | 2 6
220 | 3 4
221 | 4 3
222 | 5 2
223 | 6 2
224 | ```
225 |
226 | The first column specifies the motif length, and the second column specifies the minimum number of times the motif should be repeated to be considered an SSR. This file can be used to identify repeats with different number of repeating motifs: monomers repeated at least 10 times, dimers repeated at least 6 times etc., using the following command
227 | ``` bash
228 | $ PERF -i my_seq.fa -m 1 -M 6 -u repeat_units.txt
229 | ```
230 |
231 | ### `-rep or --repeats`
232 | **Expects:** *FILE*
233 | **Default:** *None*
234 | PERF provides an option to limit the search to specific repeat motifs. The repeats of interest should be specified via a file containing 4 tab-separated columns, as shown below:
235 |
236 | ```bash
237 | $ cat my_repeats.txt
238 | A A 1 +
239 | T A 1 -
240 | AG AG 2 +
241 | CT AG 2 -
242 | GA AG 2 +
243 | TC AG 2 -
244 | $ PERF -i my_seq.fa -rep my_repeats.txt # Find all A and AG repeats from my_seq.fa
245 | ```
246 |
247 | **Note:** This option is not allowed when `-m` or `-M` options are used.
248 | ### `-m or --min-motif-size`
249 | **Expects:** *INTEGER*
250 | **Default:** *1*
251 | Minimum length of motifs to be considered. By default, PERF ignores redundant motifs. For example, a stretch of 12 A's is considered a monomer repeat of 12 A's rather than a dimer repeat of 6 AA's. However, this is only true if `-m` is set to 1. If for example, `-m` is set to 2, then stretches of 12 A's are reported as dimer AA repeats. If this behavior isn't desired, we suggest using the `-rep` option (see above) to specify the motifs that should/shouldn't be included.
252 |
253 | ### `-M or --max-motif-size`
254 | **Expects:** *INTEGER*
255 | **Default:** *6*
256 | Maximum length of motifs to be considered. Setting a large value of `-M` has a non-trivial effect on both the runtime and memory usage of PERF. This is noticeable with `-M` values above 10.
257 |
258 | ### `--include-atomic`
259 | **Expects:** *None*
260 | **Default:** *False*
261 | Searches for atomic repeats when set to *True*. For example, when minimum motif size is set to 2bp, PERF ignores monomer repeats. When include atomic repeats is set to *True*, PERF identifies AA, CC, GG and TT as dimer repeats.
262 |
263 | ### `-s or --min-seq-length`
264 | **Expects:** *INTEGER*
265 | **Default:** *0*
266 | Minimum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are smaller than this length will be ignored.
267 |
268 | ### `-S or --max-seq-length`
269 | **Expects:** *INTEGER*
270 | **Default:** *Infinity*
271 | Maximum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are larger than this length will be ignored.
272 |
273 | ### `-f or --filter-seq-ids`
274 | **Expects:** *FILE*
275 | **Default:** *None*
276 | This option accepts a file with a list of sequence IDs in the input file that should be ignored. Useful for ignoring contigs, scaffold, or other poor quality sequences. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol.
277 |
278 | ### `-F or --target-seq-ids`
279 | **Expects:** *FILE*
280 | **Default:** *None*
281 | This option accepts a file with a list of sequence IDs in the input file that should be analyzed. All other sequences will be ignored. Useful for analyzing specific chromosomes from a large input file. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol.
282 |
283 | ### `--info`
284 | **Expects:** *None*
285 | **Default:** *False*
286 | This option when set to *True*, includes information about the input sequence files and repeat summary data in the output file.
287 |
288 | ```bash
289 | $ tail -5 test_input_perf.tsv
290 | gi|514486271|gb|KE346361.1| 2667759 2667775 ATC 16 + 5 CAT
291 | #File_name: test_input.fa
292 | #Total_sequences: 2
293 | #Total_bases: 6462134
294 | #GC: 53.970000
295 | ```
296 |
297 | ### `-g or --annotate`
298 | **Expects:** *FILE*
299 | **Default:** *None*
300 | Input a genomic feature file to annotate the repeats in the genomic context. PERF accepts both GFF and GTF format genomic feature files. Each repeat is annotated w.r.t. the closest gene and classified as Genic, Exonic, Intronic or Intergenic according to the position of the repeat. In addition, each repeat is checked for whether it falls in the promoter region of the gene. Annotation adds 7 columns to the default PERF output, which already consists of 8 columns.
301 |
302 | | S.No | Column | Description |
303 | |:----:| ------ | ----------- |
304 | | 9 | Gene name | Name of the closest gene |
305 | | 10 | Gene Start | Start position of gene in the Chromosome |
306 | | 11 | Gene Stop | End position of gene in the Chromosome |
307 | | 12 | Strand | The strand orientation of the gene |
308 | | 13 | Genomic annotation | Annotation of the repeat w.r.t. the gene. Possible annotations are {Genic, Exonic, Intronic, Intergenic} |
309 | | 14 | Promoter annotation | If repeat falls in the promoter region of the closest gene. The default promoter region is 1Kb upstream and downstream of TSS. |
310 | | 15 | Distance from TSS | Distance of the repeat from the TSS of the gene. |
311 |
312 | ### `--anno-format`
313 | **Expects:** *STRING*
314 | **Default:** *GFF*
315 | Option to specify the format of the input genomic feature file. Accepted inputs are GFF or GTF. More details about the GFF and GTF formats can be found [here](https://asia.ensembl.org/info/website/upload/gff.html).
316 |
317 | ### `--gene-key`
318 | **Expects:** *STRING*
319 | **Default:** *gene*
320 | The attribute key used for the name of the gene in the GFF/GTF file. In the example GFF file below, we have the location of a gene and its mRNA and exon locations. The last column of the file specifies attributes associated with each feature, like ID, Parent, gene etc. PERF uses one of these attributes to identify the gene and also its exons. In the example below, the key "gene" can be used to identify the gene and the exons of the gene as they have the same gene name. Please check your GFF/GTF file for a robust attribute key which can identify all genes and their corresponding exons. We are actively working on better annotation where we can identify genes and their exons based on the ID and Parent.
321 |
322 | ```
323 | # Sample GFF
324 | NC_004354.4 RefSeq gene 124370 126714 . - . ID=gene1;Name=CG17636;gbkey=Gene;gene=CG17636;gene_biotype=protein_coding;gene_synonym=DmelCG17636,EG:23E12.1;
325 | NC_004354.4 RefSeq mRNA 124370 126714 . - . ID=rna1;Parent=gene1;Name=NM_001103384.3;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
326 | NC_004354.4 RefSeq exon 126626 126714 . - . ID=id13;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
327 | NC_004354.4 RefSeq exon 125495 126259 . - . ID=id14;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
328 | ```
329 |
330 | ### `--up-promoter`
331 | **Expects:** *INT*
332 | **Default:** *1000*
333 | Upstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000.
334 |
335 | ### `--down-promoter`
336 | **Expects:** *INT*
337 | **Default:** *1000*
338 | Downstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000.
339 |
340 | ### `--version`
341 | Prints the version info of PERF.
342 |
343 | ## Examples
344 |
345 | The following examples assume that the file with input sequence in FASTA format is named `my_seq.fa`.
346 |
347 | #### Basic Usage
348 | ``` bash
349 | # Find all monomer to hexamer repeats of >=12nt length
350 | $ PERF -i my_seq.fa
351 | # Specify output filename
352 | $ PERF -i my_seq.fa -o PERF_output.tsv
353 | # Specify fastq format
354 | $ PERF -i my_seq.fastq --format fastq
355 | ```
356 |
357 | #### Generate Analysis Report
358 | ``` bash
359 | # Find all monomer to hexamer repeats of >=12nt length and generate an HTML report
360 | $ PERF -i my_seq.fa -a
361 | # Specify output filename
362 | $ PERF -i my_seq.fa -o PERF_out.tsv -a # HTML file is called PERF_out.html
363 | ```
364 |
365 | #### Annotate Repeats
366 | ``` bash
367 | # Find all monomer to hexamer repeats of >=12nt length and annotate them
368 | $ PERF -i my_seq.fa -g my_seq.gff
369 | # Specify feature file format and set downstream promoter region to 500bp
370 | $ PERF -i my_seq.fa -g my_seq.gtf --anno-format gtf --down-promoter 500
371 | ```
372 |
373 | #### Set Cut-off Criteria
374 | ```bash
375 | # Find all monomer to hexamer repeats of >=15nt length
376 | $ PERF -i my_seq.fa -l 15
377 | # Find SSRs with at least 6 repeating motifs (for all motif lengths)
378 | $ PERF -i my_seq.fa -u 6
379 | ```
380 |
381 | #### Identify Specific Repeats
382 | ``` bash
383 | $ cat my_repeats.txt
384 | AG AG 2 +
385 | CT AG 2 -
386 | GA AG 2 +
387 | TC AG 2 -
388 | # Find all AG repeats and generate an HTML report
389 | $ PERF -i my_seq.fa -rep my_repeats.txt -a
390 | ```
391 |
392 | #### Change Motif Length Cut-offs
393 | ```bash
394 | # Ignore monomer and dimer repeats, and repeats with <4 repeating units
395 | $ PERF -i my_seq.fa -m 3 -u 4
396 | # Report only tetramer repeats of >=16nt length, and generate HTML report
397 | $ PERF -i my_seq.fa -m 4 -M 4 -l 16 -a
398 |
399 | ```
400 |
401 | In all the above examples, the output of PERF is saved to `my_seq_perf.tsv` and the HTML report is saved to `my_seq_perf.html` unless `-o` is specified.
402 |
403 |
404 | ## Citation
405 |
406 | If you find PERF useful for your research, please cite it as follows:
407 |
408 | PERF: an exhaustive algorithm for ultra-fast and efficient identification of microsatellites from large DNA sequences
409 | *Akshay Kumar Avvaru, Divya Tej Sowpati, Rakesh Kumar Mishra*
410 | Bioinformatics, btx721
411 | [doi](https://doi.org/10.1093/bioinformatics/btx721): 10.1093/bioinformatics/btx721
412 |
413 | ## Contact
414 | For queries or suggestions, please contact:
415 |
416 | Divya Tej Sowpati -
417 | Akshay Kumar Avvaru -
418 |
419 |
--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
1 | PERF
2 | ====
3 |
4 | Introduction
5 | ------------
6 |
7 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness,
8 | accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems.
9 |
10 | PERF is a recursive acronym that stands for “PERF is an Exhaustive Repeat Finder”. It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are:
11 |
12 | - Fast run time, despite being a single-threaded application. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. The speed can be further improved ~3 to 4 fold using PyPy (human genome finishes in less than 2 minutes using PyPy v5.8.0)
13 | - Linear time and space complexity (O(n))
14 | - Identifies perfect SSRs
15 | - 100% accurate and comprehensive
16 | - Does not miss any repeats and does not pick any incorrect ones
17 | - Easy to use
18 | - The only required argument is the input DNA sequence in FASTA format
19 | - Flexible
20 | - Most of the parameters are customizable by the user at runtime
21 | - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units
22 | - TSV output and HTML report
23 |
24 | The default output is an easily parseable and exportable tab-separated format. Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables.
25 |
26 | Installation
27 | ------------
28 |
29 | PERF can be directly installed using pip with the package name
30 | ``perf_ssr``.
31 |
32 | .. code:: bash
33 |
34 | $ pip install perf_ssr
35 |
36 | This name was chosen for the package so as not to clash with the existing ``perf`` package.
37 |
38 | Alternatively, it can also be installed from the source code:
39 |
40 | .. code:: bash
41 |
42 | # Download the git repo
43 | $ git clone https://github.com/RKMlab/perf.git
44 |
45 | # Install
46 | $ cd perf
47 | $ python setup.py install
48 |
49 | Both of the methods add a console command ``PERF``, which can be executed from any directory. It can also be used without installation by running the ``core.py`` file in the ``PERF`` subfolder:
50 |
51 | .. code:: bash
52 |
53 | $ git clone https://github.com/RKMlab/perf.git
54 | $ cd perf/PERF
55 | $ python core.py -h # Print the help message of PERF (see below)
56 |
57 | Usage
58 | -----
59 |
60 | The help message and available options can be accessed using
61 |
62 | .. code:: bash
63 |
64 | $ PERF -h # Short option
65 | $ PERF --help # Long option
66 |
67 | which gives the following output
68 |
69 | ::
70 |
71 | usage: PERF [-h] -i [-o ] [-a] [-l | -u INT or FILE]
72 | [-rep ] [-m ] [-M ] [--version]
73 |
74 | Required arguments:
75 | -i , --input
76 | Input file in FASTA format
77 |
78 | Optional arguments:
79 | -o , --output
80 | Output file name. Default: Input file name + _perf.tsv
81 | -a, --analyse Generate a summary HTML report.
82 | -l , --min-length
83 | Minimum length cutoff of repeat
84 | -u INT or FILE, --min-units INT or FILE
85 | Minimum number of repeating units to be considered.
86 | Can be an integer or a file specifying cutoffs for
87 | different motif sizes.
88 | -rep , --repeats
89 | File with list of repeats (Not allowed with -m and/or
90 | -M)
91 | -m , --min-motif-size
92 | Minimum size of a repeat motif in bp (Not allowed with
93 | -rep)
94 | -M , --max-motif-size
95 | Maximum size of a repeat motif in bp (Not allowed with
96 | -rep)
97 | --version show program's version number and exit
--------------------------------------------------------------------------------
/pylint.rc:
--------------------------------------------------------------------------------
1 | [MASTER]
2 |
3 | # Specify a configuration file.
4 | #rcfile=
5 |
6 | # Python code to execute, usually for sys.path manipulation such as
7 | # pygtk.require().
8 | #init-hook=
9 |
10 | # Add files or directories to the blacklist. They should be base names, not
11 | # paths.
12 | ignore=CVS
13 |
14 | # Add files or directories matching the regex patterns to the blacklist. The
15 | # regex matches against base names, not paths.
16 | ignore-patterns=
17 |
18 | # Pickle collected data for later comparisons.
19 | persistent=yes
20 |
21 | # List of plugins (as comma separated values of python modules names) to load,
22 | # usually to register additional checkers.
23 | load-plugins=
24 |
25 | # Use multiple processes to speed up Pylint.
26 | jobs=1
27 |
28 | # Allow loading of arbitrary C extensions. Extensions are imported into the
29 | # active Python interpreter and may run arbitrary code.
30 | unsafe-load-any-extension=no
31 |
32 | # A comma-separated list of package or module names from where C extensions may
33 | # be loaded. Extensions are loading into the active Python interpreter and may
34 | # run arbitrary code
35 | extension-pkg-whitelist=
36 |
37 | # Allow optimization of some AST trees. This will activate a peephole AST
38 | # optimizer, which will apply various small optimizations. For instance, it can
39 | # be used to obtain the result of joining multiple strings with the addition
40 | # operator. Joining a lot of strings can lead to a maximum recursion error in
41 | # Pylint and this flag can prevent that. It has one side effect, the resulting
42 | # AST will be different than the one from reality. This option is deprecated
43 | # and it will be removed in Pylint 2.0.
44 | optimize-ast=no
45 |
46 |
47 | [MESSAGES CONTROL]
48 |
49 | # Only show warnings with the listed confidence levels. Leave empty to show
50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
51 | confidence=
52 |
53 | # Enable the message, report, category or checker with the given id(s). You can
54 | # either give multiple identifier separated by comma (,) or put this option
55 | # multiple time (only on the command line, not in the configuration file where
56 | # it should appear only once). See also the "--disable" option for examples.
57 | #enable=
58 |
59 | # Disable the message, report, category or checker with the given id(s). You
60 | # can either give multiple identifiers separated by comma (,) or put this
61 | # option multiple times (only on the command line, not in the configuration
62 | # file where it should appear only once).You can also use "--disable=all" to
63 | # disable everything first and then reenable specific checks. For example, if
64 | # you want to run only the similarities checker, you can use "--disable=all
65 | # --enable=similarities". If you want to run only the classes checker, but have
66 | # no Warning level messages displayed, use"--disable=all --enable=classes
67 | # --disable=W"
68 | disable=zip-builtin-not-iterating,oct-method,indexing-exception,file-builtin,parameter-unpacking,coerce-builtin,map-builtin-not-iterating,range-builtin-not-iterating,useless-suppression,raising-string,old-ne-operator,reload-builtin,long-suffix,filter-builtin-not-iterating,using-cmp-argument,old-octal-literal,print-statement,xrange-builtin,buffer-builtin,raw_input-builtin,coerce-method,reduce-builtin,cmp-builtin,getslice-method,unpacking-in-except,import-star-module-level,delslice-method,suppressed-message,cmp-method,unichr-builtin,basestring-builtin,old-division,unicode-builtin,nonzero-method,metaclass-assignment,old-raise-syntax,input-builtin,long-builtin,dict-view-method,apply-builtin,round-builtin,setslice-method,next-method-called,intern-builtin,hex-method,dict-iter-method,execfile-builtin,backtick,no-absolute-import,standarderror-builtin,locally-disabled
69 |
70 |
71 | [REPORTS]
72 |
73 | # Set the output format. Available formats are text, parseable, colorized, msvs
74 | # (visual studio) and html. You can also give a reporter class, eg
75 | # mypackage.mymodule.MyReporterClass.
76 | output-format=text
77 |
78 | # Put messages in a separate file for each module / package specified on the
79 | # command line instead of printing them on stdout. Reports (if any) will be
80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated
81 | # and it will be removed in Pylint 2.0.
82 | files-output=no
83 |
84 | # Tells whether to display a full report or only the messages
85 | reports=yes
86 |
87 | # Python expression which should return a note less than 10 (10 is the highest
88 | # note). You have access to the variables errors warning, statement which
89 | # respectively contain the number of errors / warnings messages and the total
90 | # number of statements analyzed. This is used by the global evaluation report
91 | # (RP0004).
92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
93 |
94 | # Template used to display messages. This is a python new-style format string
95 | # used to format the message information. See doc for all details
96 | #msg-template=
97 |
98 |
99 | [SPELLING]
100 |
101 | # Spelling dictionary name. Available dictionaries: none. To make it working
102 | # install python-enchant package.
103 | spelling-dict=
104 |
105 | # List of comma separated words that should not be checked.
106 | spelling-ignore-words=
107 |
108 | # A path to a file that contains private dictionary; one word per line.
109 | spelling-private-dict-file=
110 |
111 | # Tells whether to store unknown words to indicated private dictionary in
112 | # --spelling-private-dict-file option instead of raising a message.
113 | spelling-store-unknown-words=no
114 |
115 |
116 | [TYPECHECK]
117 |
118 | # Tells whether missing members accessed in mixin class should be ignored. A
119 | # mixin class is detected if its name ends with "mixin" (case insensitive).
120 | ignore-mixin-members=yes
121 |
122 | # List of module names for which member attributes should not be checked
123 | # (useful for modules/projects where namespaces are manipulated during runtime
124 | # and thus existing member attributes cannot be deduced by static analysis. It
125 | # supports qualified module names, as well as Unix pattern matching.
126 | ignored-modules=
127 |
128 | # List of class names for which member attributes should not be checked (useful
129 | # for classes with dynamically set attributes). This supports the use of
130 | # qualified names.
131 | ignored-classes=optparse.Values,thread._local,_thread._local
132 |
133 | # List of members which are set dynamically and missed by pylint inference
134 | # system, and so shouldn't trigger E1101 when accessed. Python regular
135 | # expressions are accepted.
136 | generated-members=
137 |
138 | # List of decorators that produce context managers, such as
139 | # contextlib.contextmanager. Add to this list to register other decorators that
140 | # produce valid context managers.
141 | contextmanager-decorators=contextlib.contextmanager
142 |
143 |
144 | [SIMILARITIES]
145 |
146 | # Minimum lines number of a similarity.
147 | min-similarity-lines=4
148 |
149 | # Ignore comments when computing similarities.
150 | ignore-comments=yes
151 |
152 | # Ignore docstrings when computing similarities.
153 | ignore-docstrings=yes
154 |
155 | # Ignore imports when computing similarities.
156 | ignore-imports=no
157 |
158 |
159 | [BASIC]
160 |
161 | # Good variable names which should always be accepted, separated by a comma
162 | good-names=i,j,k,ex,Run,_
163 |
164 | # Bad variable names which should always be refused, separated by a comma
165 | bad-names=foo,bar,baz,toto,tutu,tata
166 |
167 | # Colon-delimited sets of names that determine each other's naming style when
168 | # the name regexes allow several styles.
169 | name-group=
170 |
171 | # Include a hint for the correct naming format with invalid-name
172 | include-naming-hint=no
173 |
174 | # List of decorators that produce properties, such as abc.abstractproperty. Add
175 | # to this list to register other decorators that produce valid properties.
176 | property-classes=abc.abstractproperty
177 |
178 | # Regular expression matching correct class attribute names
179 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
180 |
181 | # Naming hint for class attribute names
182 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
183 |
184 | # Regular expression matching correct method names
185 | method-rgx=[a-z_][a-z0-9_]{2,30}$
186 |
187 | # Naming hint for method names
188 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
189 |
190 | # Regular expression matching correct constant names
191 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
192 |
193 | # Naming hint for constant names
194 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
195 |
196 | # Regular expression matching correct argument names
197 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
198 |
199 | # Naming hint for argument names
200 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
201 |
202 | # Regular expression matching correct variable names
203 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
204 |
205 | # Naming hint for variable names
206 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
207 |
208 | # Regular expression matching correct module names
209 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
210 |
211 | # Naming hint for module names
212 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
213 |
214 | # Regular expression matching correct class names
215 | class-rgx=[A-Z_][a-zA-Z0-9]+$
216 |
217 | # Naming hint for class names
218 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
219 |
220 | # Regular expression matching correct function names
221 | function-rgx=[a-z_][a-z0-9_]{2,30}$
222 |
223 | # Naming hint for function names
224 | function-name-hint=[a-z_][a-z0-9_]{2,30}$
225 |
226 | # Regular expression matching correct attribute names
227 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
228 |
229 | # Naming hint for attribute names
230 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
231 |
232 | # Regular expression matching correct inline iteration names
233 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
234 |
235 | # Naming hint for inline iteration names
236 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
237 |
238 | # Regular expression which should only match function or class names that do
239 | # not require a docstring.
240 | no-docstring-rgx=^_
241 |
242 | # Minimum line length for functions/classes that require docstrings, shorter
243 | # ones are exempt.
244 | docstring-min-length=-1
245 |
246 |
247 | [ELIF]
248 |
249 | # Maximum number of nested blocks for function / method body
250 | max-nested-blocks=5
251 |
252 |
253 | [VARIABLES]
254 |
255 | # Tells whether we should check for unused import in __init__ files.
256 | init-import=no
257 |
258 | # A regular expression matching the name of dummy variables (i.e. expectedly
259 | # not used).
260 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
261 |
262 | # List of additional names supposed to be defined in builtins. Remember that
263 | # you should avoid to define new builtins when possible.
264 | additional-builtins=
265 |
266 | # List of strings which can identify a callback function by name. A callback
267 | # name must start or end with one of those strings.
268 | callbacks=cb_,_cb
269 |
270 | # List of qualified module names which can have objects that can redefine
271 | # builtins.
272 | redefining-builtins-modules=six.moves,future.builtins
273 |
274 |
275 | [LOGGING]
276 |
277 | # Logging modules to check that the string format arguments are in logging
278 | # function parameter format
279 | logging-modules=logging
280 |
281 |
282 | [FORMAT]
283 |
284 | # Maximum number of characters on a single line.
285 | max-line-length=100
286 |
287 | # Regexp for a line that is allowed to be longer than the limit.
288 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
289 |
290 | # Allow the body of an if to be on the same line as the test if there is no
291 | # else.
292 | single-line-if-stmt=no
293 |
294 | # List of optional constructs for which whitespace checking is disabled. `dict-
295 | # separator` is used to allow tabulation in dicts, etc.: {1 : 1,\n222: 2}.
296 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
297 | # `empty-line` allows space-only lines.
298 | no-space-check=trailing-comma,dict-separator
299 |
300 | # Maximum number of lines in a module
301 | max-module-lines=1000
302 |
303 | # String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
304 | # tab).
305 | indent-string='    '
306 |
307 | # Number of spaces of indent required inside a hanging or continued line.
308 | indent-after-paren=4
309 |
310 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
311 | expected-line-ending-format=
312 |
313 |
314 | [MISCELLANEOUS]
315 |
316 | # List of note tags to take in consideration, separated by a comma.
317 | notes=FIXME,XXX,TODO
318 |
319 |
320 | [CLASSES]
321 |
322 | # List of method names used to declare (i.e. assign) instance attributes.
323 | defining-attr-methods=__init__,__new__,setUp
324 |
325 | # List of valid names for the first argument in a class method.
326 | valid-classmethod-first-arg=cls
327 |
328 | # List of valid names for the first argument in a metaclass class method.
329 | valid-metaclass-classmethod-first-arg=mcs
330 |
331 | # List of member names, which should be excluded from the protected access
332 | # warning.
333 | exclude-protected=_asdict,_fields,_replace,_source,_make
334 |
335 |
336 | [DESIGN]
337 |
338 | # Maximum number of arguments for function / method
339 | max-args=5
340 |
341 | # Argument names that match this expression will be ignored. Default to name
342 | # with leading underscore
343 | ignored-argument-names=_.*
344 |
345 | # Maximum number of locals for function / method body
346 | max-locals=15
347 |
348 | # Maximum number of return / yield for function / method body
349 | max-returns=6
350 |
351 | # Maximum number of branch for function / method body
352 | max-branches=12
353 |
354 | # Maximum number of statements in function / method body
355 | max-statements=50
356 |
357 | # Maximum number of parents for a class (see R0901).
358 | max-parents=7
359 |
360 | # Maximum number of attributes for a class (see R0902).
361 | max-attributes=7
362 |
363 | # Minimum number of public methods for a class (see R0903).
364 | min-public-methods=2
365 |
366 | # Maximum number of public methods for a class (see R0904).
367 | max-public-methods=20
368 |
369 | # Maximum number of boolean expressions in a if statement
370 | max-bool-expr=5
371 |
372 |
373 | [IMPORTS]
374 |
375 | # Deprecated modules which should not be used, separated by a comma
376 | deprecated-modules=optparse
377 |
378 | # Create a graph of every (i.e. internal and external) dependencies in the
379 | # given file (report RP0402 must not be disabled)
380 | import-graph=
381 |
382 | # Create a graph of external dependencies in the given file (report RP0402 must
383 | # not be disabled)
384 | ext-import-graph=
385 |
386 | # Create a graph of internal dependencies in the given file (report RP0402 must
387 | # not be disabled)
388 | int-import-graph=
389 |
390 | # Force import order to recognize a module as part of the standard
391 | # compatibility libraries.
392 | known-standard-library=
393 |
394 | # Force import order to recognize a module as part of a third party library.
395 | known-third-party=enchant
396 |
397 | # Analyse import fallback blocks. This can be used to support both Python 2 and
398 | # 3 compatible code, which means that the block might have code that exists
399 | # only in one or another interpreter, leading to false positives when analysed.
400 | analyse-fallback-blocks=no
401 |
402 |
403 | [EXCEPTIONS]
404 |
405 | # Exceptions that will emit a warning when being caught. Defaults to
406 | # "Exception"
407 | overgeneral-exceptions=Exception
408 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython>=1.67
2 | regex>=2016.8.27
3 | tqdm>=4.8.4
4 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 |
3 | from setuptools import setup, find_packages
4 |
# Package definition for the ``perf_ssr`` distribution.
setup(
    # --- Identity -----------------------------------------------------------
    name='perf_ssr',
    version='0.4.6',
    description='PERF is an exhaustive repeat finder',
    keywords='ssr microsatellites',
    url='https://github.com/rkmlab/perf',
    license='MIT',
    # --- Authorship ---------------------------------------------------------
    author='Divya Tej Sowpati',
    author_email='tej@ccmb.res.in',
    # --- Contents -----------------------------------------------------------
    packages=find_packages(),
    # Change path according to package name in MANIFEST.in.
    include_package_data=True,
    # --- Dependencies -------------------------------------------------------
    # biopython version 1.69 installs numpy.
    install_requires=[
        'biopython==1.69',
        'tqdm>=4',
    ],
    # --- Command-line entry point ------------------------------------------
    # Installs a ``PERF`` console command that dispatches to PERF.core:main.
    entry_points={
        'console_scripts': [
            'PERF=PERF.core:main',
        ],
    },
)
--------------------------------------------------------------------------------
/test_data/repeat_options.txt:
--------------------------------------------------------------------------------
1 | A A 1 +
2 | T A 1 -
3 | AG AG 2 +
4 | CT AG 2 -
5 | GA AG 2 +
6 | TC AG 2 -
7 |
--------------------------------------------------------------------------------
/test_data/test_input.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RKMlab/perf/e343a1fa437033afce5b5a794079230530619983/test_data/test_input.fastq.gz
--------------------------------------------------------------------------------
/test_data/unit_options.txt:
--------------------------------------------------------------------------------
1 | 1 12
2 | 2 6
3 | 3 4
4 | 4 3
5 | 5 2
6 | 6 2
7 |
--------------------------------------------------------------------------------
/utils/repeat_generator.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 |
3 | # pylint: disable=C0103
4 | from __future__ import print_function
5 | import sys
6 | import argparse
7 | from itertools import product
8 |
# Command-line interface: bounds on the motif size plus an optional output
# file. The arguments are parsed as soon as the script is executed.
parser = argparse.ArgumentParser()

# Smallest motif length (in bp) to generate.
parser.add_argument(
    '-m', '--min-motif-size',
    type=int,
    metavar='',
    default=1,
    help='Minimum size of a repeat motif in bp',
)

# Largest motif length (in bp) to generate.
parser.add_argument(
    '-M', '--max-motif-size',
    type=int,
    metavar='',
    default=6,
    help='Maximum size of a repeat motif in bp',
)

# Destination of the generated table; defaults to standard output.
parser.add_argument(
    '-fo', '--out',
    type=argparse.FileType('w'),
    metavar='',
    default=sys.stdout,
    help='Output file',
)

args = parser.parse_args()
14 |
def rev_comp(string):
    """Return the reverse complement of a DNA motif.

    ``A``/``T`` and ``C``/``G`` are swapped and the result is reversed. Any
    other character is passed through unchanged.

    A plain dictionary lookup is used instead of ``str.maketrans`` because
    the latter only exists on Python 3, while this script also targets
    Python 2 (see the ``__future__`` import at the top of the file).
    """
    complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    return ''.join(complement.get(base, base) for base in reversed(string))
18 |
19 |
def expand_repeat(string, size):
    """Repeat a motif cyclically until it is exactly ``size`` characters long.

    The motif is tiled end to end and truncated, so ``expand_repeat('AC', 5)``
    gives ``'ACACA'``. If ``size`` is shorter than the motif, the leading
    ``size`` characters are returned. A non-positive ``size`` yields ``''``.

    Raises:
        ValueError: if ``string`` is empty while ``size`` is positive, since
            an empty motif can never be tiled to a positive length.
    """
    if size <= 0:
        return ''
    if not string:
        raise ValueError('cannot expand an empty motif')
    # Ceiling division: the fewest whole copies that cover ``size`` characters.
    copies = -(-size // len(string))
    return (string * copies)[:size]
29 |
30 |
def get_cycles(string):
    """Return all cyclic rotations of ``string``, in left-rotation order.

    The first element is ``string`` itself; element ``i`` is ``string``
    rotated left by ``i`` characters. Rotations are not de-duplicated, so a
    motif such as ``'AA'`` yields ``['AA', 'AA']``. An empty string yields an
    empty list.
    """
    return [string[i:] + string[:i] for i in range(len(string))]
36 |
37 |
def _emit_cycles(motif, repeat_class, strand, max_size, expanded_set, repeat_set, output_file):
    """Register every rotation of ``motif`` and print the ones not seen yet."""
    for cycle in get_cycles(motif):
        # Remember the expanded form so later motifs that are rotations or
        # multiples of this one are recognised as already covered.
        expanded_set.add(expand_repeat(cycle, max_size))
        if cycle not in repeat_set:
            repeat_set.add(cycle)
            print(cycle, repeat_class, str(len(cycle)), strand, sep='\t', file=output_file)


def generate_repeats(min_size, max_size, output_file):
    """Write a table of repeat motifs of length ``min_size``..``max_size``.

    Every ``ACGT`` combination of each length is visited in
    ``itertools.product`` order. A motif whose expansion to ``max_size`` has
    already been recorded is skipped. Otherwise all of its rotations are
    written with strand ``+``, followed by all rotations of its reverse
    complement with strand ``-`` (unless the motif is its own reverse
    complement).

    Each output line is tab-separated: motif, the motif that introduced its
    class, motif length, strand.

    Args:
        min_size: smallest motif length to generate; must be at least 1.
        max_size: largest motif length to generate; also the length motifs
            are expanded to when checking for duplicates.
        output_file: writable file object receiving the table.

    Raises:
        ValueError: if ``min_size`` is below 1. A zero-length motif cannot be
            expanded, and previously surfaced as an ``IndexError``.
    """
    if min_size < 1:
        raise ValueError('min_size must be at least 1')

    alphabet = ['A', 'C', 'G', 'T']
    expanded_set = set()  # motifs expanded to max_size, used to detect duplicates
    repeat_set = set()    # motifs already written to the output

    for size in range(min_size, max_size + 1):
        for combination in product(alphabet, repeat=size):
            repeat = ''.join(combination)
            if expand_repeat(repeat, max_size) in expanded_set:
                continue

            _emit_cycles(repeat, repeat, '+', max_size,
                         expanded_set, repeat_set, output_file)

            repeat_revcomp = rev_comp(repeat)
            if repeat_revcomp != repeat:
                # The second column stays the forward motif so both strands
                # are reported under the same class.
                _emit_cycles(repeat_revcomp, repeat, '-', max_size,
                             expanded_set, repeat_set, output_file)
68 |
# Run the generator with the parsed command-line options.
min_motif_size = args.min_motif_size
max_motif_size = args.max_motif_size
# The output option is declared as "-fo/--out", so argparse stores it under
# ``args.out``; reading ``args.output`` raised AttributeError on every run.
generate_repeats(min_motif_size, max_motif_size, args.out)
72 |
--------------------------------------------------------------------------------