├── .gitignore
├── LICENSE
├── MANIFEST.in
├── PERF
    ├── __init__.py
    ├── all_repeats_1-6nt.txt
    ├── analyse.py
    ├── annotation.py
    ├── core.py
    ├── fastq_utils.py
    ├── lib
    │   ├── src
    │   │   ├── all.js
    │   │   ├── anno_charts.js
    │   │   ├── apexcharts.min.js
    │   │   ├── jquery-3.5.0.min.js
    │   │   ├── jquery.multi-select.min.js
    │   │   ├── lodash.min.js
    │   │   ├── main_fasta.js
    │   │   ├── main_fastq.js
    │   │   ├── semantic.min.js
    │   │   ├── tables_fasta.js
    │   │   └── tables_fastq.js
    │   ├── styles
    │   │   ├── apexcharts.min.css
    │   │   ├── main.css
    │   │   ├── multi-select.min.css
    │   │   └── semantic.min.css
    │   ├── template_fasta.html
    │   └── template_fastq.html
    ├── rep_utils.py
    └── utils.py
├── README.md
├── README.rst
├── pylint.rc
├── requirements.txt
├── setup.cfg
├── setup.py
├── test_data
    ├── repeat_options.txt
    ├── test.fastq_perf.html
    ├── test_input.fa
    ├── test_input.fastq.gz
    ├── test_input.fastq_perf.html
    ├── test_input.fastq_perf.tsv
    ├── test_input_perf.html
    ├── test_input_perf.tsv
    └── unit_options.txt
└── utils
    └── repeat_generator.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | .vscode/
 2 | 
 3 | # Compiled python modules.
 4 | *.pyc
 5 | 
 6 | # Setuptools distribution folder.
 7 | /dist/
 8 | 
 9 | # Python egg metadata, regenerated from source files by setuptools.
10 | /*.egg-info
11 | 
12 | /build/
13 | 
14 | /test_data/million_100.fastq
15 | /test_data/tenK*


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 
 4 | Divya Tej Sowpati & Akshay Kumar Avvaru, 
 5 | Lab of Dr. Rakesh Mishra
 6 | 
 7 | Permission is hereby granted, free of charge, to any person obtaining a copy
 8 | of this software and associated documentation files (the "Software"), to deal
 9 | in the Software without restriction, including without limitation the rights
10 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
11 | copies of the Software, and to permit persons to whom the Software is
12 | furnished to do so, subject to the following conditions:
13 | 
14 | The above copyright notice and this permission notice shall be included in all
15 | copies or substantial portions of the Software.
16 | 
17 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
20 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23 | SOFTWARE.
24 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include PERF/lib/src/*js
2 | include PERF/lib/template_fasta.html
3 | include PERF/lib/template_fastq.html
4 | include PERF/lib/styles/*css
5 | 


--------------------------------------------------------------------------------
/PERF/__init__.py:
--------------------------------------------------------------------------------
1 | #! /usr/bin/env python
2 | 


--------------------------------------------------------------------------------
/PERF/analyse.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | from __future__ import print_function, division
  4 | import sys, os, json
  5 | from collections import Counter, defaultdict
  6 | import numpy as np
  7 | from pprint import pprint
  8 | 
  9 | if sys.version_info.major == 2:
 10 |     from utils import rev_comp, kmers, get_cycles, build_cycVariations
 11 | elif sys.version_info.major == 3:
 12 |     from .utils import rev_comp, kmers, get_cycles, build_cycVariations
 13 | 
 14 | 
 15 | def writetoHTML(html_file, defaultInfo, repeat_options, input_format):
 16 |     html_handle = open(html_file, 'w')
 17 |     current_dir = os.path.dirname(__file__)
 18 | 
 19 |     template = open('%s/lib/template_%s.html' %(current_dir, input_format), 'r').read()
 20 | 
 21 |     fontawesome_js = open('%s/lib/src/all.js' %(current_dir), 'r').read()
 22 |     semantic_css = open('%s/lib/styles/semantic.min.css' %(current_dir), 'r').read()
 23 |     multiselect_css = open('%s/lib/styles/multi-select.min.css' %(current_dir), 'r').read()
 24 |     apexcharts_css = open('%s/lib/styles/apexcharts.min.css' %(current_dir), 'r').read()
 25 |     main_css = open('%s/lib/styles/main.css' %(current_dir), 'r').read()
 26 | 
 27 |     jquery_js = open("%s/lib/src/jquery-3.5.0.min.js" %(current_dir), "r").read()
 28 |     semantic_js = open("%s/lib/src/semantic.min.js" %(current_dir), "r").read()
 29 |     multiselect_js = open('%s/lib/src/jquery.multi-select.min.js' %(current_dir), 'r').read()
 30 |     apexcharts_js = open('%s/lib/src/apexcharts.min.js' %(current_dir), 'r').read()
 31 |     lodash_js = open('%s/lib/src/lodash.min.js' %(current_dir), 'r').read()
 32 |     main_js = open('%s/lib/src/main_%s.js' %(current_dir, input_format), 'r').read()
 33 |     tables_js = open('%s/lib/src/tables_%s.js' %(current_dir, input_format), 'r').read()
 34 |     annocharts_js = ''
 35 |     if input_format == 'fasta':
 36 |         annocharts_js = open('%s/lib/src/anno_charts.js' %(current_dir), 'r').read()
 37 | 
 38 |     template = template.format(
 39 |         fontawesome_js = fontawesome_js, 
 40 |         semantic_css = semantic_css, 
 41 |         multiselect_css = multiselect_css, 
 42 |         apexcharts_css = apexcharts_css, 
 43 |         main_css = main_css, 
 44 |         jquery_js = jquery_js, 
 45 |         semantic_js = semantic_js, 
 46 |         multiselect_js = multiselect_js, 
 47 |         apexcharts_js = apexcharts_js, 
 48 |         lodash_js = lodash_js, 
 49 |         analyse_data_js = defaultInfo, 
 50 |         main_js = main_js, 
 51 |         tables_js = tables_js, 
 52 |         annocharts_js = annocharts_js,
 53 |         repeat_options = repeat_options,
 54 |     )
 55 | 
 56 |     print(template, file=html_handle)
 57 |     html_handle.close()
 58 |     print("HTML report successfully saved to " + html_file)
 59 | 
 60 | 
 61 | def get_parameters(args):
 62 |     runCommand = 'PERF' + ' '.join(sys.argv)
 63 | 
 64 | 
 65 | def analyse_fasta(args):
 66 |     repeatsOutFile = args.output.name
 67 |     html_report = os.path.splitext(repeatsOutFile)[0] + '.html'
 68 |     print("\nGenerating HTML report. This may take a while..", end="\n\n")
 69 |     
 70 | 
 71 |     all_repeat_classes = []
 72 |     kmer_classes = defaultdict(list)
 73 |     cyclical_variations = dict()
 74 |     for r in args.repeats:
 75 |         r = r.split('\t')[1]
 76 |         if r not in all_repeat_classes:
 77 |             all_repeat_classes.append(r)
 78 |             cyclical_variations[r] = build_cycVariations(r)
 79 | 
 80 |     inf = float('inf')
 81 |     defaultInfo = {}
 82 |     defaultInfo['info'] = { 'seqInfo': {}, 'repInfo': {} }
 83 |     
 84 |     if args.annotate: #if annotation is on the data is taken from t
 85 |         repeatsOutFile = os.path.splitext(repeatsOutFile)[0] + '_annotation.tsv'
 86 |         promUp = args.up_promoter
 87 |         promDown = args.down_promoter
 88 |         defaultInfo['info']['annoInfo'] = {'promUp': promUp, 'promDown': promDown}
 89 |         repAnno = {}
 90 |         TSS_dist = {}
 91 |         annoKeyDict = {}
 92 |     
 93 |     totalRepBases = 0
 94 |     totalRepFreq = 0
 95 |     longestLengths = [['seq', 'start', 'stop', 'repClass', 0, '+', 0, 'actualrep']]*100
 96 |     mostUnits = [['seq', 'start', 'stop', 'repClass', 0, '+', 0, 'actualrep']]*100
 97 |     minLength = inf
 98 |     minUnits = inf
 99 | 
100 |     plot_data = dict()
101 |     with open(repeatsOutFile, 'r') as repFile:
102 |         for line in repFile:
103 |             line = line.strip()
104 |             if line.startswith('#'):
105 |                 fields = line[1:].split(': ')
106 |                 defaultInfo['info']['seqInfo'][fields[0]] = fields[1]
107 |             else:
108 |                 fields = line.split('\t')
109 |                 fields[1] = int(fields[1])
110 |                 fields[2] = int(fields[2])
111 |                 fields[4] = int(fields[4])
112 |                 fields[6] = int(fields[6])
113 | 
114 |                 seq = fields[0]
115 |                 start = fields[1]
116 |                 end = fields[2]
117 |                 repClass = fields[3]
118 |                 repLength = fields[4]
119 |                 repOri = fields[5]
120 |                 repUnit = fields[6]
121 |                 actualRepeat = fields[7]
122 |                 if args.annotate:
123 |                     if repClass not in repAnno:
124 |                         repAnno[repClass] = {'EP': 0, 'GP': 0, 'GN': 0, 'IP': 0, 'DP': 0, 'EN': 0, 'IN': 0, 'DN': 0, 'UU': 0}
125 |                         TSS_dist[repClass] = []
126 |                     genicKey = fields[12]
127 |                     promKey = fields[13]
128 |                     try:
129 |                         tssD = int(fields[-1])
130 |                         if -5000 <= tssD <= 5000:
131 |                             TSS_dist[repClass].append(tssD)
132 |                     except:
133 |                         pass
134 |                     if genicKey == 'Intergenic':
135 |                         genicKey = 'Distal Intergenic'
136 |                     elif genicKey == '-':
137 |                         genicKey = 'Unannotated'
138 |                         promKey = 'Unannotated'
139 |                     annoKey = genicKey[0]+promKey[0]
140 |                     annoKeyDict[annoKey] = genicKey + '+' + promKey
141 |                     repAnno[repClass][annoKey] += 1 
142 | 
143 |                 totalRepBases += repLength
144 |                 totalRepFreq += 1
145 | 
146 |                 if repClass not in plot_data:
147 |                     plot_data[repClass] = dict()
148 |                     plot_data[repClass][repLength] = [0]*len(cyclical_variations[repClass])
149 |                 if repLength not in plot_data[repClass]: 
150 |                     plot_data[repClass][repLength] = [0]*len(cyclical_variations[repClass])
151 |                 plot_data[repClass][repLength][cyclical_variations[repClass].index(actualRepeat)] += 1
152 | 
153 |                 if minUnits > repUnit: minUnits = repUnit
154 |                 if minLength > repLength: minLength = repLength
155 | 
156 |                 if (longestLengths[-1][4] < repLength) or (longestLengths[-1][4] == repLength and repClass < longestLengths[-1][3]):
157 |                     longestLengths[-1] = fields
158 |                     longestLengths.sort(key=lambda x: x[4])
159 |                     longestLengths.reverse()
160 |                 if (mostUnits[-1][6] < repUnit) or (mostUnits[-1][6] == repUnit and repClass < longestLengths[-1][3]):
161 |                     mostUnits[-1] = fields
162 |                     mostUnits.sort(key=lambda x: x[6])
163 |                     mostUnits.reverse()
164 |     for r in all_repeat_classes:
165 |         kmer_classes[kmers[len(r)]].append(r)
166 |         if r not in plot_data:
167 |             plot_data[r] = 0
168 | 
169 |     repeat_options = ""
170 |     for kmer in kmer_classes:
171 |         repeat_options += '<optgroup label="%s">' %(kmer)
172 |         for r in kmer_classes[kmer]:
173 |             repeat_options += '<option value="%s">%s</option>' %(r, r)
174 |         repeat_options += '</optgroup>'
175 |     
176 |     totalBases = int(defaultInfo['info']['seqInfo']['Total_bases'])
177 |     defaultInfo['info']['repInfo']['lenFrequency'] = plot_data
178 |     defaultInfo['info']['repInfo']['numRepClasses'] = str(len(plot_data.keys())) + '/' + str(len(all_repeat_classes))
179 |     defaultInfo['info']['repInfo']['totalRepBases'] = totalRepBases
180 |     defaultInfo['info']['repInfo']['totalRepFreq'] = totalRepFreq
181 |     defaultInfo['info']['repInfo']['percentGenomeCovered'] = str(round((totalRepBases/totalBases)*100, 2)) + "%"
182 |     defaultInfo['info']['repInfo']['repDensityByFreq'] = round((totalRepFreq/totalBases)*1000000, 2)
183 |     defaultInfo['info']['repInfo']['repDensityByBases'] = round((totalRepBases/totalBases)*1000000, 2)
184 |     defaultInfo['info']['repInfo']['minLength'] = minLength
185 |     defaultInfo['info']['repInfo']['minUnits'] = minUnits
186 |     defaultInfo['info']['repInfo']['longestRepeats'] = []
187 |     defaultInfo['info']['repInfo']['mostRepeatUnits'] = []
188 |     defaultInfo['info']['repInfo']['allRepClasses'] = all_repeat_classes
189 |     if args.annotate:
190 |         for r in TSS_dist:
191 |             hist_values = np.histogram(TSS_dist[r], bins=200, range=(-5000,5000))
192 |             TSS_dist[r] = list(hist_values[0])
193 |             TSS_dist[r] = list(map(lambda x: int(x), TSS_dist[r]))
194 |         defaultInfo['info']['annoInfo']['TSS_histBinEdges'] = list(map(lambda x: int(x), hist_values[1]))
195 |         defaultInfo['info']['annoInfo']['repAnno'] = repAnno
196 |         defaultInfo['info']['annoInfo']['TSS_dist'] = TSS_dist
197 |         defaultInfo['info']['annoInfo']['annoKeyObj'] = annoKeyDict
198 |     for a in longestLengths:
199 |         testDict = {'seq': a[0], 'start': a[1], 'end': a[2], 'repClass': a[3], 'repLength': a[4], 'repOri': a[5], 'repUnit': a[6], 'actualRep': a[7]}
200 |         defaultInfo['info']['repInfo']['longestRepeats'].append(testDict)
201 |     for a in mostUnits:
202 |         testDict = {'seq': a[0], 'start': a[1], 'end': a[2], 'repClass': a[3], 'repLength': a[4], 'repOri': a[5], 'repUnit': a[6], 'actualRep': a[7]}
203 |         defaultInfo['info']['repInfo']['mostRepeatUnits'].append(testDict)
204 |     defaultInfo = 'const data =' + json.dumps(defaultInfo)
205 |     writetoHTML(html_report, defaultInfo, repeat_options, 'fasta')
206 | 
def analyse_fastq(args, fastq_out):
    """Complete the FASTQ statistics in ``fastq_out`` and write the HTML report.

    ``fastq_out`` is updated in place: the input file name, the list of all
    requested repeat classes and several normalised figures (per million
    reads / bases, or as percentages) are added before the structure is
    serialised to ``const data = {...}`` and passed to ``writetoHTML``.

    Args:
        args: Parsed command-line namespace. Uses ``output.name``, ``input``
            and ``repeats`` (tab-separated strings whose second column is the
            repeat class).
        fastq_out: Nested dictionary with ``info.seqInfo`` (``Total_reads``,
            ``Total_bases``) and ``info.repInfo`` (``totalRepFreq``,
            ``totalRepReads``, ``totalRepBases``, ``numRepClasses``,
            ``repClassInfo``) already filled in.
    """
    html_report = os.path.splitext(args.output.name)[0] + '.html'
    print("\nGenerating HTML report. This may take a while..", end="\n\n")

    seq_info = fastq_out['info']['seqInfo']
    rep_info = fastq_out['info']['repInfo']

    seq_info['File_name'] = args.input.split('/')[-1]
    n = seq_info['Total_reads']
    b = seq_info['Total_bases']
    total_repeats = rep_info['totalRepFreq']
    reads_with_repeats = rep_info['totalRepReads']
    total_repeat_bases = rep_info['totalRepBases']

    # Unique repeat classes, keeping the order of first appearance.
    requested = [entry.split('\t')[1] for entry in args.repeats]
    all_repeat_classes = []
    for rep_class in requested:
        if rep_class not in all_repeat_classes:
            all_repeat_classes.append(rep_class)

    # Group the classes by k-mer label for the <optgroup> drop-down.
    kmer_classes = defaultdict(list)
    for rep_class in all_repeat_classes:
        kmer_classes[kmers[len(rep_class)]].append(rep_class)

    option_chunks = []
    for kmer, members in kmer_classes.items():
        option_chunks.append('<optgroup label="%s">' %(kmer))
        for member in members:
            option_chunks.append('<option value="%s">%s</option>' %(member, member))
        option_chunks.append('</optgroup>')
    repeat_options = "".join(option_chunks)

    rep_info['numRepClasses'] = str(rep_info['numRepClasses']) + '/' + str(len(all_repeat_classes))
    rep_info['allRepClasses'] = all_repeat_classes
    rep_info['totalRepFreqNorm'] = round((total_repeats/n)*1000000, 2)
    rep_info['totalRepReadsNorm'] = str(round((reads_with_repeats/n)*100, 2)) + '%'
    rep_info['percentRepBases'] = str(round((total_repeat_bases/b)*100, 2)) + '%'

    # Per-class figures normalised per million reads / bases.
    class_info = rep_info['repClassInfo']
    for rep in sorted(class_info, key=lambda k: (len(k), k)):
        stats = class_info[rep]
        stats['reads_norm'] = round((stats['reads']/n)*1000000, 3)
        stats['instances_norm'] = round((stats['instances']/n)*1000000, 3)
        stats['bases_norm'] = round((stats['bases']/b)*1000000, 3)

    defaultInfo = 'const data =' + json.dumps(fastq_out)
    writetoHTML(html_report, defaultInfo, repeat_options, 'fastq')
251 | 


--------------------------------------------------------------------------------
/PERF/annotation.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | from __future__ import print_function, division
  3 | from operator import itemgetter
  4 | from collections import defaultdict
  5 | from tqdm import tqdm
  6 | import os, sys, gzip
  7 | 
  8 | if sys.version_info.major == 2:
  9 |     from utils import rawcharCount
 10 | elif sys.version_info.major == 3:
 11 |     from .utils import rawcharCount
 12 | 
 13 | """
 14 | 
 15 |     CAUTION: Works currently for only sorted bed files.
 16 | 
 17 |     Preferential order of assigning annotation:
 18 |         Promoter >> Overlapping >> Intergenic
 19 | 
 20 |     Defaults:
 21 |         > Promoter distance is 1kb upstream and downstream of TSS.
 22 |         > Gene id considered is "gene".
 23 | 
 24 |     Built by checking on GFF3 file.
 25 | 
 26 |     # Sample GTF
 27 |     1 transcribed_unprocessed_pseudogene  gene        11869 14409 . + . gene_id "ENSG00000223972"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; 
 28 |     1 processed_transcript                transcript  11869 14409 . + . gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1"; gene_source "havana"; gene_biotype "transcribed_unprocessed_pseudogene"; transcript_name "DDX11L1-002"; transcript_source "havana";
 29 | 
 30 |     # Sample GFF
 31 |     X	Ensembl	Repeat	2419108	2419128	42	.	.	hid=trf; hstart=1; hend=21
 32 |     X	Ensembl	Repeat	2419108	2419410	2502	-	.	hid=AluSx; hstart=1; hend=303
 33 |     X	Ensembl	Repeat	2419108	2419128	0	.	.	hid=dust; hstart=2419108; hend=2419128
 34 | """
 35 | 
 36 | def select_anno(List):
 37 |     """Function to assign the hierarchically right choice of annotation"""
 38 |     if 'Exon' in List:
 39 |         return 'Exon'
 40 |     elif 'Intron' in List:
 41 |         return 'Intron'
 42 |     elif 'Genic' in List:
 43 |         return 'Genic'
 44 |     elif 'Intergenic' in List:
 45 |         return 'Intergenic'
 46 | 
 47 | 
 48 | def promoter(check):
 49 |     if check == 1:
 50 |         return 'Promoter'
 51 |     else:
 52 |         return 'Non-Promoter'
 53 | 
 54 | 
 55 | # Need to be updated for better parsing of the attributes
 56 | def process_attrs(attribute, annotype):
 57 |     
 58 |     """Processes the attribute field to build a dictionary with detailed profiling of the feature"""
 59 |     
 60 |     attr_obj = {}
 61 |     attributes = attribute.split(";")
 62 |     subdelim = " " if annotype=='GTF' else "="
 63 |     for a in attributes:
 64 |         attr = a.split(subdelim)
 65 |         attrName = attr[0].strip()
 66 |         attr_obj[attrName] = attr[1].strip()
 67 |     return attr_obj
 68 | 
 69 | 
 70 | # class GeneKeyError(Exception):
 71 | #     """
 72 | #         Exception raised for gene key which is absent.
 73 | #     """
 74 | 
 75 | #     def __init__(self, expression, message):
 76 | #         self.expression = 
 77 | #         self.message = message
 78 | 
 79 | 
 80 | 
 81 | def process_annofile(annofile, annotype, gene_id):
 82 | 
 83 |     """
 84 |         Processes the input annotation file and builds an object inclusive of all the available genic features.
 85 |         Order of columns for a typical GFF or GTF file:
 86 |             seqname  source  feature start   end score   strand  frame   attribute
 87 | 
 88 |         The output is an object is a constituent of two dictionaries:
 89 |             - An object for all the gene features. (key: chromosome_name, value: list of features in the chromosome).
 90 |             - An object for all the sub gene (exon, cds, etc.) features. (key: chromosome_name, 
 91 |                 value: (key: parent_geneid, value: list of features in the chromosome))
 92 |        
 93 |         The features for each chromosome are sorted based on their starts for easier downstream processing.
 94 |     """
 95 | 
 96 |     gene_obj = defaultdict(list)
 97 |     subgene_obj = defaultdict()
 98 |     if (annofile.endswith('gz')):
 99 |         annohandle = gzip.open(annofile, 'rt')
100 |     else:
101 |         annohandle = open(annofile, 'r')
102 |     for line in annohandle:
103 |         line = line.strip()
104 |         if line.startswith('#'):
105 |             pass
106 |         else:
107 |             fields = line.split('\t')
108 |             seqname, source, feature = fields[:3]
109 |             start = int(fields[3])
110 |             end = int(fields[4])
111 |             score, strand, frame, attribute = fields[5:9]
112 |             attr_obj = process_attrs(attribute, annotype)
113 | 
114 |             if feature in set(['gene', 'exon']):
115 |                 try:
116 |                     gene_name = attr_obj[gene_id]
117 |                 except KeyError:
118 |                     print('\nGeneKeyError:')
119 |                     print('The attribute "%s" is not among the attributes for gene. Please select a different one.' %(gene_id))
120 |                     print('The available ones are ['+ (", ".join(list(attr_obj.keys()))) +']', end='\n\n')
121 |                     sys.exit(1)
122 | 
123 |             if feature == 'gene':
124 |                 gene_obj[seqname].append([gene_name, start, end, strand])
125 |             elif feature == 'exon':
126 |                 try:
127 |                     subgene_obj[gene_name][feature].append([start, end, strand])
128 |                 except KeyError:
129 |                     subgene_obj[gene_name] = {feature: [[start, end, strand]]}
130 |     for i in gene_obj:
131 |         gene_obj[i] = sorted(gene_obj[i], key=itemgetter(1)) #sorting based on the start of the feature
132 |     for a in subgene_obj:
133 |         for b in subgene_obj[a]:
134 |             subgene_obj[a][b] = sorted(subgene_obj[a][b], key=itemgetter(0)) #sorting based on the start of the feature
135 | 
136 |     return {'gene': gene_obj, 'subgene': subgene_obj}
137 | 
138 | 
def annotate(args):
    """Annotate every repeat of the PERF output with its closest gene feature.

    Reads the repeat table ``args.output.name`` and writes
    ``<output stem>_annotation.tsv``. Header lines (starting with ``#``) are
    copied unchanged. Every repeat line is extended with seven columns: gene
    name, gene start, gene end, gene strand, annotation (Exon / Intron /
    Genic / Intergenic), promoter label and distance. Repeats on a sequence
    that has no gene in the annotation file get ``-`` in all seven columns.

    Assumes the repeat table is sorted by coordinate within each sequence:
    the scan over the (start-sorted) genes of a sequence resumes close to
    where the previous repeat left off.

    Preference when several annotations apply: Exon, Intron, Genic,
    Intergenic (see ``select_anno``).

    Args:
        args: Parsed command-line namespace. Uses ``output.name``,
            ``annotate`` (annotation file), ``anno_format``, ``gene_key``,
            ``up_promoter`` and ``down_promoter``.
    """
    rep_file = args.output.name
    anno_file = args.annotate
    annotype = args.anno_format
    gene_id = args.gene_key

    promUp = args.up_promoter
    promDown = args.down_promoter
    maxPromDist = max([promUp, promDown])

    gffObject = process_annofile(anno_file, annotype, gene_id)
    gene_obj = gffObject['gene']
    subgene_obj = gffObject['subgene']

    print('', end='\n')
    print('Generating annotations for identified repeats..')
    print('', end='\n')
    # Number of lines in the repeat file, used for the progress bar.
    num_records = rawcharCount(rep_file, '\n')
    output_path = os.path.splitext(rep_file)[0] + '_annotation.tsv'

    with open(output_path, 'w') as output_file, open(rep_file) as bed:
        prevSeqName = "Initialise"  # sentinel: no sequence seen yet
        minStartIndex = 0
        for line in tqdm(bed, total=num_records):
            # Candidate output entries for this repeat, per annotation type.
            Annotations = {'Genic': [], 'Exon': [], 'Intron': []}
            line = line.strip()
            if line.startswith('#'):
                print(line, file=output_file)
                continue
            if not line:
                continue

            fields = line.split('\t')
            seqname = fields[0]
            # A new sequence restarts the scan from its first gene.
            if seqname != prevSeqName:
                minStartIndex = 0
            prevSeqName = seqname
            S1 = int(fields[1])
            E1 = int(fields[2])

            # gene_obj is a defaultdict, so indexing never raises KeyError;
            # look the sequence up explicitly and report repeats on
            # sequences without genes as "annotation not available".
            genes = gene_obj.get(seqname)
            if not genes:
                print(line + '\t' + '\t'.join(['-']*7), file=output_file)
                continue

            least_dist = float('inf')
            breakCheck = 0
            promoterCheck = 0
            minIndex = 0
            for i, a in enumerate(genes[minStartIndex:]):
                annotation = ''
                geneName = a[0]
                subgeneElements = subgene_obj.get(geneName, {})
                S2 = a[1]
                E2 = a[2]
                Ori = a[3]
                # Transcription start site: gene end on the minus strand,
                # gene start otherwise.
                TSS = E2 if Ori == '-' else S2

                # Track the gene from which the scan for the next repeat can
                # start: among the genes ending more than the maximum
                # promoter distance before this repeat, the one that ends
                # closest to it. Genes further upstream can be skipped for
                # the following (coordinate-sorted) repeats.
                if i == 0:
                    least_start = S1 - E2
                    minIndex = i
                elif S1 - E2 > maxPromDist and least_start > (S1 - E2):
                    least_start = S1 - E2
                    minIndex = i

                # Stop one gene after the first gene that starts more than
                # the maximum promoter distance beyond the repeat end.
                if breakCheck == 1:
                    break
                if S2 - E1 > maxPromDist:
                    breakCheck = 1

                # Promoter window around the TSS; upstream/downstream are
                # mirrored for the minus strand. The flag stays set for the
                # remaining genes of this repeat.
                if Ori == '-' and (TSS-promDown <= S1 <= TSS+promUp or TSS-promDown <= E1 <= TSS+promUp):
                    promoterCheck = 1
                elif Ori == '+' and (TSS-promUp <= S1 <= TSS+promDown or TSS-promUp <= E1 <= TSS+promDown):
                    promoterCheck = 1

                if (E2 - S1 >= 0 and S2 - S1 <= 0) or (E2 - E1 >= 0 and S2 - E1 <= 0):
                    # Repeat start or end lies inside the gene.
                    annotation = 'Genic'
                    # A genic overlap supersedes any intergenic candidate.
                    Annotations['Intergenic'] = []
                    diffSS = S2 - S1
                    diffSE = S2 - E1
                    if abs(diffSS) < abs(diffSE):
                        TSSdist = diffSS
                    else:
                        TSSdist = diffSE
                    distance = TSSdist
                    # Refine to Exon / Intron when exons are known.
                    if 'exon' in subgeneElements:
                        for site in subgeneElements['exon']:
                            S3 = site[0]
                            E3 = site[1]
                            if (E3 - S1 >= 0 and S3 - S1 <= 0) or (E3 - E1 >= 0 and S3 - E1 <= 0):
                                annotation = "Exon"
                                break
                            else:
                                annotation = "Intron"
                elif len(Annotations['Exon']) == 0 and len(Annotations['Intron']) == 0 and len(Annotations['Genic']) == 0:
                    # No genic overlap so far: keep the closest gene as the
                    # intergenic candidate.
                    diffSS = S2 - S1
                    diffES = E2 - S1
                    diffSE = S2 - E1
                    diffEE = E2 - E1
                    if abs(diffSS) < abs(diffSE):
                        TSSdist = diffSS
                    else:
                        TSSdist = diffSE
                    minDistance = min([abs(diffSS), abs(diffEE), abs(diffSE), abs(diffES)])
                    annotation = 'Intergenic'
                    distance = TSSdist
                    if minDistance < least_dist:
                        least_dist = minDistance
                        Annotations[annotation] = [line + '\t' + '\t'.join(str(b) for b in a) + '\t' + annotation + '\t' + promoter(promoterCheck) + '\t' + str(distance)]

                if (annotation == "Exon" or annotation == "Intron" or annotation == "Genic"):
                    Annotations[annotation].append(line + '\t' + '\t'.join(str(b) for b in a) + '\t' + annotation + '\t' + promoter(promoterCheck) + '\t' + str(distance))

            minStartIndex += minIndex
            if minStartIndex > 0:
                # Step back one gene to be safe about where to resume.
                minStartIndex = minStartIndex - 1

            # Drop annotation types without candidates.
            for anno in list(Annotations.keys()):
                if len(Annotations[anno]) == 0:
                    del Annotations[anno]
            # Reduce each remaining type to a single entry.
            # NOTE(review): the comparison uses the signed distance, so the
            # most negative value wins rather than the smallest absolute
            # distance -- confirm this is intended.
            for anno in Annotations:
                feature_leastdist = float('inf')
                closest_entry = ""
                for entry in Annotations[anno]:
                    feature_dist = int(entry.split('\t')[-1])
                    if feature_dist < feature_leastdist:
                        feature_leastdist = feature_dist
                        closest_entry = entry
                if closest_entry != "":
                    Annotations[anno] = closest_entry
            if len(Annotations) > 1:
                anno_selected = select_anno(list(Annotations.keys()))
                print(Annotations[anno_selected], file=output_file)
            else:
                for anno in Annotations:
                    print(Annotations[anno], file=output_file)


--------------------------------------------------------------------------------
/PERF/core.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | 
  3 | # pylint: disable=C0103, C0301
  4 | 
  5 | from __future__ import print_function, division
  6 | import sys, argparse, gzip, json, ntpath
  7 | from os.path import splitext
  8 | from datetime import datetime
  9 | import multiprocessing as multi
 10 | 
 11 | if sys.version_info.major == 2:
 12 |     from utils import rawcharCount, dotDict, getGC, get_targetids
 13 |     from rep_utils import generate_repeats, get_ssrs, build_rep_set, fasta_ssrs
 14 |     from fastq_utils import fastq_ssrs
 15 | elif sys.version_info.major == 3:
 16 |     from .utils import rawcharCount, dotDict, getGC, get_targetids
 17 |     from .rep_utils import generate_repeats, get_ssrs, build_rep_set, fasta_ssrs
 18 |     from .fastq_utils import fastq_ssrs
 19 | 
 20 | inf = float('inf')
 21 | 
 22 | 
 23 | def getArgs():
 24 |     """
 25 |     Parses command line arguments and returns them to the caller
 26 |     """
 27 |     __version__ = 'v0.4.6'
 28 |     parser = argparse.ArgumentParser()
 29 |     parser._action_groups.pop()
 30 | 
 31 |     required = parser.add_argument_group('Required arguments')
 32 |     required.add_argument('-i', '--input', required=True, metavar='<FILE>', help='Input sequence file.')
 33 |     
 34 |     optional = parser.add_argument_group('Optional arguments')
 35 |     
 36 |     #Basic options
 37 |     optional.add_argument('-o', '--output', type=argparse.FileType('w'), metavar='<FILE>', default=sys.stdout, help='Output file name. Default: Input file name + _perf.tsv')
 38 |     optional.add_argument('--format', metavar='<STR>', default='fasta', help='Input file format. Default: fasta, Permissible: fasta, fastq')
 39 |     optional.add_argument('--version', action='version', version='PERF ' + __version__)
 40 |         
 41 |     #Selections options based on motif size and seq lengths
 42 |     optional.add_argument('-rep', '--repeats', type=argparse.FileType('r'), metavar='<FILE>', help='File with list of repeats (Not allowed with -m and/or -M)')
 43 |     optional.add_argument('-m', '--min-motif-size', type=int, metavar='<INT>', help='Minimum size of a repeat motif in bp (Not allowed with -rep)')
 44 |     optional.add_argument('-M', '--max-motif-size', type=int, metavar='<INT>', help='Maximum size of a repeat motif in bp (Not allowed with -rep)')
 45 |     optional.add_argument('-s', '--min-seq-length', type=int, metavar = '<INT>', default=0, help='Minimum size of sequence length for consideration (in bp)')
 46 |     optional.add_argument('-S', '--max-seq-length', type=float, metavar='<FLOAT>', default=inf, help='Maximum size of sequence length for consideration (in bp)')
 47 |     optional.add_argument('--include-atomic', action='store_true', default=False, help='An option to include factor atomic repeats for minimum motif sizes greater than 1.')
 48 | 
 49 |     #Cutoff options (min_length or min_units)    
 50 |     cutoff_group = optional.add_mutually_exclusive_group()
 51 |     cutoff_group.add_argument('-l', '--min-length', type=int, metavar='<INT>', help='Minimum length cutoff of repeat')
 52 |     cutoff_group.add_argument('-u', '--min-units', metavar='INT or FILE', help="Minimum number of repeating units to be considered. Can be an integer or a file specifying cutoffs for different motif sizes.")
 53 |     
 54 |     # Analysis options
 55 |     optional.add_argument('-a', '--analyse', action='store_true', default=False, help='Generate a summary HTML report.')
 56 |     optional.add_argument('--info', action='store_true', default=False, help='Sequence file info recorded in the output.')
 57 |     
 58 |     #Annotation options
 59 |     annotation = parser.add_argument_group('Annotation arguments')
 60 |     annotation.add_argument('-g', '--annotate', metavar='<FILE>', help='Genic annotation input file for annotation, Both GFF and GTF can be processed. Use --anno-format to specify format.')
 61 |     annotation.add_argument('--anno-format', metavar='<STR>',default='GFF', type=str, help='Format of genic annotation file. Valid inputs: GFF, GTF. Default: GFF')
 62 |     annotation.add_argument('--gene-key', metavar='<STR>', default='gene', type=str, help='Attribute key for geneId. The default identifier is "gene". Please check the annotation file and pick a robust gene identifier from the attribute column.')
 63 |     annotation.add_argument('--up-promoter', metavar='<INT>', type=int, default=1000, help='Upstream distance(bp) from TSS to be considered as promoter region. Default 1000')
 64 |     annotation.add_argument('--down-promoter', metavar='<INT>', type=int, default=1000, help='Downstream distance(bp) from TSS to be considered as promoter region. Default 1000')    
 65 |     
 66 |     
 67 |     #Filter based on seqIds
 68 |     seqid_group = optional.add_mutually_exclusive_group()
 69 |     seqid_group.add_argument('-f', '--filter-seq-ids', metavar='<FILE>', help='List of sequence ids in fasta file which will be ignored.')
 70 |     seqid_group.add_argument('-F', '--target-seq-ids', metavar='<FILE>', help='List of sequence ids in fasta file which will be used.')
 71 | 
 72 |     #Multiprocessing threads
 73 |     optional.add_argument('-t', '--threads', type=int, metavar='<INT>', default=1, help='Number of threads to run the process on. Default is 1.')
 74 | 
 75 |     args = parser.parse_args()
 76 | 
 77 |     if args.repeats and (args.min_motif_size or args.max_motif_size):
 78 |         parser.error("-rep is not allowed with -m/-M")
 79 |     if args.repeats is None:
 80 |         if args.min_motif_size is None:
 81 |             args.min_motif_size = 1
 82 |         if args.max_motif_size is None:
 83 |             args.max_motif_size = 6
 84 |     
 85 |     if args.output.name == "<stdout>":
 86 |         args.output = open(splitext(args.input)[0] + '_perf.tsv', 'w')
 87 | 
 88 |     return args
 89 | 
 90 | 
 91 | def ssr_native(args, length_cutoff=False, unit_cutoff=False):
 92 |     """
 93 |     Identifies microsatellites using native string matching.
 94 |     As the entire sequence is scanned in a single iteration, the speed is vastly improved
 95 |     """
 96 |     repeat_file = args.repeats
 97 |     if length_cutoff:
 98 |         length_cutoff = args.min_length
 99 |         repeats_info = build_rep_set(repeat_file, length_cutoff=length_cutoff)
100 |         print('Using length cutoff of %d' % (length_cutoff), file=sys.stderr)
101 |     elif unit_cutoff:
102 |         repeats_info = build_rep_set(repeat_file, unit_cutoff=unit_cutoff)
103 |         print('Using unit cutoff of ', unit_cutoff, file=sys.stderr)
104 | 
105 |     if args.format == 'fasta':
106 |         fasta_ssrs(args, repeats_info)
107 | 
108 |     elif args.format == 'fastq':
109 |         fastq_ssrs(args, repeats_info)
110 | 
111 | 
def _read_unit_cutoffs(units_file):
    """
    Reads per-motif-size minimum unit cutoffs from a whitespace separated
    file whose first two columns are <motif size> <minimum units>.

    Returns a dict mapping motif size to minimum units. Blank lines are
    skipped; any other line that does not start with two integers ends the
    program with an error message.
    """
    unit_cutoff = dict()
    with open(units_file, 'r') as input_units:
        for line in input_units:
            L = line.strip().split()
            if not L:
                continue
            try:
                motif_size = int(L[0])
                min_units = int(L[1])
            except (ValueError, IndexError):
                sys.exit('Invalid file format given for minimum units. Refer to help for more details')
            if min_units == 1:
                print('Warning: Repeat unit of 1 used for size %d.' % (motif_size), file=sys.stderr)
            unit_cutoff[motif_size] = min_units
    return unit_cutoff


def main():
    """
    Main function of the script.

    Parses the command line, builds the repeat set from the motif size range
    when no repeats file was given, and runs the repeat search with either a
    minimum length cutoff, minimum unit cutoffs, or the default minimum
    length of 12.
    """
    args = getArgs()

    # User specifies motif size range instead of giving a repeats file
    if args.repeats is None:
        sizes = list(range(args.min_motif_size, args.max_motif_size+1))
        args.repeats = generate_repeats(sizes, args.include_atomic)

    # User specifies minimum length
    if args.min_length is not None:
        # A non-positive cutoff used to match none of the branches below and
        # silently produced nothing.
        if args.min_length <= 0:
            sys.exit('Minimum length cutoff should be a positive integer')
        ssr_native(args, length_cutoff=args.min_length)

    # User specific minimum number of units
    elif args.min_units is not None:
        unit_cutoff = dict()
        try:
            args.min_units = int(args.min_units)
        except ValueError:
            # Not an integer: treat the value as a path to a cutoffs file.
            try:
                unit_cutoff = _read_unit_cutoffs(args.min_units)
            # IOError/OSError also exist on Python 2, unlike FileNotFoundError.
            except (IOError, OSError):
                sys.exit('Units file specified is not found. Please provide a valid file')
            args.repeats = generate_repeats(list(unit_cutoff.keys()), args.include_atomic)
        else:
            # NOTE(review): when -rep is combined with an integer -u the motif
            # sizes are None here and range() raises TypeError; the sizes
            # would have to come from the repeats file - confirm intended
            # behaviour.
            for m in range(args.min_motif_size, args.max_motif_size+1):
                unit_cutoff[m] = args.min_units
        ssr_native(args, unit_cutoff=unit_cutoff)

    # Default settings
    else:
        args.min_length = 12
        ssr_native(args, length_cutoff=args.min_length)

if __name__ == '__main__':
    main()
166 | 


--------------------------------------------------------------------------------
/PERF/fastq_utils.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # pylint: disable=C0111, C0301
  3 | 
  4 | from __future__ import print_function, division
  5 | from datetime import datetime
  6 | from itertools import islice
  7 | from collections import Counter, defaultdict
  8 | import sys, gzip
  9 | import multiprocessing as multi
 10 | 
 11 | if sys.version_info.major == 2:
 12 |     from utils import dotDict, build_cycVariations
 13 |     from analyse import analyse_fastq
 14 | elif sys.version_info.major == 3:
 15 |     from .utils import dotDict, build_cycVariations
 16 |     from .analyse import analyse_fastq
 17 | 
 18 | 
 19 | def get_ssrs_fastq(seq_record, repeats_info):
 20 |     """Native function to identify repeats in fastq files"""
 21 | 
 22 |     repeats = defaultdict(list)
 23 |     num_repeats = 0
 24 |     read = 0
 25 |     
 26 |     length_cutoffs = repeats_info['cutoff'] # All possible length cutoffs
 27 |     input_seq = str(seq_record.seq).upper()
 28 |     input_seq_length = len(input_seq)
 29 |     for length_cutoff in length_cutoffs:
 30 |         fallback = length_cutoff - 1
 31 |         sub_start = 0  # substring start
 32 |         sub_stop = sub_start + length_cutoff  # substring stop
 33 |         while sub_stop <= input_seq_length:
 34 |             sub_stop = sub_start + length_cutoff
 35 |             sub_seq = input_seq[sub_start:sub_stop]
 36 |             if sub_seq in repeats_info:
 37 |                 num_repeats += 1
 38 |                 read = 1
 39 |                 match = True
 40 |                 repeat_data = repeats_info[sub_seq]
 41 |                 motif_length = repeat_data['motif_length']
 42 |                 rep_class = repeat_data['class']
 43 |                 strand = repeat_data['strand']
 44 |                 offset = length_cutoff % motif_length
 45 |                 repeat_seq = input_seq[sub_start+offset:sub_start+offset+motif_length]
 46 |                 i = 0
 47 |                 while match:
 48 |                     j = sub_stop
 49 |                     if sub_stop >= input_seq_length:
 50 |                         match = False
 51 |                         match_length = sub_stop - sub_start
 52 |                         repeats[rep_class].append('%s-%d'%(sub_seq[:motif_length], match_length))
 53 |                         sub_start = sub_stop - fallback
 54 |                     elif input_seq[j] == repeat_seq[i]:
 55 |                         sub_stop += 1
 56 |                         i += 1
 57 |                         if i >= motif_length:
 58 |                             i = 0
 59 |                     else:
 60 |                         match = False
 61 |                         match_length = sub_stop - sub_start
 62 |                         repeats[rep_class].append('%s-%d'%(sub_seq[:motif_length], match_length))
 63 |                         sub_start = sub_stop - fallback
 64 |             else:
 65 |                 sub_start += 1
 66 |     return {'read': read, 'num_repeats': num_repeats, 'repeats': repeats}
 67 | 
 68 | 
 69 | def process_fastq(args, repeats_info):
 70 |     """Processes fastq files for identification of repeats."""
 71 | 
 72 |     print('\nProcessing fastq file...')
 73 | 
 74 |     if args.input.endswith('gz'):
 75 |         handle = gzip.open(args.input, 'rt')
 76 |     else:
 77 |         handle = open(args.input, 'r')
 78 | 
 79 |     n,b = [0,0]
 80 |     readlen_freq = Counter()
 81 |     start_time = datetime.now()
 82 |     
 83 |     total_repeats = 0
 84 |     reads_with_repeats = 0
 85 |     total_repeat_bases = 0
 86 |     fastq_repeats = dict()
 87 | 
 88 |     fastq_repeat_info = dict()
 89 |     repeat_class_info = dict()
 90 |     lenFreq_data = dict()
 91 |     
 92 |     while 1:
 93 |         lines_gen = list(islice(handle, 4))
 94 |         if len(lines_gen) == 0:
 95 |             for rep in fastq_repeats:
 96 |                 cycles = build_cycVariations(rep)
 97 |                 for instance in fastq_repeats[rep]:
 98 |                     f = fastq_repeats[rep][instance]
 99 |                     m = instance.split('-')[0]
100 |                     l = int(instance.split('-')[1])
101 |                     if l not in lenFreq_data[rep]:
102 |                         lenFreq_data[rep][l] = [0]*len(cycles)
103 |                     lenFreq_data[rep][l][cycles.index(m)] += f
104 |                     repeat_class_info[rep]['lengths'].update([l]*f)
105 |                     repeat_class_info[rep]['motifs'].update([m]*f)
106 |                     repeat_class_info[rep]['instances'] += f
107 |                     repeat_class_info[rep]['bases'] += l*f
108 |                     total_repeat_bases += l*f
109 |                 fastq_repeats[rep] = Counter()
110 |             time_diff = datetime.now() - start_time
111 |             print('Processed reads: %d | Time elapsed: %s | Rate: %d iters/s\r' %(n, time_diff, n/(time_diff.seconds+1)), end = '')
112 |             sys.stdout.flush()
113 |             break            
114 |         
115 |         read_id = lines_gen[0].strip()
116 |         read_seq = lines_gen[1].strip()
117 |         read_len = len(read_seq)
118 |         n += 1 # total reads
119 |         b += len(read_seq) # total bases
120 |         readlen_freq.update([read_len]) #updating the read length frequencies
121 | 
122 |         if n%50000 == 0:
123 |             for rep in fastq_repeats:
124 |                 cycles = build_cycVariations(rep)
125 |                 for instance in fastq_repeats[rep]:
126 |                     f = fastq_repeats[rep][instance]
127 |                     m = instance.split('-')[0]
128 |                     l = int(instance.split('-')[1])
129 |                     if l not in lenFreq_data[rep]:
130 |                         lenFreq_data[rep][l] = [0]*len(cycles)
131 |                     lenFreq_data[rep][l][cycles.index(m)] += f
132 |                     repeat_class_info[rep]['lengths'].update([l]*f)
133 |                     repeat_class_info[rep]['motifs'].update([m]*f)
134 |                     repeat_class_info[rep]['instances'] += f
135 |                     repeat_class_info[rep]['bases'] += l*f
136 |                     total_repeat_bases += l*f
137 |                 fastq_repeats[rep] = Counter()
138 |             time_diff = datetime.now() - start_time
139 |             print('Processed reads: %d | Time elapsed: %s | Rate: %d iters/s\r' %(n, time_diff, n/(time_diff.seconds+1)), end = '')
140 |             sys.stdout.flush()
141 |         
142 |         record = dotDict({'id': read_id, 'seq': read_seq})
143 |         # read length should be greater than minimum repeat length
144 |         if  args.min_seq_length <= read_len <= args.max_seq_length:            
145 |             rep_identified = get_ssrs_fastq(record, repeats_info)
146 |             for rep in rep_identified['repeats']:
147 |                 try:
148 |                     fastq_repeats[rep].update(rep_identified['repeats'][rep])
149 |                     repeat_class_info[rep]['reads'] += 1
150 |                 except KeyError:
151 |                     fastq_repeats[rep] = Counter(rep_identified['repeats'][rep])
152 |                     repeat_class_info[rep] = {'lengths': Counter(), 'motifs': Counter(), 'instances': 0, 'reads': 1, 'bases': 0}
153 |                     lenFreq_data[rep] = {}
154 |             total_repeats += rep_identified['num_repeats']
155 |             reads_with_repeats += rep_identified['read']
156 |     
157 |     fastq_repeat_info['totalRepReads'] = reads_with_repeats
158 |     fastq_repeat_info['totalRepFreq'] = total_repeats
159 |     fastq_repeat_info['totalRepBases'] = total_repeat_bases
160 |     fastq_repeat_info['numRepClasses'] = len(lenFreq_data.keys())
161 |     fastq_repeat_info['lenFrequency'] = lenFreq_data
162 |     fastq_repeat_info['repClassInfo'] = repeat_class_info
163 | 
164 |     print('') #A line for proper printing of the output
165 |     min_readlen = min(readlen_freq.keys())
166 |     max_readlen = max(readlen_freq.keys())
167 |     if min_readlen == max_readlen:
168 |         readlen_range = '%d bp' %(min_readlen)
169 |     else:
170 |         readlen_range = '%d-%d (bp)' %(min_readlen, max_readlen)
171 |     return {'info': { 'seqInfo': {'Total_reads': n, 'Total_bases': b, 'Readlen_freq': readlen_freq, 
172 |             'Readlen_range': readlen_range}, 'repInfo': fastq_repeat_info}}
173 | 
174 | 
def ssr_fastq_output(fastq_out, out_file):
    """
    PERF OUTPUT for fastq files.

    Writes the '#'-prefixed summary lines, the column header and one tab
    separated row per repeat class (ordered by class length, then
    alphabetically) to out_file.

    Parameters:
        fastq_out: result of process_fastq.
        out_file: writable text file object.

    Normalised values are reported as 0 when the read or base total is 0.
    """

    seq_info = fastq_out['info']['seqInfo']
    n = seq_info['Total_reads']
    b = seq_info['Total_bases']
    readlen_freq = seq_info['Readlen_freq']

    fastq_repeat_info = fastq_out['info']['repInfo']
    reads_with_repeats = fastq_repeat_info['totalRepReads']
    total_repeats = fastq_repeat_info['totalRepFreq']
    repeat_class_info = fastq_repeat_info['repClassInfo']

    def per_million(count, total):
        # Guards the empty-input case, which used to raise ZeroDivisionError.
        if not total:
            return 0.0
        return round((count/total)*1000000, 3)

    def distribution(counter):
        # Renders a Counter as 'key-count' pairs joined by ';', sorted by key.
        return ';'.join(['-'.join([str(y) for y in x]) for x in sorted(counter.items())])

    print('#Total_reads: %d'%(n), file=out_file)
    print('#Total_bases: %d' %(b), file=out_file)
    print('#Total_repeat_instances: %d' %(total_repeats), file=out_file)
    print('#Total_reads_with_repeats: %d' %(reads_with_repeats), file=out_file)
    print('#Total_repeats_per_million_reads: %f' %(per_million(total_repeats, n)), file=out_file)
    print('#Read_length_distribution: ', readlen_freq.most_common(), file=out_file)

    print('repeatClass', 'reads', 'instances', 'bases', 'reads_per_million', 'instances_per_million',
        'bases_norm', 'length_distribution', 'motif_distribution', sep='\t', file=out_file)

    for rep in sorted(repeat_class_info, key= lambda k: (len(k), k)):
        rep_info = repeat_class_info[rep]
        print(
                rep, int(rep_info['reads']), int(rep_info['instances']), int(rep_info['bases']),
                per_million(rep_info['reads'], n),
                per_million(rep_info['instances'], n),
                per_million(rep_info['bases'], b),
                distribution(rep_info['lengths']),
                distribution(rep_info['motifs']),
                sep='\t', file=out_file
            )
211 | 
212 | 
def fastq_ssrs(args, repeats_info):
    """
    Runs the fastq workflow: identifies repeats in args.input, writes the
    summary table to args.output and, when args.analyse is set, generates
    the analysis report.

    args.output is closed before returning, also when a step fails.
    """
    try:
        fastq_out = process_fastq(args, repeats_info)
        ssr_fastq_output(fastq_out, args.output)
        if args.analyse:
            analyse_fastq(args, fastq_out)
    finally:
        args.output.close()


--------------------------------------------------------------------------------
/PERF/lib/src/anno_charts.js:
--------------------------------------------------------------------------------
  1 | const repeatAnnoDist = function(data, repeats, percent=true) {
  2 |     const annoKeyObj = {
  3 |         'Exon': ['EP', 'EN'],
  4 |         'Intron': ['IP', 'IN'],
  5 |         'Genic': ['GP', 'GN'],
  6 |         'Intergenic': ['DP', 'DN'],
  7 |     };
  8 |     const promKeyObj = {
  9 |         'Promoter': ['EP', 'IP', 'GP', 'DP'],
 10 |         'Non-Promoter': ['EN', 'IN', 'GN', 'DN']
 11 |     };
 12 | 
 13 |     let annoKeys = Object.keys(annoKeyObj);
 14 |     let promKeys = Object.keys(promKeyObj);
 15 |  
 16 |     let annodata = {};
 17 |     if (repeats === "all") { 
 18 |         let obj = {};
 19 |         let annototal = 0;
 20 |         let promtotal = 0;
 21 |         annoKeys.forEach(a => {
 22 |             let val = 0;
 23 |             Object.keys(data).forEach( r => {
 24 |                 annoKeyObj[a].forEach( a => {
 25 |                     val += data[r][a]
 26 |                 })
 27 |             })
 28 |             obj[a] = val;
 29 |             annototal += val;
 30 |         });
 31 |         
 32 |         promKeys.forEach(a => {
 33 |             let val = 0;
 34 |             Object.keys(data).forEach( (r, i) => {
 35 |                 promKeyObj[a].forEach( a => { val += data[r][a] })
 36 |             })
 37 |             obj[a] = val;
 38 |             promtotal += val;
 39 |         });
 40 |         
 41 |         if (percent){
 42 |             annoKeys.forEach(a => { obj[a] = ((obj[a]*100)/annototal).toFixed(2) })
 43 |             promKeys.forEach(a => { obj[a] = ((obj[a]*100)/promtotal).toFixed(2) })
 44 |         }
 45 |         annodata = obj
 46 |     }
 47 |     
 48 |     else {
 49 |         
 50 |         for (let rep of repeats) {
 51 |             const repindex = Object.keys(data).indexOf(rep);
 52 |             if (repindex > -1) {
 53 |               let obj = {};
 54 |               annoKeys.forEach(a => {
 55 |                   obj[a] = _.sum(annoKeyObj[a].map(d => { return data[rep][d]; }));
 56 |               });
 57 |               promKeys.forEach(p => {
 58 |                   obj[p] = _.sum(promKeyObj[p].map(d => { return data[rep][d]; }));
 59 |               });
 60 |               annodata[rep] = obj;
 61 |             }
 62 |         }
 63 |         for (let rep of repeats) {
 64 |             const repindex = Object.keys(data).indexOf(rep);
 65 |             if (repindex > -1) {
 66 |               const annototal = _.sum(annoKeys.map(a => {return annodata[rep][a] }));
 67 |               const promtotal = _.sum(promKeys.map(p => {return annodata[rep][p] }));
 68 |               if (percent) {
 69 |                   for (let a of annoKeys) { annodata[rep][a] = ((annodata[rep][a]*100)/annototal).toFixed(2); }
 70 |                   for (let a of promKeys) { annodata[rep][a] = ((annodata[rep][a]*100)/promtotal).toFixed(2); }
 71 |               }
 72 |             }
 73 |         }
 74 |     }
 75 | 
 76 | 
 77 |     return annodata;
 78 | }
 79 | 
 80 | 
 81 | const kmerAnnoDist = function(plotdata, freqdata, stacktype="annotation"){
 82 |     const annoKeys = ['Exon', 'Intron', 'Intergenic', 'Genic', 'Promoter', 'Non-Promoter'];
 83 |     const repeatsObj = {}
 84 |     const kmerFreq = {}
 85 |     const kmers = [];
 86 |     repeats.forEach(d => {
 87 |       repeatsObj[d.kmer] = _.map(d.repeats, 'class'); 
 88 |       kmerFreq[d.kmer]
 89 |       kmers.push(d.kmer);
 90 |     })
 91 |     const annoKmerData = {};
 92 |     for (const anno of annoKeys) {
 93 |       annoKmerData[anno] = {'Monomer': 0, 'Dimer': 0, 'Trimer': 0, 'Tetramer': 0, 'Pentamer': 0, 'Hexamer': 0};
 94 |     }
 95 |     
 96 |     for (let kmer in repeatsObj) {
 97 |       const classes = repeatsObj[kmer];
 98 |       const kmerData = repeatAnnoDist(plotdata, classes, false)
 99 |       const repFreqData = repeatFrequency(freqdata, classes, 'kmer', 'freq');
100 |       kmerFreq[kmer] = _.sumBy(repFreqData, 'value');
101 |       for (let rep in kmerData) { 
102 |         for (let anno in kmerData[rep]){
103 |           annoKmerData[anno][kmer] += kmerData[rep][anno]
104 |         } 
105 |       }
106 |     }
107 | 
108 |     const outdata = {data: {} };
109 |     if (stacktype === 'kmer') {
110 |       outdata.keys = kmers;
111 |       for (const anno in annoKmerData) {
112 |         outdata.data[anno] = [];
113 |         for (const kmer of kmers) {
114 |           // const total = kmerFreq[kmer];
115 |           const annototal = annoKmerData['Exon'][kmer] + annoKmerData['Intron'][kmer] + annoKmerData['Intergenic'][kmer] + annoKmerData['Genic'][kmer]
116 |           if (annototal > 0) {
117 |             outdata.data[anno].push(((annoKmerData[anno][kmer]*100)/annototal).toFixed(2))
118 |           }
119 |         }
120 |       }
121 |     }
122 |     else {
123 |       for (const kmer of kmers) {
124 |           outdata.data[kmer] = []
125 |           outdata.keys = []
126 |           for (const anno in annoKmerData) {
127 |             const total = _.sum(Object.values(annoKmerData[anno]));
128 |             if (total > 0) {
129 |               outdata.keys.push(anno);
130 |               outdata.data[kmer].push(((annoKmerData[anno][kmer]*100)/total).toFixed(2));
131 |             }
132 |           }
133 |       }
134 |     }
135 |     return outdata;
136 | }
137 | 
/**
 * Re-bins per repeat class count arrays into bins of width `bin` over the
 * range -5000..5000 and returns [x, value] pairs per repeat class.
 *
 * The source arrays are consumed in steps of bin/50 entries. With datatype
 * 'density' each bin is divided by the class total and formatted with
 * three decimals; otherwise the summed counts are returned. Every series
 * is padded with a point at -5000 and at 5000 repeating the first and last
 * bin value.
 *
 * `bins` is accepted but not used.
 *
 * @returns {{data: Object}} series keyed by repeat class.
 */
const tssHistData = function(data, bins, repeats, bin, datatype) {
    const sourceBinCount = _.range(-4975, 4976, 50).length;

    const stepSize = parseInt(bin/50);
    const steps = parseInt(sourceBinCount/stepSize);

    const halfBin = parseInt(bin/2);
    const binCenters = _.range(-5000 + halfBin, 5001 - halfBin, bin);

    // First pass: summed counts per output bin (zeros for unknown classes).
    const series = {};
    for (const rep of repeats) {
        const counts = Array(binCenters.length).fill(0);
        series[rep] = counts;
        if (Object.keys(data).indexOf(rep) > -1) {
            for (let i = 0; i < steps; i++) {
                counts[i] = _.sum(data[rep].slice(i*stepSize, (i + 1)*stepSize));
            }
        }
    }

    // Second pass: convert to [center, value] pairs and pad both ends.
    for (const rep of repeats) {
        let points;
        if (datatype === 'density') {
            const total = _.sum(series[rep]);
            if (total != 0) {
                points = _.map(series[rep], (count, i) => [binCenters[i], (count/total).toFixed(3)]);
            }
            else {
                points = _.map(series[rep], (count, i) => [binCenters[i], 0]);
            }
        }
        else {
            points = _.map(series[rep], (count, i) => [binCenters[i], count]);
        }
        series[rep] = points;
        points.push([5000, points[points.length - 1][1]]);
        points.unshift([-5000, points[0][1]]);
    }

    return { data: series };
}
190 | 
191 | if (data.info.annoInfo) { 
192 |     
193 |     const annostackbar_activeSelected = ['A', 'C']; //allRepClasses;
194 |     let stack_group = false;
195 |     
196 |     $("#anno-stackbar-repeat-select").multiSelect({
197 |         selectableOptgroup: true,
198 |         afterSelect: function(d){ d.forEach(function(e){ if (annostackbar_activeSelected.indexOf(e) == -1) { annostackbar_activeSelected.push(e) } })},
199 |         afterDeselect: function(d){ d.forEach(element => { annostackbar_activeSelected.splice(annostackbar_activeSelected.indexOf(element), 1); }); } 
200 |     });
201 |     
202 |     const annostackbar_data = repeatAnnoDist(data.info.annoInfo.repAnno, allRepClasses, false);
203 |     var annostackbar_options = {
204 |         series: [],
205 |         chart: { type: 'bar', stacked: true, stackType: '100%' },
206 |         plotOptions: {},
207 |         stroke: { width: 1, colors: ['#fff'] },
208 |         title: { text: 'Repeat Genomic distribution' },
209 |         xaxis: {  },
210 |         tooltip: { y: { formatter: function (val) { return val } } },
211 |         fill: { opacity: 1 },
212 |         legend: { position: 'top', horizontalAlign: 'left', offsetX: 40 }
213 |     };
214 |     var annostackbar_chart = new ApexCharts(document.querySelector("#anno-stackbar-plot-area"), annostackbar_options);
215 |     annostackbar_chart.render();
216 |     
217 |     const annoLabels = ['Exon', 'Intron', 'Genic', 'Intergenic']
218 |     const plot_annostackbar = function(){
219 |         const annostackbar_series = []
220 |         if (!(stack_group)) {
221 |             annoLabels.forEach(function(a){ 
222 |                 const d = _.map(annostackbar_activeSelected, o => {
223 |                     return parseFloat(annostackbar_data[o][a])
224 |                 })
225 |                 annostackbar_series.push({name: a, data: d});
226 |             })
227 |             annostackbar_chart.updateOptions({series: annostackbar_series, xaxis: {categories: annostackbar_activeSelected}});
228 |         }
229 | 
230 |         else {
231 |             annoLabels.forEach(function(a){ 
232 |                 const d = _.map(annostackbar_activeSelected, o => {
233 |                     return parseFloat(annostackbar_data[o][a])
234 |                 })
235 |                 annostackbar_series.push({name: a, data: [_.sum(d)]});
236 |             })
237 |             annostackbar_chart.updateOptions({series: annostackbar_series, xaxis: {categories: ['All selected repeats']}});
238 |         }
239 |     }
240 | 
241 |     $('.ui.checkbox.anno-stackbar').checkbox({ onChange: function(){
242 |         stack_group = !(stack_group);
243 |         plot_annostackbar();
244 |     }});
245 | 
246 |     $("#anno-stackbar-plot-button").click(function(){ plot_annostackbar(); });
247 |     plot_annostackbar();
248 | 
249 | 
250 |     const annoarea_activeSelected = ['A', 'C']; //allRepClasses;
251 |     let binSize = 500;
252 |     $("#anno-area-repeat-select").multiSelect({
253 |         selectableOptgroup: true,
254 |         afterSelect: function(d){ d.forEach(function(e){ if (annoarea_activeSelected.indexOf(e) == -1) { annoarea_activeSelected.push(e) } })},
255 |         afterDeselect: function(d){ d.forEach(element => { annoarea_activeSelected.splice(annoarea_activeSelected.indexOf(element), 1); }); } 
256 |     });
257 | 
258 |     $('.ui .dropdown.bin-size').dropdown({
259 |         values: [
260 |             {name: 100, value: 100},
261 |             {name: 200, value: 200},
262 |             {name: 500, value: 500, selected:true},
263 |             {name: 1000, value: 1000}
264 |         ],
265 |         onChange: function(value) { binSize = value; }
266 |     });
267 |     // Area chart on a fixed numeric x axis from -5000 to 5000; series are supplied by plot_annoarea().
268 |     var annoarea_options = {
269 |         series: [],
270 |         chart: { type: 'area'},
271 |         plotOptions: {},
272 |         stroke: { width: 1 },
273 |         title: { text: 'Average repeat distribution around TSS' },
274 |         xaxis: { type: 'numeric', min: -5000, max: 5000, tickAmount: 10000/binSize, axisTicks: { height: 8 }}, // tickAmount is evaluated once, with the initial binSize
275 |         tooltip: { 
276 |             y: { formatter: function (val) { return val } },
277 |             x: { formatter: function (val) { return `${val-parseInt(binSize/2)}bp - ${val+parseInt(binSize/2)}bp` } } // x is shown as a window of half a bin on either side; reads binSize at hover time
278 |         },
279 |         fill: { opacity: 1 },
280 |         legend: { position: 'top', horizontalAlign: 'left', offsetX: 40 }
281 |     };
282 |     var annoarea_chart = new ApexCharts(document.querySelector("#anno-area-plot-area"), annoarea_options);
283 |     annoarea_chart.render();
284 | 
285 |     const plot_annoarea = function() {
286 |         const annoarea_data = tssHistData(data.info.annoInfo.TSS_dist, data.info.annoInfo.TSS_histBinEdges, annoarea_activeSelected, binSize, 'density')['data'];
287 |         const series = []
288 |         for (let rep of Object.keys(annoarea_data)) { series.push({ name: rep, data: _.map(annoarea_data[rep], o => { return [o[0], parseFloat(o[1])];})}) }
289 |                 annoarea_chart.updateOptions({ series: series, xaxis: { type: 'numeric', min: -5000, max: 5000, tickAmount: 20, axisTicks: { height: 8 }} });
290 |     }
291 | 
292 |     $("#anno-area-plot-button").click(function(){ plot_annoarea(); });
293 |     plot_annoarea();
294 | 
295 | }
296 | 
297 | else { document.getElementById('anno-charts-main').style.display = 'none'; }


--------------------------------------------------------------------------------
/PERF/lib/src/jquery.multi-select.min.js:
--------------------------------------------------------------------------------
1 | !function(e){"use strict";var t=function(t,s){this.options=s,this.$element=e(t),this.$container=e("<div/>",{class:"ms-container"}),this.$selectableContainer=e("<div/>",{class:"ms-selectable"}),this.$selectionContainer=e("<div/>",{class:"ms-selection"}),this.$selectableUl=e("<ul/>",{class:"ms-list",tabindex:"-1"}),this.$selectionUl=e("<ul/>",{class:"ms-list",tabindex:"-1"}),this.scrollTo=0,this.elemsSelector="li:visible:not(.ms-optgroup-label,.ms-optgroup-container,."+s.disabledClass+")"};t.prototype={constructor:t,init:function(){var t=this,s=this.$element;if(0===s.next(".ms-container").length){s.css({position:"absolute",left:"-9999px"}),s.attr("id",s.attr("id")?s.attr("id"):Math.ceil(1e3*Math.random())+"multiselect"),this.$container.attr("id","ms-"+s.attr("id")),this.$container.addClass(t.options.cssClass),s.find("option").each(function(){t.generateLisFromOption(this)}),this.$selectionUl.find(".ms-optgroup-label").hide(),t.options.selectableHeader&&t.$selectableContainer.append(t.options.selectableHeader),t.$selectableContainer.append(t.$selectableUl),t.options.selectableFooter&&t.$selectableContainer.append(t.options.selectableFooter),t.options.selectionHeader&&t.$selectionContainer.append(t.options.selectionHeader),t.$selectionContainer.append(t.$selectionUl),t.options.selectionFooter&&t.$selectionContainer.append(t.options.selectionFooter),t.$container.append(t.$selectableContainer),t.$container.append(t.$selectionContainer),s.after(t.$container),t.activeMouse(t.$selectableUl),t.activeKeyboard(t.$selectableUl);var l=t.options.dblClick?"dblclick":"click";t.$selectableUl.on(l,".ms-elem-selectable",function(){t.select(e(this).data("ms-value"))}),t.$selectionUl.on(l,".ms-elem-selection",function(){t.deselect(e(this).data("ms-value"))}),t.activeMouse(t.$selectionUl),t.activeKeyboard(t.$selectionUl),s.on("focus",function(){t.$selectableUl.focus()})}var i=s.find("option:selected").map(function(){return e(this).val()}).get();t.select(i,"init"),"function"==typeof 
t.options.afterInit&&t.options.afterInit.call(this,this.$container)},generateLisFromOption:function(t,s,l){for(var i=this,n=i.$element,o="",a=e(t),r=0;r<t.attributes.length;r++){var c=t.attributes[r];"value"!==c.name&&"disabled"!==c.name&&(o+=c.name+'="'+c.value+'" ')}var d=e("<li "+o+"><span>"+i.escapeHTML(a.text())+"</span></li>"),h=d.clone(),p=a.val(),f=i.sanitize(p);d.data("ms-value",p).addClass("ms-elem-selectable").attr("id",f+"-selectable"),h.data("ms-value",p).addClass("ms-elem-selection").attr("id",f+"-selection").hide(),(a.prop("disabled")||n.prop("disabled"))&&(h.addClass(i.options.disabledClass),d.addClass(i.options.disabledClass));var u=a.parent("optgroup");if(u.length>0){var m=u.attr("label"),v=i.sanitize(m),b=i.$selectableUl.find("#optgroup-selectable-"+v),g=i.$selectionUl.find("#optgroup-selection-"+v);if(0===b.length){var $='<ul class="ms-optgroup"><li class="ms-optgroup-label"><span>'+m+"</span></li></ul>";b=e('<li class="ms-optgroup-container"></li>'),g=e('<li class="ms-optgroup-container"></li>'),b.attr("id","optgroup-selectable-"+v),g.attr("id","optgroup-selection-"+v),b.append(e($)),g.append(e($)),i.options.selectableOptgroup&&(b.find(".ms-optgroup-label").on("click",function(){var t=u.children(":not(:selected, :disabled)").map(function(){return e(this).val()}).get();i.select(t)}),g.find(".ms-optgroup-label").on("click",function(){var t=u.children(":selected:not(:disabled)").map(function(){return e(this).val()}).get();i.deselect(t)})),i.$selectableUl.append(b),i.$selectionUl.append(g)}s=void 0===s?b.find("ul").children().length:s+1,d.insertAt(s,b.children()),h.insertAt(s,g.children())}else s=void 0===s?i.$selectableUl.children().length:s,d.insertAt(s,i.$selectableUl),h.insertAt(s,i.$selectionUl)},addOption:function(t){var s=this;void 0!==t.value&&null!==t.value&&(t=[t]),e.each(t,function(t,l){if(void 0!==l.value&&null!==l.value&&0===s.$element.find("option[value='"+l.value+"']").length){var i=e('<option 
value="'+l.value+'">'+l.text+"</option>"),n=void 0===l.nested?s.$element:e("optgroup[label='"+l.nested+"']");t=parseInt(void 0===l.index?n.children().length:l.index);l.optionClass&&i.addClass(l.optionClass),l.disabled&&i.prop("disabled",!0),i.insertAt(t,n),s.generateLisFromOption(i.get(0),t,l.nested)}})},escapeHTML:function(t){return e("<div>").text(t).html()},activeKeyboard:function(t){var s=this;t.on("focus",function(){e(this).addClass("ms-focus")}).on("blur",function(){e(this).removeClass("ms-focus")}).on("keydown",function(l){switch(l.which){case 40:case 38:return l.preventDefault(),l.stopPropagation(),void s.moveHighlight(e(this),38===l.which?-1:1);case 37:case 39:return l.preventDefault(),l.stopPropagation(),void s.switchList(t);case 9:if(s.$element.is("[tabindex]")){l.preventDefault();var i=parseInt(s.$element.attr("tabindex"),10);return i=l.shiftKey?i-1:i+1,void e('[tabindex="'+i+'"]').focus()}l.shiftKey&&s.$element.trigger("focus")}if(e.inArray(l.which,s.options.keySelect)>-1)return l.preventDefault(),l.stopPropagation(),void s.selectHighlighted(t)})},moveHighlight:function(e,t){var s=e.find(this.elemsSelector),l=s.filter(".ms-hover"),i=null,n=s.first().outerHeight(),o=e.height();this.$container.prop("id");if(s.removeClass("ms-hover"),1===t){if(0===(i=l.nextAll(this.elemsSelector).first()).length)if((r=l.parent()).hasClass("ms-optgroup")){var a=r.parent().next(":visible");i=a.length>0?a.find(this.elemsSelector).first():s.first()}else i=s.first()}else if(-1===t){var r;if(0===(i=l.prevAll(this.elemsSelector).first()).length)if((r=l.parent()).hasClass("ms-optgroup")){var c=r.parent().prev(":visible");i=c.length>0?c.find(this.elemsSelector).last():s.last()}else i=s.last()}if(i.length>0){i.addClass("ms-hover");var d=e.scrollTop()+i.position().top-o/2+n/2;e.scrollTop(d)}},selectHighlighted:function(e){var 
t=e.find(this.elemsSelector),s=t.filter(".ms-hover").first();s.length>0&&(e.parent().hasClass("ms-selectable")?this.select(s.data("ms-value")):this.deselect(s.data("ms-value")),t.removeClass("ms-hover"))},switchList:function(e){e.blur(),this.$container.find(this.elemsSelector).removeClass("ms-hover"),e.parent().hasClass("ms-selectable")?this.$selectionUl.focus():this.$selectableUl.focus()},activeMouse:function(t){var s=this;this.$container.on("mouseenter",s.elemsSelector,function(){e(this).parents(".ms-container").find(s.elemsSelector).removeClass("ms-hover"),e(this).addClass("ms-hover")}),this.$container.on("mouseleave",s.elemsSelector,function(){e(this).parents(".ms-container").find(s.elemsSelector).removeClass("ms-hover")})},refresh:function(){this.destroy(),this.$element.multiSelect(this.options)},destroy:function(){e("#ms-"+this.$element.attr("id")).remove(),this.$element.off("focus"),this.$element.css("position","").css("left",""),this.$element.removeData("multiselect")},select:function(t,s){"string"==typeof t&&(t=[t]);var l=this,i=this.$element,n=e.map(t,function(e){return l.sanitize(e)}),o=this.$selectableUl.find("#"+n.join("-selectable, #")+"-selectable").filter(":not(."+l.options.disabledClass+")"),a=this.$selectionUl.find("#"+n.join("-selection, #")+"-selection").filter(":not(."+l.options.disabledClass+")"),r=i.find("option:not(:disabled)").filter(function(){return e.inArray(this.value,t)>-1});if("init"===s&&(o=this.$selectableUl.find("#"+n.join("-selectable, #")+"-selectable"),a=this.$selectionUl.find("#"+n.join("-selection, #")+"-selection")),o.length>0){o.addClass("ms-selected").hide(),a.addClass("ms-selected").show(),r.prop("selected",!0),l.$container.find(l.elemsSelector).removeClass("ms-hover");var c=l.$selectableUl.children(".ms-optgroup-container");if(c.length>0)c.each(function(){var 
t=e(this).find(".ms-elem-selectable");t.length===t.filter(".ms-selected").length&&e(this).find(".ms-optgroup-label").hide()}),l.$selectionUl.children(".ms-optgroup-container").each(function(){e(this).find(".ms-elem-selection").filter(".ms-selected").length>0&&e(this).find(".ms-optgroup-label").show()});else if(l.options.keepOrder&&"init"!==s){var d=l.$selectionUl.find(".ms-selected");d.length>1&&d.last().get(0)!=a.get(0)&&a.insertAfter(d.last())}"init"!==s&&(i.trigger("change"),"function"==typeof l.options.afterSelect&&l.options.afterSelect.call(this,t))}},deselect:function(t){"string"==typeof t&&(t=[t]);var s=this,l=this.$element,i=e.map(t,function(e){return s.sanitize(e)}),n=this.$selectableUl.find("#"+i.join("-selectable, #")+"-selectable"),o=this.$selectionUl.find("#"+i.join("-selection, #")+"-selection").filter(".ms-selected").filter(":not(."+s.options.disabledClass+")"),a=l.find("option").filter(function(){return e.inArray(this.value,t)>-1});if(o.length>0){n.removeClass("ms-selected").show(),o.removeClass("ms-selected").hide(),a.prop("selected",!1),s.$container.find(s.elemsSelector).removeClass("ms-hover");var r=s.$selectableUl.children(".ms-optgroup-container");if(r.length>0)r.each(function(){e(this).find(".ms-elem-selectable").filter(":not(.ms-selected)").length>0&&e(this).find(".ms-optgroup-label").show()}),s.$selectionUl.children(".ms-optgroup-container").each(function(){0===e(this).find(".ms-elem-selection").filter(".ms-selected").length&&e(this).find(".ms-optgroup-label").hide()});l.trigger("change"),"function"==typeof s.options.afterDeselect&&s.options.afterDeselect.call(this,t)}},select_all:function(){var 
t=this.$element,s=t.val();if(t.find('option:not(":disabled")').prop("selected",!0),this.$selectableUl.find(".ms-elem-selectable").filter(":not(."+this.options.disabledClass+")").addClass("ms-selected").hide(),this.$selectionUl.find(".ms-optgroup-label").show(),this.$selectableUl.find(".ms-optgroup-label").hide(),this.$selectionUl.find(".ms-elem-selection").filter(":not(."+this.options.disabledClass+")").addClass("ms-selected").show(),this.$selectionUl.focus(),t.trigger("change"),"function"==typeof this.options.afterSelect){var l=e.grep(t.val(),function(t){return e.inArray(t,s)<0});this.options.afterSelect.call(this,l)}},deselect_all:function(){var e=this.$element,t=e.val();e.find("option").prop("selected",!1),this.$selectableUl.find(".ms-elem-selectable").removeClass("ms-selected").show(),this.$selectionUl.find(".ms-optgroup-label").hide(),this.$selectableUl.find(".ms-optgroup-label").show(),this.$selectionUl.find(".ms-elem-selection").removeClass("ms-selected").hide(),this.$selectableUl.focus(),e.trigger("change"),"function"==typeof this.options.afterDeselect&&this.options.afterDeselect.call(this,t)},sanitize:function(e){var t,s=0;if(0==e.length)return s;var l;for(t=0,l=e.length;t<l;t++)s=(s<<5)-s+e.charCodeAt(t),s|=0;return s}},e.fn.multiSelect=function(){var s=arguments[0],l=arguments;return this.each(function(){var i=e(this),n=i.data("multiselect"),o=e.extend({},e.fn.multiSelect.defaults,i.data(),"object"==typeof s&&s);n||i.data("multiselect",n=new t(this,o)),"string"==typeof s?n[s](l[1]):n.init()})},e.fn.multiSelect.defaults={keySelect:[32],selectableOptgroup:!1,disabledClass:"disabled",dblClick:!1,keepOrder:!1,cssClass:""},e.fn.multiSelect.Constructor=t,e.fn.insertAt=function(e,t){return this.each(function(){0===e?t.prepend(this):t.children().eq(e-1).after(this)})}}(window.jQuery);


--------------------------------------------------------------------------------
/PERF/lib/src/main_fasta.js:
--------------------------------------------------------------------------------
  1 | /*
  2 |     The "main.js" contains core JS component supporting the PERF-Analysis module.
  3 | 
  4 |     This web application is developed with the Semantic UI framework.
  5 |     The charts are built using the ApexCharts JS charting library.
  6 | 
  7 |     All the data for the report is derived from analyse_data.js
  8 |     data = {info: {seqInfo: {}, repInfo: {}, plotInfo: {}}}
  9 | 
 10 |     plotInfo is a dictionary with key as the repeat class and value as a dictionary
 11 |     plotInfo: { REPEAT_CLASS: { LENGTH: FREQUENCY } }
 12 | 
 13 | */
 14 | 
 15 | // Updating report data
 16 | for (const key in data.info.seqInfo){$(`.value.${key}`).html(data.info.seqInfo[key])};
 17 | for (const key in data.info.repInfo){if (key != 'lenFrequency') { $(`.value.${key}`).html(data.info.repInfo[key]); }};
 18 | 
 19 | const menuLayout = function(){
 20 |     const w = window.innerWidth;
 21 |     if (w < 800) { 
 22 |         const navmenu = document.getElementById('navmenu');
 23 |         navmenu.classList.remove('vertical');
 24 |         navmenu.parentElement.style.width = '100%';
 25 |         document.getElementById('content-display').style.width = '100%';
 26 |     }
 27 |     
 28 |     else {
 29 |         const navmenu = document.getElementById('navmenu')
 30 |         navmenu.classList.add('vertical');
 31 |         navmenu.parentElement.style.width = '5%';
 32 |         document.getElementById('content-display').style.width = '95%';
 33 |     }
 34 | }
 35 | window.onresize = function(){ menuLayout(); }
 36 | // Names for repeat motif lengths; plotPie() looks a name up as numPrefixObj[length - 1].
 37 | const numPrefixObj = ["Monomer","Dimer","Trimer","Tetramer","Pentamer","Hexamer","Heptamer","Octamer","Nonamer","Decamer","Undecamer","Dodecamer","Tridecamer","Tetradecamer","Pentadecamer","Hexadecamer","Heptadecamer","Octadecamer","Nonadecamer","Icosamer","Uncosamer","Docosamer","Tricosamer","Tetracosamer","Pentacosamer","Hexacosamer","Heptacosamer","Octacosamer","Nonacosamer","Triacontamer","Untriacontamer","Dotriacontamer","Tritriacontamer","Tetratriacontamer","Pentatriacontamer","Hexatriacontamer","Heptatriacontamer","Octatriacontamer","Nonatriacontamer","Tetracontamer","Untetracontamer","Dotetracontamer","Tritetracontamer","Tetratetracontamer","Pentatetracontamer","Hexatetracontamer","Heptatetracontamer","Octatetracontamer","Nonatetracontamer","Pentacontamer"]
 38 | const plotData = data.info.repInfo.lenFrequency; // indexed below as plotData[repeat class][length]
 39 | const allRepClasses = data.info.repInfo.allRepClasses; // also the reference order used when repeats are sorted
 40 | // Activate the Semantic UI widgets: dropdowns, chart tabs and the length/units selector.
 41 | $('.ui.dropdown').dropdown();
 42 | $('.chart .item').tab();
 43 | $('.anno-chart .item').tab();
 44 | $('.ui .units').dropdown({values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}]});
 45 | 
 46 | /* 
 47 |     Bar graph
 48 |     - For bar graph we curate the data in barData with Repeat class as the key and [bases, frequency] as the value.
 49 | 
 50 |     - bar_activeSelected is the list of Repeat classes which are considered to be plotted.
 51 |         - can be selected by sort selection which is bar_sortSelected.
 52 |         - or by repeat selection dropdown stored in bar_repSelected.
 53 |         - sorted_barKeys stores the all repClasses sorted based on the datatype selected.
 54 |     
 55 |     The dataflow for the barChart is as follows.
 56 |     - There are two plot buttons dedicated individually to either plot with data based sorted keys or desired set of keys.
 57 |     - Based on which plot button is pressed that repeat set gets placed in the bar_activeSelected variable.
 58 |     - And then subsequently the data for the bar_activeSelected keys is plotted.
 59 |     - The data type and the sort customisation of the plots only deal with the bar_activeSelected keys.
 60 | 
 61 | */
 62 | 
 63 | const barData = {};
 64 | allRepClasses.forEach(function(e){
 65 |     if ((Object.keys(plotData).indexOf(e) != -1) && (plotData[e] != 0)) {
 66 |         const lengths = _.map(Object.keys(plotData[e]), d => { return parseInt(d); });
 67 |         let frequency = 0; let bases = 0;
 68 |         for (let l in lengths) { 
 69 |             l = lengths[l]; frequency += _.sum(plotData[e][l]); bases += _.sum(plotData[e][l]) * l;
 70 |         }
 71 |         barData[e] = [bases, frequency];
 72 |     } else { barData[e] = [0, 0]; }
 73 | })
 74 | let bar_dataType = 1;
 75 | let bar_sortSelected = [];
 76 | let bar_sortOrder = 1;
 77 | let bar_numReps = 10;
 78 | let bar_repSelected = [];
 79 | let sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; });
 80 | let bar_activeSelected = [];
 81 | 
 82 | const bar_options = {
 83 |     chart: { type: 'bar' },
 84 |     plotOptions: { bar: { horizontal: false, columnwidth: '55%' } },
 85 |     series: [{ data: [] }],
 86 |     dataLabels: { enabled: false },
 87 |     yaxis: { 'title': { 'text': 'Frequency', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
 88 |     xaxis: { categories: [], 'title': { 'text': 'Repeat Class', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
 89 |     title: { text: 'Repeat Frequency', align: 'left' }
 90 | }
 91 | const bar_chart = new ApexCharts(document.querySelector('#bar-plot-area'), bar_options);
 92 | bar_chart.render();
 93 | const plotBar = function(keys){ 
 94 |     const values = [];
 95 |     keys.forEach(function(e){ values.push(barData[e][bar_dataType]); });
 96 |     const name = (bar_dataType == 1) ? 'Frequency' : 'Bases';
 97 |     bar_chart.updateOptions({series: [{'name': name, data: values}], yaxis: { title: { 'text': name, 'style': { 'fontSize': '16px', 'font-weight': 'bold' } } }, xaxis: {categories: keys}, animate: true})
 98 | }
 99 | 
100 | $('#bar-numRep').change(function(){ bar_numReps = this.value; });
101 | $('.ui .dropdown.sort-order').dropdown({
102 |     values: [{name: 'top', value: 1, selected:true}, {name: 'bottom', value: 0}],
103 |     onChange: function(value) { bar_sortOrder = value; }
104 | });
105 | $('#bar-sortPlot-button').click(function(){ 
106 |     if (bar_sortOrder == 1) { bar_sortSelected = sorted_barKeys.slice(sorted_barKeys.length - bar_numReps); bar_sortSelected.reverse();}
107 |     else { bar_sortSelected = sorted_barKeys.slice(0, bar_numReps); }
108 |     bar_activeSelected = bar_sortSelected; plotBar(bar_activeSelected); 
109 | })
110 | $('#bar-sortPlot-button').trigger("click")
111 | 
112 | $("#bar-repeat-select").multiSelect({
113 |     selectableOptgroup: true,
114 |     afterSelect: function(d){ d.forEach(function(e){ if (bar_repSelected.indexOf(e) == -1) { bar_repSelected.push(e) } })},
115 |     afterDeselect: function(d){ d.forEach(element => { bar_repSelected.splice(bar_repSelected.indexOf(element), 1); }); } 
116 | });
117 | $('#bar-repPlot-button').click(function(){ 
118 |     bar_repSelected = _.sortBy(bar_repSelected, o => {return allRepClasses.indexOf(o)}); 
119 |     bar_activeSelected = bar_repSelected; plotBar(bar_activeSelected); 
120 | })
121 | 
122 | // Once the data type is selected the global variable bar_dataType changes
123 | // also the sorted_barKeys is updated based on the datatype.
124 | $('.ui.checkbox.bar').checkbox({ onChange: function(val){
125 |     bar_dataType = this.value;
126 |     sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; }); 
127 |     plotBar(bar_activeSelected)
128 | }});
129 | 
130 | $('#asort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); plotBar(bar_activeSelected); })
131 | $('#dsort-alpha').click(function(){ bar_activeSelected = bar_activeSelected.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) }); bar_activeSelected.reverse(); plotBar(bar_activeSelected); })
132 | $('#asort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); plotBar(bar_activeSelected); })
133 | $('#dsort-num').click(function(){ bar_activeSelected = _.sortBy( bar_activeSelected, k => { return barData[k][bar_dataType];}); bar_activeSelected.reverse(); plotBar(bar_activeSelected); })
134 | 
135 | 
136 | /* 
137 |     Pie graph
138 |     - For pie chart as we deal with frequency and bases data of repeat classes we continue using barData for data retrieval.
139 | 
140 |     - pie_activeSelected is the list of Repeat classes which are considered to be plotted.
141 |         - As there is only one way to select repeats (the repeat selection dropdown) there is no separate sorted selection.
142 |     
143 |     The dataflow for the pieChart is as follows.
144 |     - The repeat selection dropdown keeps pie_activeSelected up to date.
145 |     - The plot button sorts pie_activeSelected by repeat class order and plots it.
146 |     - Changing the data type (pie_dataType) or the k-mer grouping toggle (pie_group) replots the current selection.
147 |     - With grouping on, the values of repeats with the same motif length are summed into one slice.
148 | 
149 | */
150 | 
151 | let pie_activeSelected = allRepClasses;
152 | let pie_dataType = 1;
153 | let pie_group = true;
154 | $("#pie-kmer-toggle").checkbox({
155 |     onChecked: function(){ pie_group = true; plotPie(pie_activeSelected) },
156 |     onUnchecked: function(){ pie_group= false; plotPie(pie_activeSelected) }
157 | });
158 | $("#pie-repeat-select").multiSelect({
159 |     selectableOptgroup: true,
160 |     afterSelect: function(d){ d.forEach(function(e){ if (pie_activeSelected.indexOf(e) == -1) { pie_activeSelected.push(e) } })},
161 |     afterDeselect: function(d){ d.forEach(element => { pie_activeSelected.splice(pie_activeSelected.indexOf(element), 1); }); } 
162 | });
163 | $(".ui.checkbox.pie.radio.pie-data-type").checkbox({ onChange: function(){ pie_dataType = this.value; plotPie(pie_activeSelected) }});
164 | const pie_options = {
165 |     chart: { type: 'pie' },
166 |     labels: ['Monomers', 'Dimers', 'Trimers', 'Tetramers', 'Pentamers', 'Hexamers'],
167 |     series: [10, 10, 10, 10, 10, 10],
168 |     responsive: [{ breakpoint: 480 }],
169 |     colors: ["#3366cc", "#dc3912", "#ff9900", "#109618", "#990099", "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395", "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300", "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"],
170 |     // theme: { monochrome: { enabled: true }}
171 | }
172 | let pie_chart = new ApexCharts( document.querySelector("#pie-plot-area"), pie_options );
173 | pie_chart.render();
174 | 
175 | const plotPie = function(keys){ 
176 |     let values = [];
177 |     keys = keys.sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) })
178 |     keys.forEach(function(e){ values.push(barData[e][pie_dataType]); });
179 |     if (pie_group == true) { 
180 |         values = [];
181 |         let group_keys = [];
182 |         const kmer_lengths = _.uniq(_.map(keys, e => { return e.length; })).sort();
183 |         kmer_lengths.forEach( e => { group_keys.push(numPrefixObj[e - 1]); values.push(0); });
184 |         keys.forEach(e => { values[kmer_lengths.indexOf(e.length)] += barData[e][pie_dataType]; })
185 |         keys = group_keys;
186 |     }
187 |     pie_chart.updateOptions({labels:keys, series: values, animate: true})
188 | }
189 | 
190 | $("#pie-plot-button").click(function(){
191 |     pie_activeSelected = _.sortBy(pie_activeSelected, o => {return allRepClasses.indexOf(o)});
192 |     plotPie(pie_activeSelected);
193 | });
194 | 
195 | //  Initialising Pie Chart
196 | plotPie(pie_activeSelected);
197 | 
198 | /* 
199 |     Line graph
200 |     - We retrieve the data for line plots from plotData.
201 | 
202 |     - Line plot has options to
203 |         - Select repeats which will be saved in line_activeSelected variable
204 |         - Length range in which it has to plot the data
205 |     
206 |     - Required data
207 |         - The minimum length/units criteria is retrieved from the repInfo.
208 | 
209 |     The dataflow for the lineChart is as follows.
210 |     - The data flow is pretty simple: from the options selected the relevant data is retrieved and plotted
211 | 
212 | */
213 | 
214 | let minLength = data.info.repInfo.minLength;
215 | let minUnits = data.info.repInfo.minUnits;
216 | let minRange = 12;
217 | let maxRange = 50;
218 | let line_dataType = 1;
219 | let line_activeSelected = ['A', 'C'];
220 | 
221 | 
222 | $('.ui .dropdown.units').dropdown({
223 |     values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}],
224 |     onChange: function(value) { line_dataType = value;}
225 | });
226 | $("#line-repeat-select").multiSelect({
227 |     selectableOptgroup: true,
228 |     afterSelect: function(d){ d.forEach(function(e){ if (line_activeSelected.indexOf(e) == -1) { line_activeSelected.push(e) } }) },
229 |     afterDeselect: function(d){ d.forEach(element => { line_activeSelected.splice(line_activeSelected.indexOf(element), 1); }); } 
230 | });
231 | $('#line-min-len').change(function(){ minRange = parseInt(this.value); plotLine(line_activeSelected); });
232 | $('#line-max-len').change(function(){ maxRange = parseInt(this.value); plotLine(line_activeSelected); });
233 | const line_options = {
234 |     chart: { type: 'line', zoom: { enabled: false }},
235 |     dataLabels: { enabled: false },
236 |     stroke: { curve: 'straight', width: 2 },
237 |     series: [],
238 |     title: { text: 'Repeat sequence length(bp) vs Abundance', align: 'left' },
239 |     grid: { row: { colors: ['#f3f3f3', 'transparent'], opacity: 0.5 } },
240 |     tooltip: { x: {
241 |         formatter: function(val) { return `Length: ${val}bp` }
242 |     }},
243 |     markers: {size: 0},
244 |     yaxis: { title: { text: 'Frequency', 
245 |              style: { 'fontSize': '16px', 'font-weight': 'bold' } }},
246 |     xaxis: { title: { text: 'Length (bp)', 
247 |              style: { 'fontSize': '16px', 'font-weight': 'bold' } },
248 |             //  labels: { format: '%d' },
249 |              tickAmount: parseInt((maxRange - minRange)/2) },
250 |     legend: { position: 'top' }
251 | }
252 | const line_chart = new ApexCharts( document.querySelector("#line-plot-area"), line_options );
253 | line_chart.render();
254 | 
255 | const plotLine = function(keys) {
256 |     const xValues = _.range(minRange, maxRange + 1);
257 |     const series = [];
258 |     keys.forEach(function(key){
259 |         const data = [];
260 |         if (line_dataType == 0) {
261 |             for (let i = minRange; i <= maxRange; i++) {
262 |                 let val = 0;
263 |                 for ( let j = 0; j < key.length; j++) { const repLen = (i*key.length) + j; const v = (plotData[key][repLen]) ? _.sum(plotData[key][repLen]) : 0; val += v; }
264 |                 data.push(val);
265 |             }
266 |         }
267 |         else { for (let i = minRange; i <= maxRange; i++){ const val = (plotData[key][i]) ? _.sum(plotData[key][i]) : 0; data.push(val); } }
268 |         series.push({'name': key, 'data': data});
269 |     })
270 |     line_chart.updateOptions({series: series, xaxis: {categories: xValues}})
271 | }
272 | $('#line-plot-button').click(function(){ plotLine(line_activeSelected); })
273 | 
274 | // Initialsing line chart
275 | plotLine(line_activeSelected);
276 | 


--------------------------------------------------------------------------------
/PERF/lib/src/main_fastq.js:
--------------------------------------------------------------------------------
  1 | /*
  2 |     The "main.js" contains core JS component supporting the PERF-Analysis module.
  3 | 
  4 |     This web application is developed with the Semantic UI framework.
  5 |     The charts are built using the ApexCharts JS charting library.
  6 | 
  7 |     All the data for the report is derived from analyse_data.js
  8 |     data = { info: {seqInfo: {}, repInfo: {}, plotInfo: {}} }
  9 | 
 10 |     plotInfo is a dictionary with key as the repeat class and value as a dictionary
 11 |     plotInfo: { REPEAT_CLASS: { LENGTH: FREQUENCY } }
 12 | 
 13 | */
 14 | 
 15 | // Updating report data
 16 | for (const key in data.info.seqInfo){$(`.value.${key}`).html(data.info.seqInfo[key])};
 17 | for (const key in data.info.repInfo){if (key != 'lenFrequency') { $(`.value.${key}`).html(data.info.repInfo[key]); }};
 18 | 
 19 | const menuLayout = function(){
 20 |     const w = window.innerWidth;
 21 |     if (w < 800) { 
 22 |         const navmenu = document.getElementById('navmenu');
 23 |         navmenu.classList.remove('vertical');
 24 |         navmenu.parentElement.style.width = '100%';
 25 |         document.getElementById('content-display').style.width = '100%';
 26 |     }
 27 |     
 28 |     else {
 29 |         const navmenu = document.getElementById('navmenu')
 30 |         navmenu.classList.add('vertical');
 31 |         navmenu.parentElement.style.width = '5%';
 32 |         document.getElementById('content-display').style.width = '95%';
 33 |     }
 34 | }
 35 | window.onresize = function(){ menuLayout(); }
 36 | // Names for repeat motif lengths, in order starting with "Monomer".
 37 | const numPrefixObj = ["Monomer","Dimer","Trimer","Tetramer","Pentamer","Hexamer","Heptamer","Octamer","Nonamer","Decamer","Undecamer","Dodecamer","Tridecamer","Tetradecamer","Pentadecamer","Hexadecamer","Heptadecamer","Octadecamer","Nonadecamer","Icosamer","Uncosamer","Docosamer","Tricosamer","Tetracosamer","Pentacosamer","Hexacosamer","Heptacosamer","Octacosamer","Nonacosamer","Triacontamer","Untriacontamer","Dotriacontamer","Tritriacontamer","Tetratriacontamer","Pentatriacontamer","Hexatriacontamer","Heptatriacontamer","Octatriacontamer","Nonatriacontamer","Tetracontamer","Untetracontamer","Dotetracontamer","Tritetracontamer","Tetratetracontamer","Pentatetracontamer","Hexatetracontamer","Heptatetracontamer","Octatetracontamer","Nonatetracontamer","Pentacontamer"]
 38 | const plotData = data.info.repInfo.lenFrequency; // presumably { repeat class: { length: frequency } } as in the header comment - TODO confirm
 39 | const allRepClasses = data.info.repInfo.allRepClasses;
 40 | // Activate the Semantic UI widgets: dropdowns, chart tabs and the length/units selector.
 41 | $('.ui.dropdown').dropdown();
 42 | $('.chart .item').tab();
 43 | $('.anno-chart .item').tab();
 44 | $('.ui .units').dropdown({values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}]});
 45 | 
 46 | 
 47 | // const read_bar_options = {
 48 | //     chart: { type: 'bar' },
 49 | //     plotOptions: { bar: { horizontal: false, columnwidth: '55%' } },
 50 | //     series: [{ data: [] }],
 51 | //     dataLabels: { enabled: false },
 52 | //     yaxis: { 'title': { 'text': 'Number of reads', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
 53 | //     xaxis: { categories: [], 'title': { 'text': 'Read length(bp)', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
 54 | //     title: { text: 'Read length distribution', align: 'left' }
 55 | // }
 56 | // const read_bar_chart = new ApexCharts(document.querySelector('#read-len-plot'), read_bar_options);
 57 | // read_chart.render();
 58 | 
 59 | 
 60 | /* 
 61 |     Bar graph
 62 |     - For bar graph we curate the data in barData with Repeat class as the key and [bases, frequency] as the value.
 63 | 
 64 |     - bar_activeSelected is the list of Repeat classes which are considered to be plotted.
 65 |         - can be selected by sort selection which is bar_sortSelected.
 66 |         - or by repeat selection dropdown stored in bar_repSelected.
 67 |         - sorted_barKeys stores the all repClasses sorted based on the datatype selected.
 68 |     
 69 |     The dataflow for the barChart is as follows.
 70 |     - There are two plot buttons dedicated individually to either plot with data based sorted keys or desired set of keys.
 71 |     - Based on which plot button is pressed that repeat set gets placed in the bar_activeSelected variable.
 72 |     - And then subsequently the data for the bar_activeSelected keys is plotted.
 73 |     - The data type and the sort customisation of the plots only deal with the bar_activeSelected keys.
 74 | 
 75 | */
 76 | 
 77 | 
 78 | const barData = {};
 79 | allRepClasses.forEach(function(e){
 80 |     if ((Object.keys(plotData).indexOf(e) != -1) && (plotData[e] != 0)) {
 81 |         const lengths = _.map(Object.keys(plotData[e]), d => { return parseInt(d); });
 82 |         let frequency = 0; let bases = 0; let reads = 0;
 83 |         for (let l in lengths) { 
 84 |             l = lengths[l]; frequency += _.sum(plotData[e][l]); bases += _.sum(plotData[e][l]) * l;
 85 |         }
 86 |         reads = parseInt(data.info.repInfo.repClassInfo[e]['reads']);
 87 |         barData[e] = [bases, frequency, reads];
 88 |     } else { barData[e] = [0, 0, 0]; }
 89 | })
 90 | let bar_dataType = 1;
 91 | const bar_dataTypes = ['Bases', 'Frequency', 'Reads'];
 92 | let bar_sortSelected = [];
 93 | let bar_sortOrder = 1;
 94 | let bar_numReps = 10;
 95 | let bar_repSelected = [];
 96 | let sorted_barKeys = _.sortBy(Object.keys(barData), k => { return barData[k][bar_dataType]; });
 97 | let bar_activeSelected = [];
 98 | 
 99 | const bar_options = {
100 |     chart: { type: 'bar' },
101 |     plotOptions: { bar: { horizontal: false, columnwidth: '55%' } },
102 |     series: [{ data: [] }],
103 |     dataLabels: { enabled: false },
104 |     yaxis: { 'title': { 'text': 'Frequency', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
105 |     xaxis: { categories: [], 'title': { 'text': 'Repeat Class', 'style': { 'fontSize': '16px', 'font-weight': 'bold' } }},
106 |     title: { text: 'Repeat Frequency', align: 'left' }
107 | }
108 | const bar_chart = new ApexCharts(document.querySelector('#bar-plot-area'), bar_options);
109 | bar_chart.render();
/*
    Redraws the bar chart for the given repeat classes.
    - keys: list of repeat classes, drawn in the order given.
    The plotted column of barData and the y-axis title follow bar_dataType.
*/
const plotBar = function(keys){
    const seriesName = bar_dataTypes[bar_dataType];
    const seriesValues = keys.map(function(repClass){ return barData[repClass][bar_dataType]; });
    const titleStyle = { 'fontSize': '16px', 'font-weight': 'bold' };
    bar_chart.updateOptions({
        series: [{'name': seriesName, data: seriesValues}],
        yaxis: { title: { 'text': seriesName, 'style': titleStyle } },
        xaxis: {categories: keys},
        animate: true
    })
}
116 | 
// Number of classes to plot. The input value arrives as a string, so it is
// parsed here; empty or non-positive input keeps the previous setting.
$('#bar-numRep').change(function(){
    const requested = parseInt(this.value, 10);
    if (requested > 0) { bar_numReps = requested; }
});
// Whether to take the classes from the top (1) or the bottom (0) of the ranking.
$('.ui .dropdown.sort-order').dropdown({
    values: [{name: 'top', value: 1, selected:true}, {name: 'bottom', value: 0}],
    onChange: function(value) { bar_sortOrder = value; }
});
// Plot the top/bottom N classes of sorted_barKeys (which is in ascending order).
$('#bar-sortPlot-button').click(function(){
    // Clamp N to the number of classes: a negative slice start would count
    // from the end of the array and return fewer classes than are available.
    const count = Math.min(bar_numReps, sorted_barKeys.length);
    if (bar_sortOrder == 1) {
        bar_sortSelected = sorted_barKeys.slice(sorted_barKeys.length - count);
        bar_sortSelected.reverse();
    }
    else { bar_sortSelected = sorted_barKeys.slice(0, count); }
    bar_activeSelected = bar_sortSelected;
    plotBar(bar_activeSelected);
})
// Draw the initial bar chart.
$('#bar-sortPlot-button').trigger("click")
128 | 
// Repeat dropdown of the bar chart: keeps bar_repSelected in sync with the widget.
$("#bar-repeat-select").multiSelect({
    selectableOptgroup: true,
    afterSelect: function(d){
        d.forEach(function(e){ if (bar_repSelected.indexOf(e) == -1) { bar_repSelected.push(e) } })
    },
    afterDeselect: function(d){
        d.forEach(function(element){
            // Only remove classes that are really present: splice(-1, 1)
            // would otherwise drop the last selected class.
            const position = bar_repSelected.indexOf(element);
            if (position != -1) { bar_repSelected.splice(position, 1); }
        });
    }
});
// Plot the explicitly selected classes, ordered like allRepClasses.
$('#bar-repPlot-button').click(function(){
    bar_repSelected = _.sortBy(bar_repSelected, o => {return allRepClasses.indexOf(o)});
    bar_activeSelected = bar_repSelected; plotBar(bar_activeSelected);
})
138 | 
// Selecting a data type updates the global bar_dataType, re-ranks
// sorted_barKeys on the new column and redraws the active selection.
$('.ui.checkbox.bar').checkbox({
    onChange: function(){
        bar_dataType = this.value;
        sorted_barKeys = _.sortBy(Object.keys(barData), function(repClass){
            return barData[repClass][bar_dataType];
        });
        plotBar(bar_activeSelected)
    }
});
146 | 
// Reorder the plotted classes by repeat-class order (position in allRepClasses),
// ascending or descending. The array is sorted in place.
$('#asort-alpha').click(function(){
    bar_activeSelected.sort((left, right) => allRepClasses.indexOf(left) - allRepClasses.indexOf(right));
    plotBar(bar_activeSelected);
})
$('#dsort-alpha').click(function(){
    bar_activeSelected.sort((left, right) => allRepClasses.indexOf(left) - allRepClasses.indexOf(right));
    bar_activeSelected.reverse();
    plotBar(bar_activeSelected);
})
// Reorder the plotted classes by the value of the active data column.
$('#asort-num').click(function(){
    bar_activeSelected = _.sortBy(bar_activeSelected, function(repClass){ return barData[repClass][bar_dataType]; });
    plotBar(bar_activeSelected);
})
$('#dsort-num').click(function(){
    bar_activeSelected = _.sortBy(bar_activeSelected, function(repClass){ return barData[repClass][bar_dataType]; });
    bar_activeSelected.reverse();
    plotBar(bar_activeSelected);
})
151 | 
152 | 
153 | /* 
154 |     Pie graph
155 |     - For pie chart as we deal with frequency and bases data of repeat classes we continue using barData for data retrieval.
156 | 
157 |     - pie_activeSelected is the list of Repeat classes which are considered to be plotted.
158 |         - As there is only one way to select repeats there will be no updating the repeats.
159 |     
160 |     The dataflow for the pieChart is as follows.
161 |     - The repeat selection dropdown adds repeat classes to, or removes them from, pie_activeSelected.
162 |     - Pressing the plot button sorts pie_activeSelected in repeat class order and plots it.
163 |     - The kmer toggle (pie_group) groups the selected classes by motif length before plotting.
164 |     - The data type selection (pie_dataType) chooses which column of barData is plotted.
165 | 
166 | */
167 | 
// Repeat classes currently selected for the pie chart. This has to be a copy:
// the deselect handler below splices entries out of it, and aliasing
// allRepClasses would delete classes from the shared master list.
let pie_activeSelected = allRepClasses.slice();
// Column of a barData entry to plot.
let pie_dataType = 1;
// When true the selected classes are grouped by motif length.
let pie_group = true;
$("#pie-kmer-toggle").checkbox({
    onChecked: function(){ pie_group = true; plotPie(pie_activeSelected) },
    onUnchecked: function(){ pie_group = false; plotPie(pie_activeSelected) }
});
$("#pie-repeat-select").multiSelect({
    selectableOptgroup: true,
    afterSelect: function(d){
        d.forEach(function(e){ if (pie_activeSelected.indexOf(e) == -1) { pie_activeSelected.push(e) } })
    },
    afterDeselect: function(d){
        d.forEach(function(element){
            // splice(-1, 1) would remove the last class, so skip unknown entries.
            const position = pie_activeSelected.indexOf(element);
            if (position != -1) { pie_activeSelected.splice(position, 1); }
        });
    }
});
$(".ui.checkbox.pie.radio.pie-data-type").checkbox({
    onChange: function(){ pie_dataType = this.value; plotPie(pie_activeSelected) }
});
// Initial configuration of the pie chart. The labels and series are
// placeholders that are replaced on the first plotPie() call.
const pie_options = {
    chart: { type: 'pie' },
    labels: ['Monomers', 'Dimers', 'Trimers', 'Tetramers', 'Pentamers', 'Hexamers'],
    series: [10, 10, 10, 10, 10, 10],
    responsive: [{ breakpoint: 480 }],
    colors: [
        "#3366cc", "#dc3912", "#ff9900", "#109618", "#990099",
        "#0099c6", "#dd4477", "#66aa00", "#b82e2e", "#316395",
        "#994499", "#22aa99", "#aaaa11", "#6633cc", "#e67300",
        "#8b0707", "#651067", "#329262", "#5574a6", "#3b3eac"
    ],
    // theme: { monochrome: { enabled: true }}
}
let pie_chart = new ApexCharts(
    document.querySelector("#pie-plot-area"),
    pie_options
);
pie_chart.render();
191 | 
/*
    Redraws the pie chart for the given repeat classes.
    - keys: list of repeat classes; the list itself is left untouched.
    - With pie_group enabled the classes are merged into one slice per motif
      length (Monomer, Dimer, ...); otherwise every class gets its own slice.
    The plotted column of barData follows pie_dataType.
*/
const plotPie = function(keys){
    // Work on a copy so the caller's array is not reordered.
    const ordered = keys.slice().sort(function(a, b){ return allRepClasses.indexOf(a) - allRepClasses.indexOf(b) });
    let labels = ordered;
    let values = [];
    if (pie_group == true) {
        // Numeric comparator: the default sort compares as strings and
        // would order the lengths as 1, 10, 11, 2, ...
        const kmer_lengths = _.uniq(_.map(ordered, e => { return e.length; })).sort(function(a, b){ return a - b; });
        // Fall back to "<n>-mer" for lengths beyond the named prefixes.
        labels = kmer_lengths.map(function(len){ return numPrefixObj[len - 1] || (len + '-mer'); });
        values = kmer_lengths.map(function(){ return 0; });
        ordered.forEach(e => { values[kmer_lengths.indexOf(e.length)] += barData[e][pie_dataType]; })
    }
    else {
        values = ordered.map(function(e){ return barData[e][pie_dataType]; });
    }
    pie_chart.updateOptions({labels: labels, series: values, animate: true})
}
206 | 
// Re-plot the pie chart with the current selection, ordered like allRepClasses.
$("#pie-plot-button").click(function(){
    const classRank = function(repClass){ return allRepClasses.indexOf(repClass); };
    pie_activeSelected = _.sortBy(pie_activeSelected, classRank);
    plotPie(pie_activeSelected);
});

// Draw the initial pie chart.
plotPie(pie_activeSelected);
214 | 
215 | /* 
216 |     Line graph
217 |     - We retrieve the data for line plots from plotData.
218 | 
219 |     - Line plot has options to
220 |         - Select repeats which will be saved in line_activeSelected variable
221 |         - Length range in which it has to plot the data
222 |     
223 |     - Required data
224 |         - The minimum length/units criteria is retrieved from the repInfo.
225 | 
226 |     The dataflow for the lineChart is as follows.
227 |     - The data flow is pretty simple: from the options selected, the relevant data is retrieved and plotted.
228 | 
229 | */
230 | 
// Minimum length / units criteria taken from the report data.
let minLength = data.info.repInfo.minLength;
let minUnits = data.info.repInfo.minUnits;
// Inclusive x-axis range plotted by plotLine().
let minRange = 12;
let maxRange = 50;
// 1: x axis is repeat length, 0: x axis is number of repeat units.
let line_dataType = 1;
// Repeat classes drawn in the line chart.
let line_activeSelected = ['A', 'C'];


$('.ui .dropdown.units').dropdown({
    values: [{name: 'length', value: 1, selected:true}, {name: 'units', value: 0}],
    onChange: function(value) { line_dataType = value;}
});
$("#line-repeat-select").multiSelect({
    selectableOptgroup: true,
    afterSelect: function(d){
        d.forEach(function(e){ if (line_activeSelected.indexOf(e) == -1) { line_activeSelected.push(e) } })
    },
    afterDeselect: function(d){
        d.forEach(function(element){
            // splice(-1, 1) would remove the last class, so skip unknown entries.
            const position = line_activeSelected.indexOf(element);
            if (position != -1) { line_activeSelected.splice(position, 1); }
        });
    }
});
// Range inputs: ignore empty / non-numeric input instead of plotting a NaN
// range, which would leave the chart empty.
$('#line-min-len').change(function(){
    const parsed = parseInt(this.value, 10);
    if (isNaN(parsed)) { return; }
    minRange = parsed; plotLine(line_activeSelected);
});
$('#line-max-len').change(function(){
    const parsed = parseInt(this.value, 10);
    if (isNaN(parsed)) { return; }
    maxRange = parsed; plotLine(line_activeSelected);
});
// Initial configuration of the line chart; the series are supplied by plotLine().
const line_options = {
    chart: {
        type: 'line',
        zoom: { enabled: false }
    },
    dataLabels: { enabled: false },
    stroke: { curve: 'straight', width: 2 },
    series: [],
    title: { text: 'Repeat sequence length(bp) vs Abundance', align: 'left' },
    grid: {
        row: { colors: ['#f3f3f3', 'transparent'], opacity: 0.5 }
    },
    tooltip: {
        x: { formatter: function(val) { return `Length: ${val}bp` } }
    },
    markers: {size: 0},
    yaxis: {
        title: { text: 'Frequency', style: { 'fontSize': '16px', 'font-weight': 'bold' } }
    },
    xaxis: {
        title: { text: 'Length (bp)', style: { 'fontSize': '16px', 'font-weight': 'bold' } },
        //  labels: { format: '%d' },
        // Computed once from the initial range.
        tickAmount: parseInt((maxRange - minRange)/2)
    },
    legend: { position: 'top' }
}
const line_chart = new ApexCharts(
    document.querySelector("#line-plot-area"),
    line_options
);
line_chart.render();
271 | 
/*
    Redraws the line chart for the given repeat classes over the inclusive
    range minRange..maxRange.
    - keys: list of repeat classes, one series per class.
    - line_dataType == 0: x is the number of repeat units; the frequencies of
      all lengths from x*motifLength to x*motifLength + motifLength - 1 are added up.
    - otherwise: x is the repeat length and its frequency is plotted directly.
    Classes or lengths without data are plotted as 0.
*/
const plotLine = function(keys) {
    const xValues = _.range(minRange, maxRange + 1);
    const series = keys.map(function(key){
        // A class without any repeats has no entry in plotData.
        const lenTable = plotData[key] || {};
        const countAt = function(repLen){ return (lenTable[repLen]) ? _.sum(lenTable[repLen]) : 0; };
        const points = [];
        for (let i = minRange; i <= maxRange; i++) {
            if (line_dataType == 0) {
                let total = 0;
                for (let j = 0; j < key.length; j++) { total += countAt((i*key.length) + j); }
                points.push(total);
            }
            else { points.push(countAt(i)); }
        }
        return {'name': key, 'data': points};
    })
    line_chart.updateOptions({series: series, xaxis: {categories: xValues}})
}
$('#line-plot-button').click(function(){ plotLine(line_activeSelected); })

// Initialising line chart
plotLine(line_activeSelected);
293 | 


--------------------------------------------------------------------------------
/PERF/lib/src/tables_fasta.js:
--------------------------------------------------------------------------------
 1 | /*
 2 |     The "tables.js" contains JS code which updates the data for
 3 |     tables present in the PERF-analysis module.
 4 | 
 5 |     This web application is developed with Semantic-ui framework.
 6 |     The charts are built using Apex-Chart js charting library.
 7 | 
 8 |     All the data for the report is derived from analyse_data.js
 9 |     data = {info: {genomeInfo: {}, repInfo: {}, plotInfo: {}}}
10 | 
11 | */
12 | 
13 | 
14 | 
/*
    Builds the repeat summary table and appends it to the element with id tableId.
    - tableId: id of the container element.
    - tableData: object keyed by repeat class; index 0 of each entry is summed
      as total bases and index 1 as total frequency (see the totals below).
    One row is written per class in allRepClasses: the class name followed by
    each value and its percentage of the corresponding total.
*/
const updateSummaryTableData = function(tableId, tableData) {
    const tableDOM = document.getElementById(tableId);

    const table = document.createElement('table');
    table.className = "ui sortable celled table";
    const tableHead = document.createElement('thead')
    const tableHeadRow = document.createElement('tr');
    // Column order follows the row data: index 0 (bases) first, then index 1 (frequency).
    const header = ['Repeat Class', 'Bases', '% Bases', 'Frequency', '% Frequency']
    header.forEach(function(title){
        const headCell = document.createElement('th');
        headCell.textContent = title;
        tableHeadRow.appendChild(headCell);
    })
    tableHead.appendChild(tableHeadRow);

    const tableBody = document.createElement('tbody');
    const totalRepBases = _.sum(_.map(Object.keys(tableData), o => { return tableData[o][0]; }));
    const totalRepFreq = _.sum(_.map(Object.keys(tableData), o => { return tableData[o][1]; }));
    const totals = [totalRepBases, totalRepFreq]
    // With a zero total the share is reported as 0.000 instead of "NaN".
    const percentOf = function(value, total){ return (total ? (value/total)*100 : 0).toFixed(3); };

    allRepClasses.forEach(function(rep){
        const row = document.createElement('tr');
        const rep_cell = document.createElement('td');
        rep_cell.textContent = rep; row.appendChild(rep_cell);

        // Only the columns that have a total (and a header) are written;
        // a class missing from tableData yields a row with just its name.
        const rowData = [];
        (tableData[rep] || []).slice(0, totals.length).forEach(function(d, i){
            rowData.push(d);
            rowData.push(percentOf(d, totals[i]));
        });
        rowData.forEach(function(e){
            const cell = document.createElement('td');
            cell.textContent = e;
            row.appendChild(cell);
        })

        tableBody.appendChild(row);
    });

    table.appendChild(tableHead);
    table.appendChild(tableBody);
    tableDOM.appendChild(table);
}

updateSummaryTableData('rep-summary-table', barData);
49 | 
/*
    Builds a table of individual repeat records and appends it to the element
    with id tableId.
    - tableId: id of the container element.
    - tableData: collection of records; from each record the fields seq, start,
      end, repClass, repLength, repOri, repUnit and actualRep are shown.
      A missing collection produces a table with only the header row.
*/
const updateLongestRepeatsTableData = function(tableId, tableData) {
    const tableDOM = document.getElementById(tableId);
    const table = document.createElement('table');
    table.className = "ui sortable celled table";
    const tableHead = document.createElement('thead')
    const tableHeadRow = document.createElement('tr');
    const header = ['Sequence id', 'Start', 'Stop', 'Repeat Class', 'Repeat length', 'Strand', 'Units', 'Actual Repeat'];
    header.forEach(function(title){
        const headCell = document.createElement('th');
        headCell.textContent = title;
        tableHeadRow.appendChild(headCell);
    })
    tableHead.appendChild(tableHeadRow);

    const tableBody = document.createElement('tbody');
    const fields = ["seq", "start", "end", "repClass", "repLength", "repOri", "repUnit", "actualRep"];
    Object.keys(tableData || {}).forEach(function(recordKey){
        const record = tableData[recordKey];
        const row = document.createElement('tr');
        fields.forEach(function(field){
            const cell = document.createElement('td');
            // textContent: record values are data, never markup, so
            // characters such as "<" must not be parsed as HTML.
            cell.textContent = record[field];
            row.appendChild(cell);
        })
        tableBody.appendChild(row);
    });

    table.appendChild(tableHead);
    table.appendChild(tableBody);
    tableDOM.appendChild(table);
}

updateLongestRepeatsTableData('longest-repeats-table', data.info.repInfo.longestRepeats);
updateLongestRepeatsTableData('mostunits-repeats-table', data.info.repInfo.mostRepeatUnits);


--------------------------------------------------------------------------------
/PERF/lib/src/tables_fastq.js:
--------------------------------------------------------------------------------
 1 | /*
 2 |     The "tables.js" contains JS code which updates the data for
 3 |     tables present in the PERF-analysis module.
 4 | 
 5 |     This web application is developed with Semantic-ui framework.
 6 |     The charts are built using Apex-Chart js charting library.
 7 | 
 8 |     All the data for the report is derived from analyse_data.js
 9 |     data = {info: {genomeInfo: {}, repInfo: {}, plotInfo: {}}}
10 | 
11 | */
12 | 
13 | 
14 | 
/*
    Builds the repeat summary table and appends it to the element with id tableId.
    - tableId: id of the container element.
    - tableData: the repInfo object; totalRepFreq, totalRepReads and
      repClassInfo[REPEAT_CLASS] (instances, instances_norm, reads, reads_norm,
      bases) are read from it.
    One row is written per class in allRepClasses; classes without an entry in
    repClassInfo get a row of zeros.
*/
const updateSummaryTableData = function(tableId, tableData) {
    const tableDOM = document.getElementById(tableId);

    const table = document.createElement('table');
    table.className = "ui sortable celled table";
    const tableHead = document.createElement('thead')
    const tableHeadRow = document.createElement('tr');
    const header = ['Repeat Class', 'Frequency', 'Frequency per million', '% Frequency', 'Reads', 'Reads per million', '% Reads', 'Bases']
    header.forEach(function(title){
        const headCell = document.createElement('th');
        headCell.textContent = title;
        tableHeadRow.appendChild(headCell);
    })
    tableHead.appendChild(tableHeadRow);

    const tableBody = document.createElement('tbody');

    const totalRepFreq = tableData.totalRepFreq;
    const totalRepReads = tableData.totalRepReads;
    const repClassInfo = tableData.repClassInfo || {};
    // With a zero total the share is reported as 0.000 instead of "NaN".
    const percentOf = function(value, total){ return (total ? (value/total)*100 : 0).toFixed(3); };

    allRepClasses.forEach(function(rep){
        const row = document.createElement('tr');
        const rep_cell = document.createElement('td');
        rep_cell.textContent = rep; row.appendChild(rep_cell);

        let rowData;
        if (rep in repClassInfo) {
            const repInfo = repClassInfo[rep];
            rowData = [
                repInfo['instances'],
                repInfo['instances_norm'],
                percentOf(repInfo['instances'], totalRepFreq),
                repInfo['reads'],
                repInfo['reads_norm'],
                percentOf(repInfo['reads'], totalRepReads),
                repInfo['bases']
            ];
        }
        else { rowData = Array(7).fill(0); }
        rowData.forEach(function(e){
            const cell = document.createElement('td');
            cell.textContent = e;
            row.appendChild(cell);
        })

        tableBody.appendChild(row);
    });

    table.appendChild(tableHead);
    table.appendChild(tableBody);
    tableDOM.appendChild(table);
}

updateSummaryTableData('rep-summary-table', data.info.repInfo);


--------------------------------------------------------------------------------
/PERF/lib/styles/apexcharts.min.css:
--------------------------------------------------------------------------------
1 | .apexcharts-canvas{position:relative;user-select:none}.apexcharts-canvas ::-webkit-scrollbar{-webkit-appearance:none;width:6px}.apexcharts-canvas ::-webkit-scrollbar-thumb{border-radius:4px;background-color:rgba(0,0,0,.5);box-shadow:0 0 1px rgba(255,255,255,.5);-webkit-box-shadow:0 0 1px rgba(255,255,255,.5)}.apexcharts-inner{position:relative}.legend-mouseover-inactive{transition:.15s ease all;opacity:.2}.apexcharts-series-collapsed{opacity:0}.apexcharts-gridline,.apexcharts-text{pointer-events:none}.apexcharts-tooltip{border-radius:5px;box-shadow:2px 2px 6px -4px #999;cursor:default;font-size:14px;left:62px;opacity:0;pointer-events:none;position:absolute;top:20px;overflow:hidden;white-space:nowrap;z-index:12;transition:.15s ease all}.apexcharts-tooltip.light{border:1px solid #e3e3e3;background:rgba(255,255,255,.96)}.apexcharts-tooltip.dark{color:#fff;background:rgba(30,30,30,.8)}.apexcharts-tooltip *{font-family:inherit}.apexcharts-area-series .apexcharts-area,.apexcharts-line,.apexcharts-tooltip .apexcharts-marker{pointer-events:none}.apexcharts-tooltip.active{opacity:1;transition:.15s ease all}.apexcharts-tooltip-title{padding:6px;font-size:15px;margin-bottom:4px}.apexcharts-tooltip.light .apexcharts-tooltip-title{background:#eceff1;border-bottom:1px solid #ddd}.apexcharts-tooltip.dark .apexcharts-tooltip-title{background:rgba(0,0,0,.7);border-bottom:1px solid #222}.apexcharts-tooltip-text-value,.apexcharts-tooltip-text-z-value{display:inline-block;font-weight:600;margin-left:5px}.apexcharts-tooltip-text-z-label:empty,.apexcharts-tooltip-text-z-value:empty{display:none}.apexcharts-tooltip-text-value,.apexcharts-tooltip-text-z-value{font-weight:600}.apexcharts-tooltip-marker{width:12px;height:12px;position:relative;top:0;margin-right:10px;border-radius:50%}.apexcharts-tooltip-series-group{padding:0 10px;display:none;text-align:left;justify-content:left;align-items:center}.apexcharts-tooltip-series-group.active 
.apexcharts-tooltip-marker{opacity:1}.apexcharts-tooltip-series-group.active,.apexcharts-tooltip-series-group:last-child{padding-bottom:4px}.apexcharts-tooltip-y-group{padding:6px 0 5px}.apexcharts-tooltip-candlestick{padding:4px 8px}.apexcharts-tooltip-candlestick>div{margin:4px 0}.apexcharts-tooltip-candlestick span.value{font-weight:700}.apexcharts-xaxistooltip{opacity:0;padding:9px 10px;pointer-events:none;color:#373d3f;font-size:13px;text-align:center;border-radius:2px;position:absolute;z-index:10;background:#eceff1;border:1px solid #90a4ae;transition:.15s ease all}.apexcharts-xaxistooltip:after,.apexcharts-xaxistooltip:before{left:50%;border:solid transparent;content:" ";height:0;width:0;position:absolute;pointer-events:none}.apexcharts-xaxistooltip:after{border-color:rgba(236,239,241,0);border-width:6px;margin-left:-6px}.apexcharts-xaxistooltip:before{border-color:rgba(144,164,174,0);border-width:7px;margin-left:-7px}.apexcharts-xaxistooltip-bottom:after,.apexcharts-xaxistooltip-bottom:before{bottom:100%}.apexcharts-xaxistooltip-bottom:after{border-bottom-color:#eceff1}.apexcharts-xaxistooltip-bottom:before{border-bottom-color:#90a4ae}.apexcharts-xaxistooltip-top:after,.apexcharts-xaxistooltip-top:before{top:100%}.apexcharts-xaxistooltip-top:after{border-top-color:#eceff1}.apexcharts-xaxistooltip-top:before{border-top-color:#90a4ae}.apexcharts-xaxistooltip.active{opacity:1;transition:.15s ease all}.apexcharts-yaxistooltip{opacity:0;padding:4px 10px;pointer-events:none;color:#373d3f;font-size:13px;text-align:center;border-radius:2px;position:absolute;z-index:10;background:#eceff1;border:1px solid #90a4ae}.apexcharts-yaxistooltip:after,.apexcharts-yaxistooltip:before{top:50%;border:solid transparent;content:" 
";height:0;width:0;position:absolute;pointer-events:none}.apexcharts-yaxistooltip:after{border-color:rgba(236,239,241,0);border-width:6px;margin-top:-6px}.apexcharts-yaxistooltip:before{border-color:rgba(144,164,174,0);border-width:7px;margin-top:-7px}.apexcharts-yaxistooltip-left:after,.apexcharts-yaxistooltip-left:before{left:100%}.apexcharts-yaxistooltip-left:after{border-left-color:#eceff1}.apexcharts-yaxistooltip-left:before{border-left-color:#90a4ae}.apexcharts-yaxistooltip-right:after,.apexcharts-yaxistooltip-right:before{right:100%}.apexcharts-yaxistooltip-right:after{border-right-color:#eceff1}.apexcharts-yaxistooltip-right:before{border-right-color:#90a4ae}.apexcharts-yaxistooltip.active{opacity:1}.apexcharts-xcrosshairs,.apexcharts-ycrosshairs{pointer-events:none;opacity:0;transition:.15s ease all}.apexcharts-xcrosshairs.active,.apexcharts-ycrosshairs.active{opacity:1;transition:.15s ease all}.apexcharts-ycrosshairs-hidden{opacity:0}.apexcharts-zoom-rect{pointer-events:none}.apexcharts-selection-rect{cursor:move}.svg_select_points,.svg_select_points_rot{opacity:0;visibility:hidden}.svg_select_points_l,.svg_select_points_r{cursor:ew-resize;opacity:1;visibility:visible;fill:#888}.apexcharts-canvas.zoomable .hovering-zoom{cursor:crosshair}.apexcharts-canvas.zoomable .hovering-pan{cursor:move}.apexcharts-xaxis,.apexcharts-yaxis{pointer-events:none}.apexcharts-menu-icon,.apexcharts-pan-icon,.apexcharts-reset-zoom-icon,.apexcharts-selection-icon,.apexcharts-zoom-icon,.apexcharts-zoom-in-icon,.apexcharts-zoom-out-icon{cursor:pointer;width:20px;height:20px;text-align:center}.apexcharts-menu-icon svg,.apexcharts-reset-zoom-icon svg,.apexcharts-zoom-icon svg,.apexcharts-zoom-in-icon svg,.apexcharts-zoom-out-icon svg{fill:#6e8192}.apexcharts-selection-icon svg{fill:#444;transform:scale(.76)}.apexcharts-reset-zoom-icon.selected svg,.apexcharts-selection-icon.selected svg,.apexcharts-zoom-icon.selected svg{fill:#008ffb}.apexcharts-menu-icon:hover 
svg,.apexcharts-reset-zoom-icon:hover svg,.apexcharts-selection-icon:not(.selected):hover svg,.apexcharts-zoom-icon:not(.selected):hover svg,.apexcharts-zoom-in-icon:hover svg,.apexcharts-zoom-out-icon:hover svg{fill:#333}.apexcharts-menu-icon,.apexcharts-selection-icon{position:relative}.apexcharts-reset-zoom-icon{margin-left:5px}.apexcharts-menu-icon,.apexcharts-reset-zoom-icon,.apexcharts-zoom-icon{transform:scale(.85)}.apexcharts-zoom-in-icon,.apexcharts-zoom-out-icon{transform:scale(.7)}.apexcharts-zoom-out-icon{margin-right:3px}.apexcharts-pan-icon{transform:scale(.62);position:relative;left:1px;top:0}.apexcharts-pan-icon svg{fill:#fff;stroke:#6e8192;stroke-width:2}.apexcharts-pan-icon.selected svg{stroke:#008ffb}.apexcharts-pan-icon:not(.selected):hover svg{stroke:#333}.apexcharts-toolbar{position:absolute;z-index:11;top:0;right:3px;max-width:176px;text-align:right;border-radius:3px;padding:0 6px 2px 6px;display:flex;justify-content:space-between;align-items:center}.apexcharts-toolbar svg{pointer-events:none}.apexcharts-menu{background:#fff;position:absolute;top:100%;border:1px solid #ddd;border-radius:3px;padding:3px;right:10px;opacity:0;min-width:110px;transition:.15s ease all;pointer-events:none}.apexcharts-menu.open{opacity:1;pointer-events:all;transition:.15s ease all}.apexcharts-menu-item{padding:6px 7px;font-size:12px;cursor:pointer}.apexcharts-menu-item:hover{background:#eee}@media screen and (min-width:768px){.apexcharts-canvas:hover .apexcharts-toolbar{opacity:1}}.apexcharts-datalabel.hidden{opacity:0}.apexcharts-datalabel,.apexcharts-datalabel-label,.apexcharts-datalabel-value,.apexcharts-pie-label{cursor:default;pointer-events:none}.apexcharts-pie-label-delay{opacity:0;animation-name:opaque;animation-duration:.3s;animation-fill-mode:forwards;animation-timing-function:ease}.apexcharts-canvas .hidden{opacity:0}.apexcharts-hide .apexcharts-series-points{opacity:0}.apexcharts-area-series .apexcharts-series-markers 
.apexcharts-marker.no-pointer-events,.apexcharts-line-series .apexcharts-series-markers .apexcharts-marker.no-pointer-events,.apexcharts-radar-series path,.apexcharts-radar-series polygon{pointer-events:none}.apexcharts-marker{transition:.15s ease all}@keyframes opaque{0%{opacity:0}100%{opacity:1}}


--------------------------------------------------------------------------------
/PERF/lib/styles/main.css:
--------------------------------------------------------------------------------
 1 | #navbar {
 2 |     position: fixed;
 3 |     width: 50px;
 4 |     height: 50px;
 5 |     top: 100px; right: 0px; bottom: 0px; left: 20px;
 6 |     z-index: 100;
 7 |     opacity: 0.5
 8 | }
 9 | 
10 | #navbar:hover {
11 |     opacity: 1;
12 | }
13 | 
14 | #content {
15 |     min-height: 100px;
16 | }
17 | 
18 | .ui.grid{
19 |     padding: 0 !important;
20 | }
21 | 
22 | .pushable.segment{
23 |     margin: 0 !important;
24 | }
25 | 
26 | .sumstat-segment {
27 |     box-shadow: none !important;
28 |     padding: 8px !important;
29 |     margin: 0px !important;
30 |     border: white !important;
31 | }
32 | 
33 | 
34 | 
35 | .ms-selectable {
36 |     float: none !important;
37 |     width: 100% !important;
38 | }
39 | 
40 | .ms-container {
41 |     width: 100% !important;
42 | }
43 | 
44 | .ms-selection {
45 |     margin-top: 10px;
46 |     float: none !important;
47 |     width: 100% !important;
48 | }
49 | 
50 | .button  {
51 |     margin-top: 5px !important
52 | }
53 | 
54 | .plot-options {
55 |     min-width: 200px !important;
56 | }


--------------------------------------------------------------------------------
/PERF/lib/styles/multi-select.min.css:
--------------------------------------------------------------------------------
1 | .ms-container{background:transparent url(../img/switch.png) no-repeat 50% 50%;width:370px}.ms-container:after{content:".";display:block;height:0;line-height:0;font-size:0;clear:both;min-height:0;visibility:hidden}.ms-container .ms-selectable,.ms-container .ms-selection{background:#fff;color:#555;float:left;width:45%}.ms-container .ms-selection{float:right}.ms-container .ms-list{-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,.075);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,.075);box-shadow:inset 0 1px 1px rgba(0,0,0,.075);-webkit-transition:border linear .2s,box-shadow linear .2s;-moz-transition:border linear .2s,box-shadow linear .2s;-ms-transition:border linear .2s,box-shadow linear .2s;-o-transition:border linear .2s,box-shadow linear .2s;transition:border linear .2s,box-shadow linear .2s;border:1px solid #ccc;-webkit-border-radius:3px;-moz-border-radius:3px;border-radius:3px;position:relative;height:200px;padding:0;overflow-y:auto}.ms-container .ms-list.ms-focus{border-color:rgba(82,168,236,.8);-webkit-box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);-moz-box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);box-shadow:inset 0 1px 1px rgba(0,0,0,.075),0 0 8px rgba(82,168,236,.6);outline:0}.ms-container ul{margin:0;list-style-type:none;padding:0}.ms-container .ms-optgroup-container{width:100%}.ms-container .ms-optgroup-label{margin:0;padding:5px 0 0 5px;cursor:pointer;color:#999}.ms-container .ms-selectable li.ms-elem-selectable,.ms-container .ms-selection li.ms-elem-selection{border-bottom:1px #eee solid;padding:2px 10px;color:#555;font-size:14px}.ms-container .ms-selectable li.ms-hover,.ms-container .ms-selection li.ms-hover{cursor:pointer;color:#fff;text-decoration:none;background-color:#08c}.ms-container .ms-selectable li.disabled,.ms-container .ms-selection li.disabled{background-color:#eee;color:#aaa;cursor:text}


--------------------------------------------------------------------------------
/PERF/lib/template_fastq.html:
--------------------------------------------------------------------------------
  1 | <!DOCTYPE html>
  2 | <html>
  3 | <head>
  4 |     <meta charset="utf-8" />
  5 |     <meta http-equiv="X-UA-Compatible" content="IE=edge">
  6 |     <title>PERF analysis report</title>
  7 |     <meta name="viewport" content="width=device-width, initial-scale=1">
  8 |     <script>
  9 |         {fontawesome_js}
 10 |     </script>
 11 |     <style>
 12 |         {semantic_css}
 13 |     </style>
 14 |     <style>
 15 |         {multiselect_css}
 16 |     </style>
 17 |     <style>
 18 |         {apexcharts_css}
 19 |     </style>
 20 |     <style>
 21 |         {main_css}
 22 |     </style>
 23 | </head>
 24 | <body>
 25 |     
 26 |     <!-- Header section -->
 27 |     <div class="ui masthead vertical segment" style="background-color: rgb(49, 9, 82); padding-top: 20px; padding-bottom: 20px">
 28 |         <div class="ui container">
 29 |             <div class="introduction">
 30 |                 <p class="ui header" style="color: white">PERF analysis report
 31 |                     <a class="ui label" href="https://github.com/RKMlab/perf" target="_blank" style="float: right">
 32 |                         <span style="font-size: 14px">
 33 |                             <i class="fab fa-github"></i>
 34 |                             PERF 
 35 |                         </span>
 36 |                     </a>
 37 |                 </p>
 38 |             </div>
 39 |         </div>
 40 |     </div>
 41 | 
 42 |     <div class="ui hidden divider"></div>
 43 | 
 44 |     <div class="ui contatiner" style="padding: 1vw">
 45 |         <div id='navbar'>
 46 |             <div class="ui vertical icon menu inverted" id = "navmenu" style="background-color: rgb(49, 9, 82)">
 47 |                 <a class="item active" title='Summary' href="#summary-main"><i class="fas fa-info-circle fa-2x"></i></a>
 48 |                 <a class="item" title="Tables" href="#table-main"><i class="fas fa-table fa-2x"></i></a>
 49 |                 <a class="item" title="Charts" href="#charts-main"><i class="fas fa-chart-bar fa-2x"></i></a>
 50 |             </div>
 51 |         </div>
 52 |         <div class="ui one column grid">
 53 |             <div class="column" id="content-display" style="width: 100%;">
 54 |                 
 55 |                 <!-------------------- Summary section -------------------->
 56 |                 <div class="ui top attached segment" id="summary-main">
 57 |                     <div class="ui top attached label inverted" style="font-size: 14px; background-color: rgb(49, 9, 82)">Summary</div>
 58 |                     <div class="ui hidden divider"></div>
 59 | 
 60 |                     <div class="ui two column stackable grid">
 61 |                         <div class="stretched row">
 62 |                             <div class="column center aligned" style="width: 35%">
 63 |                                 <div class="ui top attached segment">
 64 |                                     <div class="ui top attached inverted green label" style="font-size: 12px">Sequence Info</div>
 65 |                                     <div class="column center aligned">
 66 |                                         <div class="ui segment sumstat-segment">
 67 |                                             <div class="ui mini green statistic">
 68 |                                                 <div class="value File_name">Homo sapiens.fna</div>
 69 |                                                 <div class="label">File Name</div>
 70 |                                             </div>
 71 |                                         </div>
 72 |                                         <div class="ui segment sumstat-segment">
 73 |                                             <div class="ui mini green statistic">
 74 |                                                 <div class="value Total_reads">3.24Gb</div>
 75 |                                                 <div class="label">Total reads</div>
 76 |                                             </div>
 77 |                                         </div>
 78 |                                         <div class="ui segment sumstat-segment">
 79 |                                             <div class="ui mini green statistic">
 80 |                                                 <div class="value Readlen_range">45.4</div>
 81 |                                                 <div class="label">Read length range</div>
 82 |         
 83 |                                             </div>
 84 |                                         </div>
 85 |                                         <div class="ui segment sumstat-segment">
 86 |                                             <div class="ui mini green statistic">
 87 |                                                 <div class="value Total_bases">545</div>
 88 |                                                 <div class="label">Total bases</div>
 89 |                                             </div>
 90 |                                         </div>
 91 |                                     </div>
 92 |                                 </div>
 93 |                             </div>
 94 |                             
 95 |                             <div class="column center aligned" style="width: 65%">
 96 |                                 <div class="ui top attached segment">
 97 |                                     <div class="ui top attached inverted blue label" style="font-size: 12px">Repeat Info</div>
 98 |                                     <div class="ui two column stackable grid">
 99 |                                         <div class="stretched row">
100 |                                             <div class="column center aligned">
101 |                                                 <div class="ui segment sumstat-segment">
102 |                                                     <div class="ui mini blue statistic">
103 |                                                         <div class="value totalRepFreq">4,631,324</div>
104 |                                                         <div class="label">Total repeats</div>
105 |                                                     </div>
106 |                                                 </div>
107 |                                                 <div class="ui segment sumstat-segment">
108 |                                                     <div class="ui mini blue statistic">
109 |                                                         <div class="value totalRepReads">69.09 Mb</div>
110 |                                                         <div class="label">Reads with repeats</div>
111 |                                                     </div>
112 |                                                 </div>
113 |                                                 <div class="ui segment sumstat-segment">
114 |                                                     <div class="ui mini blue statistic">
115 |                                                         <div class="value percentRepReadsNorm">2.15 %</div>
116 |                                                         <div class="label">Percent reads with repeats</div>
117 |                                                     </div>
118 |                                                 </div>
119 |                                             </div>
120 |                                             <div class="column center aligned">
121 |                                                 <div class="ui segment sumstat-segment">
122 |                                                     <div class="ui mini blue statistic">
123 |                                                         <div class="value numRepClasses">501/501</div>
124 |                                                         <div class="label">Repeat classes</div>
125 |                                                     </div>
126 |                                                 </div>
127 |                                                 <div class="ui segment sumstat-segment">
128 |                                                     <div class="ui mini blue statistic">
129 |                                                         <div class="value totalRepFreqNorm">1428</div>
130 |                                                         <div class="label">Repeats per million reads</div>
131 |                                                     </div>
132 |                                                 </div>
133 |                                                 <div class="ui segment sumstat-segment">
134 |                                                     <div class="ui mini blue statistic">
135 |                                                         <div class="value percentRepBases">21467</div>
136 |                                                         <div class="label">Percent repeat bases</div>
137 |                                                     </div>
138 |                                                 </div>
139 |                                             </div>
140 |                                         </div>
141 |                                     </div>
142 |                                 </div>
143 |                             </div>
144 |                         </div>
145 |                     </div>
146 |                 </div>
147 | 
148 |                 <!-- <div class="ui top attached segment" id="reads-plot">
149 |                     <div class="ui top attached label inverted" style="font-size: 14px; background-color: rgb(49, 9, 82)">Read length distribution</div>
150 |                     
151 |                     <div class="container">
152 |                         <div class="column"  style="height: 30vh; overflow-y: scroll" id="read-len-plot"></div>
153 |                     </div>
154 |                 </div> -->
155 | 
156 |                 <!-------------------- Table section -------------------->
157 |                 <div class="ui top attached segment" id="table-main">
158 |                     <div class="ui top attached label inverted" style="font-size: 14px; background-color: rgb(49, 9, 82)">Summary Table</div>
159 |                     
160 |                     <div class="container">
161 |                         <div class="column"  style="height: 30vh; overflow-y: scroll" id="rep-summary-table"></div>
162 |                     </div>
163 |                 </div>
164 | 
165 | 
166 |                 <!-- Main charts  -->
167 |                 <div class="ui top attached segment" id="charts-main">
168 |                     
169 |                     <div class="ui top attached label inverted" style="font-size: 14px; background-color: rgb(49, 9, 82); padding: 0px">
170 |                         <div class="ui left floated icon menu inverted" style="background-color: rgb(49, 9, 82); cursor: none">
171 |                             <a class="item" title="">Basic charts</a>
172 |                         </div>
173 |                         <div class="ui left floated tabular menu inverted chart" style="background-color: rgb(49, 9, 82)">
174 |                             <a class="item active" title="Repeat class Frequency" data-tab="bar"><i class="fas fa-chart-bar"></i></a>
175 |                             <a class="item" title="Motif distributions" data-tab="pie"><i class="fas fa-chart-pie"></i></a>
176 |                             <a class="item" title="Repeat Length Frequency" data-tab="line"><i class="fas fa-chart-line"></i></a>
177 |                         </div>
178 |                     </div>
179 | 
180 |                     <div class="ui hidden divider"></div>
181 | 
182 |                     <div class="container">
183 |                         <div class="ui bottom attached tab segment active" data-tab="bar">
184 |                             <div class="ui two column stackable grid">
185 |                                 <div class="column plot-options" style="width: 30%" id="bar-options">
186 |                                     <div class="ui top attached segment">
187 |                                         <div class="ui top attached label" style="font-size: 12px">Options</div>
188 |                                         
189 | 
190 |                                         <div>
191 |                                             Show
192 |                                             <div class="ui inline dropdown sort-order" id="bar-sort-order" style="margin-left: 5px; margin-right: 5px">
193 |                                                 <input type="hidden" name="sort-order">
194 |                                                 <div class="default text">Order</div>
195 |                                                 <div class="menu"></div>
196 |                                                 <i class="fas fa-caret-square-down"></i>
197 |                                             </div>
198 |                                             <div class="ui input"><input id="bar-numRep" style="border-radius: 0px; border-left: none; border-right: none; border-top: none; border-bottom: 1px solid black; padding: 4px; font-weight: bold" type="number" value=10 min=1 max=501></div>
199 |                                         </div>
200 |                                         <button class="ui primary button" id="bar-sortPlot-button">Plot</button>
201 | 
202 |                                         <div class="ui hidden divider"></div>
203 |                                         <div>
204 |                                             <select multiple="multiple" id="bar-repeat-select" name="bar-repeat-select">
205 |                                                 {repeat_options}
206 |                                             </select>
207 |                                         </div>
208 |                                         <button class="ui primary button" id="bar-repPlot-button">Plot</button>                                    
209 |                                         
210 |                                         <div class="ui hidden divider"></div>
211 |                                         <div class="ui form">
212 |                                             <div class="inline fields">
213 |                                                 <label for="bar-datatype" style="font-size: 12px; font-weight: bold">Data:</label>
214 |                                                 <div class="field">
215 |                                                     <div class="bar ui radio checkbox">
216 |                                                         <input type="radio" name="bar-datatype" value="0" class="hidden">
217 |                                                         <label style="font-size: 12px; font-weight: bold">Bases</label>
218 |                                                     </div>
219 |                                                 </div>
220 |                                                 <div class="field">
221 |                                                     <div class="bar ui radio checkbox">
222 |                                                         <input type="radio" name="bar-datatype" checked value="1" class="hidden">
223 |                                                         <label style="font-size: 12px; font-weight: bold">Frequency</label>
224 |                                                     </div>
225 |                                                 </div>
226 |                                                 <div class="field">
227 |                                                     <div class="bar ui radio checkbox">
228 |                                                         <input type="radio" name="bar-datatype" value="2" class="hidden">
229 |                                                         <label style="font-size: 12px; font-weight: bold">Reads</label>
230 |                                                     </div>
231 |                                                 </div>
232 |                                             </div>
233 |                                         </div>
234 |                                         
235 |                                         <div class="ui hidden divider"></div>
236 |                                         <button class="ui secondary icon button" id="asort-alpha"><i class="fas fa-sort-alpha-down"></i></button>
237 |                                         <button class="ui secondary icon button" id="dsort-alpha"><i class="fas fa-sort-alpha-down-alt"></i></button>
238 |                                         <button class="ui secondary icon button" id="asort-num"><i class="fas fa-sort-amount-down-alt fa-rotate-270"></i></button>
239 |                                         <button class="ui secondary icon button" id="dsort-num"><i class="fas fa-sort-amount-down fa-rotate-270"></i></button>
240 |                                     </div>
241 |                                 </div>
242 |                             
243 |                                 <div class="column" style="width: 70%">
244 |                                     <div class="plot-area" id="bar-plot-area"></div>
245 |                                 </div>
246 |                             </div>
247 |                         </div>
248 |                         
249 |                         <div class="ui bottom attached tab segment" data-tab="pie">
250 |                             <div class="ui two column stackable grid">
251 |                                 <div class="column plot-options" style="width: 30%">
252 |                                     <div class="ui top attached segment">
253 |                                         <div class="ui top attached label" style="font-size: 12px">Options</div>
254 |                                                                             
255 |                                         <div class="inline field">
256 |                                             <div class="ui toggle checkbox pie" id="pie-kmer-toggle">
257 |                                                 <input type="checkbox" tabindex="0" class="hidden" checked>
258 |                                                 <label style="font-size: 12px; font-weight: bold">Kmer grouping</label>
259 |                                             </div>
260 |                                         </div>
261 | 
262 |                                         <div class="ui hidden divider"></div>
263 |                                         
264 |                                         <div style="font-weight: bold; margin-bottom: 3px">Select Repeats: </div>
265 |                                         <div>
266 |                                             <select multiple="multiple" id="pie-repeat-select" name="pie-repeat-select">
267 |                                                 {repeat_options}
268 |                                             </select>
269 |                                         </div>
270 | 
271 |                                         <button class="ui primary button" id="pie-plot-button">
272 |                                             Plot
273 |                                         </button>
274 | 
275 |                                         <div class="ui hidden divider"></div>
276 | 
277 |                                         <div class="ui form">
278 |                                             <div class="inline fields">
279 |                                                 <label for="pie-datatype" style="font-size: 12px; font-weight: bold">Data:</label>
280 |                                                 <div class="field">
281 |                                                     <div class="pie ui radio checkbox pie-data-type">
282 |                                                         <input type="radio" name="pie-datatype" value="0" class="hidden">
283 |                                                         <label style="font-size: 12px; font-weight: bold">Bases</label>
284 |                                                     </div>
285 |                                                 </div>
286 |                                                 <div class="field">
287 |                                                     <div class="pie ui radio checkbox pie-data-type">
288 |                                                         <input type="radio" name="pie-datatype" checked value="1" class="hidden">
289 |                                                         <label style="font-size: 12px; font-weight: bold">Frequency</label>
290 |                                                     </div>
291 |                                                 </div>
292 |                                                 <div class="field">
293 |                                                     <div class="pie ui radio checkbox pie-data-type">
294 |                                                         <input type="radio" name="pie-datatype" value="2" class="hidden">
295 |                                                         <label style="font-size: 12px; font-weight: bold">Reads</label>
296 |                                                     </div>
297 |                                                 </div>
298 |                                             </div>
299 |                                         </div>
300 | 
301 |                                     </div>
302 |                                 </div>
303 |                                 <div class="column" style="width: 70%">
304 |                                     <div class="plot-area" id="pie-plot-area">
305 |                                         
306 |                                     </div>
307 |                                 </div>
308 |                             </div>
309 |                         </div>
310 |                         
311 |                         <div class="ui bottom attached tab segment" data-tab="line">
312 |                             <div class="ui two column stackable grid">
313 |                                 <div class="column plot-options" style="width: 30%">
314 |                                     <div class="ui top attached segment">
315 |                                         <div class="ui top attached label" style="font-size: 12px">Options</div>
316 |                                         <div>
317 |                                             <div class="ui input"><input id="line-min-len" style="border-radius: 0px; border-left: none; border-right: none; border-top: none; border-bottom: 1px solid black; padding: 4px; font-weight: bold" type="number" value=12 min=1></div>
318 |                                             <span style="margin-left: 2px; margin-right: 2px">to</span>
319 |                                             <div class="ui input"><input id="line-max-len" style="border-radius: 0px; border-left: none; border-right: none; border-top: none; border-bottom: 1px solid black; padding: 4px; font-weight: bold" type="number" value=50 min=1></div>
320 |                                             <div class="ui inline dropdown units" style="margin-left: 5px; margin-right: 5px">
321 |                                                 <input type="hidden" name="units">
322 |                                                 <div class="default text">length</div>
323 |                                                 <div class="menu"></div>
324 |                                                 <i class="fas fa-caret-square-down"></i>
325 |                                             </div>
326 |                                             range
327 |                                         </div>
328 |                                         <div class="ui hidden divider"></div>
329 |                                         <div>
330 |                                             <select multiple="multiple" id="line-repeat-select" name="line-repeat-select">
331 |                                                 {repeat_options}
332 |                                             </select>
333 |                                         </div>
334 |                                         <button class="ui primary button" id="line-plot-button">
335 |                                             Plot
336 |                                         </button>
337 | 
338 |                                     </div>
339 |                                 </div>
340 |                                 <div class="column plot-area" id="line-plot-area"  style="width: 70%">
341 |                                     <div>
342 |                                         
343 |                                     </div>
344 |                                 </div>
345 |                             </div>
346 |                         </div>
347 |                     </div>
348 |                 </div>
349 | 
350 |                 <!-- Annotation Charts -->
351 |                 <div class="ui top attached segment" id="anno-charts-main">
352 |                     
353 |                     <div class="ui top attached label inverted" style="font-size: 14px; background-color: rgb(49, 9, 82); padding: 0px">
354 |                         <div class="ui left floated icon menu inverted" style="background-color: rgb(49, 9, 82); cursor: none">
355 |                             <a class="item" title="">Annotation charts</a>
356 |                         </div>
357 |                         <div class="ui left floated tabular menu inverted anno-chart" style="background-color: rgb(49, 9, 82)">
358 |                             <a class="item active" title="Repeat class Genomic distributions" data-tab="anno-stackbar"><i class="fas fa-chart-pie"></i></a>
359 |                             <a class="item" title="Compare Repeat class Genomic distributions" data-tab="anno-area"><i class="fas fa-chart-area"></i></a>
360 |                         </div>
361 |                     </div>
362 | 
363 |                     <div class="ui hidden divider"></div>
364 | 
365 |                     <div class="container">
366 |                         <div class="ui bottom attached tab segment active" data-tab="anno-stackbar">
367 |                             <div class="ui two column stackable grid">
368 |                                 <div class="column plot-options" style="width: 30%">
369 |                                     <div class="ui top attached segment">
370 |                                         <div class="ui top attached label" style="font-size: 12px">Options</div>
371 |                                         
372 |                                         <div class="inline field" style="margin-bottom: 20px">
373 |                                             <div class="ui toggle checkbox anno-stackbar" id="anno-stackbar-group-toggle">
374 |                                                 <input type="checkbox" tabindex="0" class="hidden">
375 |                                                 <label style="font-size: 12px; font-weight: bold">Group</label>
376 |                                             </div>
377 |                                         </div>
378 |                                         
379 |                                         <div>
380 |                                             <select multiple="multiple" id="anno-stackbar-repeat-select" name="anno-stackbar-repeat-select">
381 |                                                 {repeat_options}
382 |                                             </select>
383 |                                         </div>
384 | 
385 |                                         <button class="ui primary button" id="anno-stackbar-plot-button">Plot</button> 
386 |                                     </div>
387 |                                 </div>
388 |                                 <div class="column" style="width: 70%">
389 |                                     <div class="plot-area" id="anno-stackbar-plot-area">
390 |                                         
391 |                                     </div>
392 |                                 </div>
393 |                             </div>
394 |                         </div>
395 | 
396 |                         <div class="ui bottom attached tab segment" data-tab="anno-area">
397 |                             <div class="ui two column stackable grid">
398 |                                 <div class="column plot-options" style="width: 30%">
399 |                                     <div class="ui top attached segment">
400 |                                         <div class="ui top attached label" style="font-size: 12px">Options</div>
401 | 
402 |                                         <div style="margin-bottom: 20px">
403 |                                             Bin size: 
404 |                                             <div class="ui inline dropdown bin-size" id="bin-size" style="margin-left: 5px; margin-right: 5px">
405 |                                                 <input type="hidden" name="bin-size">
406 |                                                 <div class="default text">Bin size</div>
407 |                                                 <div class="menu"></div>
408 |                                                 <i class="fas fa-caret-square-down"></i>
409 |                                             </div>
410 |                                         </div>
411 | 
412 |                                         <div>
413 |                                             <select multiple="multiple" id="anno-area-repeat-select" name="anno-area-repeat-select">
414 |                                                 {repeat_options}
415 |                                             </select>
416 |                                         </div>
417 |                                         <button class="ui primary button" id="anno-area-plot-button">Plot</button>
418 |                                     </div>
419 |                                 </div>
420 |                             
421 |                                 <div class="column" style="width: 70%">
422 |                                     <div class="plot-area" id="anno-area-plot-area"></div>
423 |                                 </div>
424 |                             </div>
425 |                         </div>
426 | 
427 |                     </div>
428 | 
429 |                 </div>
430 |             </div>
431 |         </div>
432 |     </div>
433 | </body>
434 | <script>
435 |     {jquery_js}
436 | </script>
437 | <script>
438 |     {semantic_js}
439 | </script>
440 | <script>
441 |     {multiselect_js}
442 | </script>
443 | <script>
444 |     {apexcharts_js}
445 | </script>
446 | <script>
447 |     {lodash_js}
448 | </script>
449 | <script>
450 |     {analyse_data_js}
451 | </script>
452 | <script>
453 |     {main_js}
454 | </script>
455 | <script>
456 |     {tables_js}
457 | </script>
458 | </html>


--------------------------------------------------------------------------------
/PERF/rep_utils.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # pylint: disable=C0111, C0301
  3 | 
  4 | from __future__ import print_function, division
  5 | from itertools import product
  6 | from Bio import SeqIO
  7 | from tqdm import tqdm
  8 | import sys, gzip, os
  9 | from os import remove as del_file
 10 | import multiprocessing as multi
 11 | 
 12 | if sys.version_info.major == 2:
 13 |     from utils import rev_comp, rawcharCount, getGC, get_targetids
 14 |     from analyse import analyse_fasta
 15 |     from annotation import annotate
 16 | elif sys.version_info.major == 3:
 17 |     from .utils import rev_comp, rawcharCount, getGC, get_targetids
 18 |     from .analyse import analyse_fasta
 19 |     from .annotation import annotate
 20 | 
 21 | def num_factors(num):
 22 |     factors = []
 23 |     for i in range(1,num):
 24 |         if num%i == 0: factors.append(i)
 25 |     return factors
 26 | 
 27 | def expand_repeat(string, size):
 28 |     """Expands a motif to highest motif size, used for checking duplicates"""
 29 |     return_string = ''
 30 |     i = 0
 31 |     while len(return_string) < size:
 32 |         return_string += string[i]
 33 |         i += 1
 34 |         if i >= len(string):
 35 |             i = 0
 36 |     return return_string
 37 | 
 38 | 
 39 | def get_cycles(string):
 40 |     cycles = []
 41 |     for i in range(len(string)):
 42 |         cycles.append(string[i:] + string[:i])
 43 |     return cycles
 44 | 
 45 | 
 46 | def generate_repeats(sizes, atomic):
 47 |     """Generates all possible motifs for repeats in a given length range"""
 48 |     generated_repeats = []
 49 |     alphabet = ['A', 'C', 'G', 'T']
 50 |     expanded_set = set()
 51 |     repeat_set = set()
 52 |     sizes.sort()
 53 |     min_size = sizes[0]
 54 |     max_size = sizes[-1]
 55 |     non_atomic_repeats = dict()
 56 |     for s in range(1, max_size):
 57 |         if s not in sizes:
 58 |             non_atomic_repeats[s] = set()
 59 |             if atomic:
 60 |                 for combination in product(alphabet, repeat=s):
 61 |                     repeat = ''.join(combination)
 62 |                     expanded = expand_repeat(repeat, max_size)
 63 |                     non_atomic_repeats[s].add(expanded)
 64 |     for i in sizes:
 65 |         factors = num_factors(i)
 66 |         for combination in product(alphabet, repeat=i):
 67 |             repeat = ''.join(combination)
 68 |             repeat_revcomp = rev_comp(repeat)
 69 |             expanded = expand_repeat(repeat, max_size)
 70 |             atomic_check = False
 71 |             if atomic:
 72 |                 for factor in factors:
 73 |                     if factor not in sizes and expanded in non_atomic_repeats[factor]:
 74 |                         atomic_check = True
 75 |             if expanded in expanded_set:
 76 |                 continue
 77 |             elif atomic and atomic_check:
 78 |                 continue
 79 |             else:
 80 |                 repeat_cycles = get_cycles(repeat)
 81 |                 for cycle in repeat_cycles:
 82 |                     strand = '+'
 83 |                     string = expand_repeat(cycle, max_size)
 84 |                     expanded_set.add(string)
 85 |                     if cycle not in repeat_set:
 86 |                         repeat_set.add(cycle)
 87 |                         if len(cycle) >= min_size:
 88 |                             generated_repeats.append('\t'.join([cycle, repeat, str(len(cycle)), strand]))
 89 |                 if repeat_revcomp == repeat:
 90 |                     continue
 91 |                 repeat_cycles = get_cycles(repeat_revcomp)
 92 |                 for cycle in repeat_cycles:
 93 |                     strand = '-'
 94 |                     string = expand_repeat(cycle, max_size)
 95 |                     expanded_set.add(string)
 96 |                     if cycle not in repeat_set:
 97 |                         repeat_set.add(cycle)
 98 |                         if len(cycle) >= min_size:
 99 |                             generated_repeats.append('\t'.join([cycle, repeat, str(len(cycle)), strand]))
100 |     return generated_repeats
101 | 
102 | 
def build_rep_set(repeat_file, length_cutoff=None, unit_cutoff=None):
    """
        Outputs the repeats info dictionary used by the get_ssrs function.
        Takes list of repeat motifs from repeats file(output by generate_repeats function) as input.
        Each input line holds four tab-separated fields: motif, class, motif length, strand.
        Blank lines are ignored.

        Creates a dictionary with the expanded repeat as the key and a dictionary with the
        keys 'class', 'motif_length' and 'strand' as the value. The extra key 'cutoff' holds
        the list of window lengths to scan with:
          - length_cutoff: every motif is expanded to length_cutoff; 'cutoff' is [length_cutoff].
          - unit_cutoff:   mapping of motif length -> minimum number of units; every motif is
                           repeated that many times; 'cutoff' is the sorted list of the
                           resulting lengths.
        length_cutoff takes precedence when both are given. When neither is given an empty
        dictionary is returned and repeat_file is not read.
    """
    repeats_out = dict()
    if length_cutoff is None and unit_cutoff is None:
        return repeats_out

    by_length = length_cutoff is not None
    cutoffs = set()
    for line in repeat_file:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line in a hand-written repeats file
            continue
        L = line.split('\t')
        motif_length = int(L[2])
        if by_length:
            expanded = expand_repeat(L[0], length_cutoff)
        else:
            expanded = L[0]*unit_cutoff[motif_length]
            cutoffs.add(len(expanded))
        repeats_out[expanded] = {
            'class': L[1],
            'motif_length': motif_length,
            'strand': L[3],
        }

    repeats_out['cutoff'] = [length_cutoff] if by_length else sorted(cutoffs)
    return repeats_out
143 | 
144 | 
145 | 
def get_ssrs(seq_record, repeats_info, out):
    """Native function that identifies repeats in fasta files.

    seq_record:   object exposing ``.seq`` (converted with str() and upper-cased)
                  and ``.id`` (written as the first output column).
    repeats_info: dictionary as produced by build_rep_set: expanded motif ->
                  {'class', 'motif_length', 'strand'}, plus the key 'cutoff'
                  holding the list of window lengths to scan with.
    out:          either a file name (opened for writing and closed here, even
                  if the scan fails) or an already open writable file object
                  (left open).

    For every window length the sequence is scanned left to right. When a
    window matches a key of repeats_info the match is extended base by base for
    as long as the sequence keeps following the motif, and one tab-separated
    line is written: id, start, stop, class, length, strand, number of complete
    units, first motif_length bases of the match.
    """
    opened_here = isinstance(out, str)
    out_file = open(out, 'w') if opened_here else out
    try:
        length_cutoffs = repeats_info['cutoff']
        input_seq = str(seq_record.seq).upper()
        input_seq_length = len(input_seq)
        for length_cutoff in length_cutoffs:
            # After a hit the scan resumes length_cutoff - 1 bases before the
            # end of the repeat, so a different repeat overlapping its tail is
            # still found.
            fallback = length_cutoff - 1
            sub_start = 0  # substring start
            sub_stop = sub_start + length_cutoff  # substring stop
            while sub_stop <= input_seq_length:
                sub_stop = sub_start + length_cutoff
                sub_seq = input_seq[sub_start:sub_stop]
                if sub_seq not in repeats_info:
                    sub_start += 1
                    continue

                repeat_data = repeats_info[sub_seq]
                motif_length = repeat_data['motif_length']
                rep_class = repeat_data['class']
                strand = repeat_data['strand']
                # The window may end inside a motif copy; start comparing from
                # the motif phase that follows the last base of the window.
                # NOTE(review): this presumes motif_length <= length_cutoff.
                offset = length_cutoff % motif_length
                repeat_seq = input_seq[sub_start+offset:sub_start+offset+motif_length]

                # Extend the match until the sequence ends or deviates.
                i = 0
                while sub_stop < input_seq_length and input_seq[sub_stop] == repeat_seq[i]:
                    sub_stop += 1
                    i += 1
                    if i >= motif_length:
                        i = 0

                match_length = sub_stop - sub_start
                num_units = match_length // motif_length
                print(seq_record.id, sub_start, sub_stop, rep_class, match_length, strand, num_units, sub_seq[:motif_length], sep="\t", file=out_file)
                sub_start = sub_stop - fallback
    finally:
        if opened_here:
            out_file.close()
194 |     
195 | 
def _tally_bases(record, base_counts):
    """Add the counts of the upper-cased bases of ``record.seq`` to ``base_counts`` in place."""
    for base in record.seq.upper():
        try:
            base_counts[base] += 1
        except KeyError:
            base_counts[base] = 1


def fasta_ssrs(args, repeats_info):
    """Identify repeats in every eligible sequence of the fasta file ``args.input``.

    A record is scanned with get_ssrs when its length lies within
    [args.min_seq_length, args.max_seq_length] and its id is in the set returned
    by get_targetids. Results are written to the open file object args.output,
    which is closed before returning.

    With args.threads > 1 each eligible record is scanned in a worker process
    that writes to its own ./temp_<n>.tsv file; these files are then appended to
    args.output in record order and deleted. With args.threads == 1 records are
    scanned sequentially, writing straight to args.output.

    When args.info or args.analyse is set, base counts of all records are
    collected and a '#'-prefixed summary is appended to the output. Afterwards
    annotate(args) runs if args.annotate is given and analyse_fasta(args) runs
    if args.analyse is set.
    """
    if args.input.endswith('gz'):
        handle = gzip.open(args.input, 'rt')
    else:
        handle = open(args.input, 'r')

    seq_nucleotide_info = dict()
    num_records = rawcharCount(args.input, '>')
    target_ids = get_targetids(args.filter_seq_ids, args.target_seq_ids)
    collect_info = (args.info or args.analyse)==True

    try:
        records = SeqIO.parse(handle, 'fasta')

        if args.threads > 1:
            # Indices of the records that were actually handed to a worker.
            # Only these have a temp file; records rejected by the filters
            # never produce one, so they must not be opened below.
            submitted = []
            pool = multi.Pool(processes=args.threads)
            for i, record in enumerate(records):
                if collect_info:
                    _tally_bases(record, seq_nucleotide_info)
                if  args.min_seq_length <= len(record.seq) <= args.max_seq_length and record.id in target_ids:
                    out_name = './temp_%s.tsv' %(i)
                    pool.apply_async(get_ssrs, (record, repeats_info, out_name,))
                    submitted.append(i)

            pool.close()
            pool.join()

            # Concat all the output files into one.
            temp_outs = tqdm(submitted, total=len(submitted))
            for o in temp_outs:
                name = './temp_%s.tsv' %(o)
                temp_outs.set_description("Concatenating file: %d " %(o))
                with open(name, 'r') as fh:
                    for line in fh:
                        print(line.strip(), file=args.output)
                del_file(name)

        elif args.threads == 1:
            records = tqdm(records, total=num_records)
            for record in records:
                records.set_description("Processing %s" %(record.id))
                if collect_info:
                    _tally_bases(record, seq_nucleotide_info)
                if  args.min_seq_length <= len(record.seq) <= args.max_seq_length and record.id in target_ids:
                    get_ssrs(record, repeats_info, args.output)
    finally:
        # The input is fully consumed (or the run failed): release the handle.
        handle.close()

    if collect_info:
        line = "#File_name: %s\n#Total_sequences: %d\n#Total_bases: %d\n#GC: %f"\
        %(os.path.basename(args.input), num_records, sum(seq_nucleotide_info.values()),\
        round(getGC(seq_nucleotide_info), 2))
        print(line, file=args.output)
    args.output.close()

    if args.annotate is not None:
        annotate(args)

    # Specifies to generate a HTML report
    if args.analyse:
        analyse_fasta(args)


--------------------------------------------------------------------------------
/PERF/utils.py:
--------------------------------------------------------------------------------
  1 | #! /usr/bin/env python
  2 | # pylint: disable=C0111, C0301
  3 | 
  4 | from __future__ import print_function, division
  5 | import sys, gzip
  6 | from itertools import takewhile, repeat, islice
  7 | from tqdm import tqdm
  8 | from collections import Counter, defaultdict
  9 | 
 10 | 
 11 | kmers = { 
 12 |     1: 'Monomer', 2: 'Dimer', 3: 'Trimer', 4: 'Tetramer', 5: 'Pentamer',
 13 |     6: 'Hexamer', 7: 'Heptamer', 8: 'Octamer', 9: 'Nonamer', 10: 'Decamer',
 14 |     11: 'Undecamer', 12: 'Dodecamer', 13: 'Tridecamer', 14: 'Tetradecamer', 15: 'Pentadecamer',
 15 |     16: 'Hexadecamer', 17: 'Heptadecamer', 18: 'Octadecamer', 19: 'Nonadecamer', 20: 'Icosamer',
 16 |     21: 'Uncosamer', 22: 'Docosamer', 23: 'Tricosamer', 24: 'Tetracosamer', 25: 'Pentacosamer',
 17 |     26: 'Hexacosamer', 27: 'Heptacosamer', 28: 'Octacosamer', 29: 'Nonacosamer', 30: 'Triacontamer',
 18 |     31: 'Untriacontamer', 32: 'Dotriacontamer', 33: 'Tritriacontamer', 34: 'Tetratriacontamer', 35: 'Pentatriacontamer',
 19 |     36: 'Hexatriacontamer', 37: 'Heptatriacontamer', 38: 'Octatriacontamer', 39: 'Nonatriacontamer', 40: 'Tetracontamer',
 20 |     41: 'Untetracontamer', 42: 'Dotetracontamer', 43: 'Tritetracontamer', 44: 'Tetratetracontamer', 45: 'Pentatetracontamer',
 21 |     46: 'Hexatetracontamer', 47: 'Heptatetracontamer', 48: 'Octatetracontamer', 49: 'Nonatetracontamer', 50: 'Pentacontamer',
 22 | }
 23 | 
 24 | 
 25 | def get_cycles(string):
 26 |     cycles = set()
 27 |     for i in range(len(string)):
 28 |         cycles.add(string[i:] + string[:i])
 29 |     cycles = sorted(list(cycles))
 30 |     return cycles
 31 | 
 32 | 
 33 | def build_cycVariations(string):
 34 |     cycles = get_cycles(string)
 35 |     rev_cycles = get_cycles(rev_comp(string))
 36 |     for r in rev_cycles:
 37 |         if r not in cycles: cycles.append(r)
 38 |     return cycles
 39 | 
 40 | 
 41 | def getGC(basesCounter):
 42 |     totalBases = sum(basesCounter.values())
 43 |     try:
 44 |         GC = (float(basesCounter['G'] + basesCounter['C'])/(totalBases-basesCounter['N']))*100
 45 |     except KeyError:
 46 |         GC = (float(basesCounter['G'] + basesCounter['C'])/totalBases)*100
 47 |     return GC
 48 | 
 49 | 
 50 | def rev_comp(string):
 51 |     """Outputs reverse complement of a nucleotide sequence"""
 52 |     if sys.version_info.major == 2:
 53 |         import string as st
 54 |         complement = string.translate(st.maketrans('ACGT', 'TGCA'))
 55 |     else:
 56 |         complement = string.translate(str.maketrans('ACGT', 'TGCA'))
 57 |     return complement[::-1]
 58 | 
 59 | 
 60 | def rawcharCount(filename, char):
 61 |     if filename.endswith('gz'):
 62 |         f = gzip.open(filename, 'rb')
 63 |     else:
 64 |         f = open(filename, 'rb')
 65 |     bufgen = takewhile(lambda x: x, (f.read(1024*1024) for _ in repeat(None)))
 66 |     return sum( buf.count(char.encode('ASCII')) for buf in bufgen if buf )
 67 | 
 68 | 
 69 | def get_targetids(filter_seq_ids, target_seq_ids):
 70 |     """
 71 |         The function returns the set of desired sequence ids 
 72 |         across which repeats will be identified.
 73 |     """
 74 |     target_ids = univset()
 75 |     if filter_seq_ids:
 76 |         target_ids = univset()
 77 |         filter_ids = []
 78 |         with open(filter_seq_ids) as fh:
 79 |             for line in fh:
 80 |                 line = line.strip()
 81 |                 line = line.lstrip('>')
 82 |                 line = line.split(' ')[0]
 83 |                 filter_ids.append(line)
 84 |         target_ids = target_ids - set(filter_ids)
 85 |     
 86 |     elif target_seq_ids:
 87 |         target_ids = []
 88 |         with open(target_seq_ids) as fh:
 89 |             for line in fh:
 90 |                 line = line.strip()
 91 |                 line = line.lstrip('>')
 92 |                 line = line.split(' ')[0]
 93 |                 target_ids.append(line)
 94 |         target_ids = set(target_ids)
 95 | 
 96 |     return target_ids
 97 | 
 98 | 
 99 | class univset(object):
100 |     def __init__(self):
101 |         self._diff = set()
102 |  
103 |     def __sub__(self, other):
104 |         S = univset()
105 |         if type(other) == set:
106 |             S._diff = self._diff | other
107 |             return S
108 |         else:
109 |             S._diff = self._diff | other._diff
110 |             return S
111 |  
112 |     def __rsub__(self, other):
113 |         return other &amp; self._diff
114 |  
115 |     def __contains__(self, obj):
116 |         return not obj in self._diff
117 |  
118 |     def __and__(self, other):
119 |         return other - self._diff
120 |  
121 |     def __rand__(self, other):
122 |         return other - self._diff
123 |  
124 |     def __repr__(self):
125 |         if self._diff == set():
126 |             return "ANY"
127 |         else:
128 |             return "ANY - %s"%self._diff
129 |  
130 |     def __or__(self, other):
131 |         S = univset()
132 |         S._diff = self._diff - other
133 |         return S
134 |  
135 |     def __xor__(self, other):
136 |         return (self - other) | (other - self)
137 |  
138 |     def add(self, elem):
139 |         if elem in self._diff:
140 |             self._diff.remove(elem)
141 |  
142 |     def update(self, elem):
143 |         self._diff = self._diff - other
144 |  
145 |     def __ror__(self, other):
146 |         return self.__or__(other)
147 |  
148 |     def union(self, other):
149 |         return self.__or__(other)
150 |  
151 |     def difference(self, other):
152 |         return self.__sub__(other)
153 |  
154 |     def intersection(self, other):
155 |         return self.__and__(other)
156 |  
157 |     def symmetric_difference(self, other):
158 |         return self.__xor__(other)
159 |  
160 |     def __lt__(self, other):
161 |         return self.issubset(other)
162 |  
163 |     def __eq__(self, other):
164 |         if type(other) == set:
165 |             return False
166 |         try:
167 |             return self._diff == other._diff
168 |         except AttributeError:
169 |             return False
170 |  
171 |     def __ne__(self, other):
172 |         return not self.__eq__(other)
173 |  
174 |     def __le__(self, other):
175 |         return self.__lt__(other) or self.__eq__(other)
176 |  
177 |     def __gt__(self, other):
178 |         return self.issuperset(other)
179 |  
180 |     def __gt__(self, other):
181 |         return self.issuperset(other) or self == other
182 | 
183 | 
class dotDict(dict):
    """
    Dictionary whose items can also be read, written and deleted as attributes.
    Reading an attribute that is not a key gives None instead of raising.

    Example:
    m = dotDict({'first_name': 'Eduardo'}, last_name='Pool', age=24, sports=['Soccer'])
    """
    def __init__(self, *args, **kwargs):
        super(dotDict, self).__init__(*args, **kwargs)
        # Re-assign every initial item through __setitem__ so that it is
        # mirrored into the instance __dict__ as well.
        initial_mappings = [arg for arg in args if isinstance(arg, dict)]
        initial_mappings.append(kwargs)
        for mapping in initial_mappings:
            for name, value in mapping.items():
                self[name] = value

    def __getattr__(self, attr):
        # only reached when normal attribute lookup fails
        return self.get(attr)

    def __setattr__(self, key, value):
        self[key] = value

    def __setitem__(self, key, value):
        super(dotDict, self).__setitem__(key, value)
        self.__dict__[key] = value

    def __delattr__(self, item):
        del self[item]

    def __delitem__(self, key):
        super(dotDict, self).__delitem__(key)
        del self.__dict__[key]


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # PERF
  2 | [![Build](https://img.shields.io/badge/Build-passing-brightgreen.svg)]()
  3 | [![PyPI](https://img.shields.io/badge/PyPI-v0.4.6-blue.svg)]()
  4 | [![License](https://img.shields.io/badge/Licence-MIT-blue.svg)]()
  5 | ## Introduction
  6 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness, accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems.
  7 | 
  8 | PERF is a recursive acronym that stands for "PERF is an Exhaustive Repeat Finder". It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are:
  9 |   - Fast run time. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. The speed can be further improved ~3 to 4 fold using [PyPy](https://pypy.org/) (human genome finishes in less than 2 minutes using PyPy v5.8.0)
 10 |   - Linear time and space complexity (O(n))
 11 |   - Identifies perfect SSRs
 12 |   - 100% accurate and comprehensive - Does not miss any repeats and does not pick any incorrect ones
 13 |   - Easy to use - The only required argument is the input DNA sequence in FASTA format
 14 |   - Flexible - Most of the parameters are customizable by the user at runtime
 15 |   - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units
 16 |   - TSV output and HTML report. The default output is an easily parseable and exportable tab-separated format. Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables
 17 | 
 18 | ## Change log 
 19 | 
 20 | ## [0.4.6] - 2021-04-22
 21 | ### Fixes
 22 |  - Fixed usage of unit options file input for fastq input.
 23 |  - Fixed usage of repeats input file.
 24 | 
 25 | ## [0.4.5] - 2020-05-07
 26 | ### Added
 27 |  - Annotation of repeats w.r.t. genomic context using a GFF or GTF file. (option -g).
 28 |  - Multi-threading. Parallel identification of repeats in different sequences.
 29 |  - Identification of perfect repeats in fastq files.
 30 |  - Analysis report for repeats in fastq files.
 31 |  - Option to identify atomic repeats.
 32 | 
 33 | ### Changed
 34 |  - Analysis report rebuilt with Semantic ui and Apex Charts.
 35 |  - Visualisation of repeat annotation data in analysis report.
 36 | 
 37 | ### Fixes 
 38 |  - Python2 compatibility fixed.
 39 |  - Bug fixes for PyPI compatibility.
 40 |  - Import error issues.
 41 | 
 42 | ## Installation
 43 | PERF can be directly installed using pip with the package name `perf_ssr`. 
 44 | ```bash
 45 | $ pip install perf_ssr
 46 | ```
 47 | 
 48 | This name was chosen for the package so as not to clash with the existing `perf` package.
 49 | 
 50 | Alternatively, it can be installed from the source code:
 51 | ```bash
 52 | # Download the git repo
 53 | $ git clone https://github.com/RKMlab/perf.git
 54 | 
 55 | # Install
 56 | $ cd perf
 57 | $ python setup.py install
 58 | ```
 59 | Both of the methods add a console command `PERF`, which can be executed from any directory. It can also be used without installation by running the `core.py` file in the `PERF` subfolder:
 60 | 
 61 | ```bash
 62 | $ git clone https://github.com/RKMlab/perf.git
 63 | $ cd perf/PERF
 64 | $ python core.py -h # Print the help message of PERF (see below)
 65 | ```
 66 | 
 67 | ## Usage
 68 | The help message and available options can be accessed using
 69 | ```bash
 70 | $ PERF -h # Short option
 71 | $ PERF --help # Long option
 72 | ```
 73 | which gives the following output
 74 | ```
 75 | usage: core.py [-h] -i <FILE> [-o <FILE>] [--format <STR>] [--version]
 76 |                [-rep <FILE>] [-m <INT>] [-M <INT>] [-s <INT>] [-S <FLOAT>]
 77 |                [--include-atomic] [-l <INT> | -u INT or FILE] [-a] [--info]
 78 |                [-g <FILE>] [--anno-format <STR>] [--gene-key <STR>]
 79 |                [--up-promoter <INT>] [--down-promoter <INT>]
 80 |                [-f <FILE> | -F <FILE>] [-t <INT>]
 81 | 
 82 | Required arguments:
 83 |   -i <FILE>, --input <FILE>
 84 |                         Input sequence file.
 85 | 
 86 | Optional arguments:
 87 |   -o <FILE>, --output <FILE>
 88 |                         Output file name. Default: Input file name + _perf.tsv
 89 |   --format <STR>        Input file format. Default: fasta, Permissible: fasta,
 90 |                         fastq
 91 |   --version             show program's version number and exit
 92 |   -rep <FILE>, --repeats <FILE>
 93 |                         File with list of repeats (Not allowed with -m and/or
 94 |                         -M)
 95 |   -m <INT>, --min-motif-size <INT>
 96 |                         Minimum size of a repeat motif in bp (Not allowed with
 97 |                         -rep)
 98 |   -M <INT>, --max-motif-size <INT>
 99 |                         Maximum size of a repeat motif in bp (Not allowed with
100 |                         -rep)
101 |   -s <INT>, --min-seq-length <INT>
102 |                         Minimum size of sequence length for consideration (in
103 |                         bp)
104 |   -S <FLOAT>, --max-seq-length <FLOAT>
105 |                         Maximum size of sequence length for consideration (in
106 |                         bp)
107 |   --include-atomic      An option to include factor atomic repeats for minimum
108 |                         motif sizes greater than 1.
109 |   -l <INT>, --min-length <INT>
110 |                         Minimum length cutoff of repeat
111 |   -u INT or FILE, --min-units INT or FILE
112 |                         Minimum number of repeating units to be considered.
113 |                         Can be an integer or a file specifying cutoffs for
114 |                         different motif sizes.
115 |   -a, --analyse         Generate a summary HTML report.
116 |   --info                Sequence file info recorded in the output.
117 |   -f <FILE>, --filter-seq-ids <FILE>
118 |                         List of sequence ids in fasta file which will be
119 |                         ignored.
120 |   -F <FILE>, --target-seq-ids <FILE>
121 |                         List of sequence ids in fasta file which will be used.
122 |   -t <INT>, --threads <INT>
123 |                         Number of threads to run the process on. Default is 1.
124 | 
125 | Annotation arguments:
126 |   -g <FILE>, --annotate <FILE>
127 |                         Genic annotation input file for annotation, Both GFF
128 |                         and GTF can be processed. Use --anno-format to specify
129 |                         format.
130 |   --anno-format <STR>   Format of genic annotation file. Valid inputs: GFF,
131 |                         GTF. Default: GFF
132 |   --gene-key <STR>      Attribute key for geneId. The default identifier is
133 |                         "gene". Please check the annotation file and pick a
134 |                         robust gene identifier from the attribute column.
135 |   --up-promoter <INT>   Upstream distance(bp) from TSS to be considered as
136 |                         promoter region. Default 1000
137 |   --down-promoter <INT>
138 |                         Downstream distance(bp) from TSS to be considered as
139 |                         promoter region. Default 1000
140 | ```
141 | The details of each option are given below:
142 | 
143 | ### `-i or --input`
144 | **Expects:** *FILE*<br>
145 | **Default:** *None*<br>
146 | This is the only required argument for the program. The input file must be a valid FASTA/FASTQ file. PERF uses [Biopython's](http://biopython.org/wiki/SeqIO) FASTA parser to read the input fasta files. It accepts both single-line and multi-line sequences. Files with multiple sequences are also valid. To see more details about the FASTA format, see [this page](http://bioperl.org/formats/sequence_formats/FASTA_sequence_format).
147 | 
148 | ### `-o or --output`
149 | **Expects:** *STRING (to be used as filename)*<br>
150 | **Default:** *Input Filename + _perf.tsv (see below)*<br>
151 | If this option is not provided, the default output filename will be the same as the input filename, with its extension replaced with '_perf.tsv'. For example, if the input filename is `my_seq.fa`, the default output filename will be `my_seq_perf.tsv`. If the input filename does not have any extension, `_perf.tsv` will be appended to the filename. Please note that even in the case of no identified SSRs, the output file is still created (therefore overwriting any previous file of the same name) but with no content in the file.
152 | #### Output for fasta
153 | The output is a tab-delimited file, with one SSR record per line. 
154 | The output columns follow the [BED](https://genome.ucsc.edu/FAQ/FAQformat.html) format. The details of the columns are given below:
155 | 
156 | | S.No | Column | Description |
157 | |:----:| ------ | ----------- |
158 | | 1 | Chromosome | Chromosome or Sequence Name as specified by the first word in the FASTA header |
159 | | 2 | Repeat Start | 0-based start position of SSR in the Chromosome |
160 | | 3 | Repeat Stop | End position of SSR in the Chromosome |
161 | | 4 | Repeat Class | Class of repeat as grouped by their cyclical variations |
162 | | 5 | Repeat Length | Total length of identified repeat in nt |
163 | | 6 | Repeat Strand | Strand of SSR based on their cyclical variation |
164 | | 7 | Motif Number | Number of times the base motif is repeated |
165 | | 8 | Actual Repeat | Starting sequence of the SSR irrespective of Repeat class and strand|
166 | 
167 | An example output showing some of the largest repeats from *Drosophila melanogaster* is given below
168 | ```
169 | X       22012826  22014795  ACTGGG  1969    -       328     TCCCAG
170 | 2RHet   591337    591966    AATACT  629     -       104     ATTAGT
171 | 4       1042143   1042690   AAATAT  547     +       91      AAATAT
172 | 2RHet   598244    598789    AATACT  545     -       90      AGTATT
173 | XHet    122       663       AGAT    541     +       135     GATA
174 | X       22422335  22422827  AGAT    492     +       123     GATA
175 | 3R      975265    975710    AAAT    445     -       111     TTAT
176 | X       15442288  15442724  ACAGAT  436     +       72      ACAGAT
177 | 2L      22086818  22087152  AATACT  334     -       55      TATTAG
178 | YHet    137144    137466    AAGAC   322     -       64      CTTGT
179 | ```
180 | 
181 | #### Output for fastq
182 | The output is a tab-delimited file, with data on each repeat class per line.
183 | | S.No | Column | Description |
184 | |:----:| ------ | ----------- |
185 | | 1 | Repeat Class | Class of repeat as grouped by their cyclical variations |
186 | | 2 | Number of reads | Number of reads having an instance of the repeat |
187 | | 3 | Frequency | Total number of instances of the repeat  |
188 | | 4 | Bases | Total number of bases covered by the repeat |
189 | | 5 | Repeat reads per million reads | Number of reads having an instance of the repeat, per million reads |
190 | | 6 | Instances per million reads | Number of instances of the repeat, per million reads |
191 | | 7 | Repeat Bases per MB of sequence | Number of bases covered by the repeat, per MB of sequence |
192 | | 8 | Length distribution | Distribution of the instances of the repeat across repeat lengths |
193 | | 9 | Motif distribution | Distribution of the instances of the repeat across the actual motifs of the repeat class |
194 | 
195 | 
196 | ### `--format`
197 | **Expects:** *STRING (specifying format of the file)*<br>
198 | **Default:** *fasta*<br>
199 | PERF was originally developed to identify repeats in FASTA files. From version 0.4.5 onwards, PERF can identify repeats in FASTQ sequence files as well. The default format the program expects is fasta. Specify the input format as 'fasta' for FASTA files and 'fastq' for FASTQ files.
200 | 
201 | ### `-a or --analyse`
202 | **Expects:** *None*<br>
203 | **Default:** *False*<br>
204 | In addition to the default tab-separated output, PERF can also generate a fully interactive HTML report for easy downstream analysis of the repeat data. The filename will be the same prefix as that of the main output. For example, if the input filename was `my_seq.fa`, the analysis report will be  `my_seq_perf.html`. An example HTML report, generated from the repeat data of *Homo sapiens* (build hg19), can be accessed [here](https://raw.githubusercontent.com/RKMlab/perf/html-report/test_data/Human_hg19_perf.html) (Right click -> Save As).
205 | 
206 | ### `-l or --min-length`
207 | **Expects:** *INTEGER*<br>
208 | **Default:** *12*<br>
209 | Minimum length cut-off to be considered when finding an SSR. The same cut-off will apply for SSRs of all motif lengths, even if the motif length is not a divisor of this value. In such cases, SSRs that end with a partial motif are also picked if they pass the length cut-off.
210 | 
211 | ### `-u or --min-units`
212 | **Expects:** *INTEGER* OR *FILE*<br>
213 | **Default:** *None*<br>
214 | This option finds SSRs with a minimum number of repeating motifs. The argument accepts either an integer or a file. If an integer is specified, the same value is used for all motif lengths. Alternatively, a specific value can be set for each motif length using a two-column tab-separated file, as demonstrated below:
215 | 
216 | ```bash
217 | $ cat repeat_units.txt
218 | 1	10
219 | 2	6
220 | 3	4
221 | 4	3
222 | 5	2
223 | 6	2
224 | ```
225 | 
226 | The first column specifies the motif length, and the second column specifies the minimum number of times the motif should be repeated to be considered an SSR. This file can be used to identify repeats with different number of repeating motifs: monomers repeated at least 10 times, dimers repeated at least 6 times etc., using the following command
227 | ``` bash
228 | $ PERF -i my_seq.fa -m 1 -M 6 -u repeat_units.txt
229 | ```
230 | 
231 | ### `-rep or --repeats`
232 | **Expects:** *FILE*<br>
233 | **Default:** *None*<br>
234 | PERF provides an option to limit the search to specific repeat motifs. The repeats of interest should be specified via a file containing 4 tab-separated columns, as shown below:
235 | 
236 | ```bash
237 | $ cat my_repeats.txt
238 | A   A   1   +                                                                
239 | T   A   1   -
240 | AG  AG  2   +
241 | CT  AG  2   -
242 | GA  AG  2   +
243 | TC  AG  2   -
244 | $ PERF -i my_seq.fa -rep my_repeats.txt # Find all A and AG repeats from my_seq.fa
245 | ```
246 | 
247 | **Note:** This option is not allowed when `-m` or `-M` options are used.
248 | ### `-m or --min-motif-size`
249 | **Expects:** *INTEGER*<br>
250 | **Default:** *1*<br>
251 | Minimum length of motifs to be considered. By default, PERF ignores redundant motifs. For example, a stretch of 12 A's is considered a monomer repeat of 12 A's rather than a dimer repeat of 6 AA's. However, this is only true if `-m` is set to 1. If for example, `-m` is set to 2, then stretches of 12 A's are reported as dimer AA repeats. If this behavior isn't desired, we suggest using the `-rep` option (see above) to specify the motifs that should/shouldn't be included.
252 | 
253 | ### `-M or --max-motif-size`
254 | **Expects:** *INTEGER*<br>
255 | **Default:** *6*<br>
256 | Maximum length of motifs to be considered. Setting a large value of `-M` has a non-trivial effect on both the runtime and memory usage of PERF. This is noticeable with `-M` values above 10.
257 | 
258 | ### `--include-atomic`
259 | **Expects:** *None*<br>
260 | **Default:** *False*<br>
261 | Searches for atomic repeats when set to *True*. For example, when minimum motif size is set to 2bp, PERF ignores monomer repeats. When include atomic repeats is set to *True*, PERF identifies AA, CC, GG and TT as dimer repeats.
262 | 
263 | ### `-s or --min-seq-length`
264 | **Expects:** *INTEGER*<br>
265 | **Default:** *0*<br>
266 | Minimum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are smaller than this length will be ignored.
267 | 
268 | ### `-S or --max-seq-length`
269 | **Expects:** *INTEGER*<br>
270 | **Default:** *Infinity*<br>
271 | Maximum length of the input sequence to be searched for SSRs in bp. All sequences in the input file that are larger than this length will be ignored.
272 | 
273 | ### `-f or --filter-seq-ids`
274 | **Expects:** *FILE*<br>
275 | **Default:** *None*<br>
276 | This option accepts a file with a list of sequence IDs in the input file that should be ignored. Useful for ignoring contigs, scaffold, or other poor quality sequences. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol.
277 | 
278 | ### `-F or --target-seq-ids`
279 | **Expects:** *FILE*<br>
280 | **Default:** *None*<br>
281 | This option accepts a file with a list of sequence IDs in the input file that should be analyzed. All other sequences will be ignored. Useful for analyzing specific chromosomes from a large input file. The IDs can be FASTA headers (starting with '>' symbol) or just the names without the '>' symbol.
282 | 
283 | ### `--info`
284 | **Expects:** *None*<br>
285 | **Default:** *False*<br>
286 | This option, when set to *True*, includes information about the input sequence files and repeat summary data in the output file.
287 | 
288 | ```bash
289 | $ tail -5 test_input_perf.tsv
290 | gi|514486271|gb|KE346361.1|	2667759	2667775	ATC	16	+	5	CAT
291 | #File_name: test_input.fa
292 | #Total_sequences: 2
293 | #Total_bases: 6462134
294 | #GC: 53.970000
295 | ```
296 | 
297 | ### `-g or --annotate`
298 | **Expects:** *FILE*<br>
299 | **Default:** *None*<br>
300 | Input a genomic feature file to annotate the repeats in the genomic context. PERF accepts both GFF and GTF format genomic feature files. Each repeat is annotated with respect to the closest gene and classified as Genic, Exonic, Intronic or Intergenic according to the position of the repeat. In addition, each repeat is checked for whether it falls in the promoter region of the gene. Annotation adds 7 columns to the default PERF output, which already consists of 8 columns.
301 | 
302 | | S.No | Column | Description |
303 | |:----:| ------ | ----------- |
304 | | 9 | Gene name | Name of the closest gene |
305 | | 10 | Gene Start | Start position of gene in the Chromosome |
306 | | 11 | Gene Stop | End position of gene in the Chromosome |
307 | | 12 | Strand | The strand orientation of the gene |
308 | | 13 | Genomic annotation | Annotation of the repeat w.r.t. the gene. Possible annotations are {Genic, Exonic, Intronic, Intergenic} |
309 | | 14 | Promoter annotation | Whether the repeat falls in the promoter region of the closest gene. The default promoter region is 1Kb upstream and downstream of TSS. |
310 | | 15 | Distance from TSS | Distance of the repeat from the TSS of the gene. |
311 | 
312 | ### `--anno-format`
313 | **Expects:** *STRING*<br>
314 | **Default:** *GFF*<br>
315 | Option to specify the format of the input genomic feature file. Accepted  inputs are GFF or GTF. More details about the GFF and GTF formats can be found [here](https://asia.ensembl.org/info/website/upload/gff.html).
316 | 
317 | ### `--gene-key`
318 | **Expects:** *STRING*<br>
319 | **Default:** *gene*<br>
320 | The attribute key used for the name of the gene in the GFF/GTF file. In the example GFF file below, we have the location of a gene and its mRNA and exon locations. The last column of the file specifies attributes associated with each feature, like ID, Parent, gene etc. PERF uses one of the attributes to identify the gene and also its exons. In the example below, the key "gene" can be used to identify the gene and the exons of the gene, as they have the same gene name. Please check your GFF/GTF file for a robust attribute key which can identify all genes and their corresponding exons. We are actively working on better annotation where we can identify genes and their exons based on the ID and Parent.
321 | 
322 | ```
323 | # Sample GFF
324 | NC_004354.4	RefSeq	gene	124370	126714	.	-	.	ID=gene1;Name=CG17636;gbkey=Gene;gene=CG17636;gene_biotype=protein_coding;gene_synonym=DmelCG17636,EG:23E12.1;
325 | NC_004354.4	RefSeq	mRNA	124370	126714	.	-	.	ID=rna1;Parent=gene1;Name=NM_001103384.3;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
326 | NC_004354.4	RefSeq	exon	126626	126714	.	-	.	ID=id13;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
327 | NC_004354.4	RefSeq	exon	125495	126259	.	-	.	ID=id14;Parent=rna1;gbkey=mRNA;gene=CG17636;transcript_id=NM_001103384.3
328 | ```
329 | 
330 | ### `--up-promoter`
331 | **Expects:** *INT*<br>
332 | **Default:** *1000*<br>
333 | Upstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000.
334 | 
335 | ### `--down-promoter`
336 | **Expects:** *INT*<br>
337 | **Default:** *1000*<br>
338 | Downstream distance(bp) from the TSS of the gene to be considered as promoter region. Default 1000.
339 | 
340 | ### `--version`
341 | Prints the version info of PERF.
342 | 
343 | ## Examples
344 | 
345 | The following examples assume that the file with input sequence in FASTA format is named `my_seq.fa`.
346 | 
347 | #### Basic Usage
348 | ``` bash
349 | # Find all monomer to hexamer repeats of >=12nt length
350 | $ PERF -i my_seq.fa
351 | # Specify output filename
352 | $ PERF -i my_seq.fa -o PERF_output.tsv
353 | # Specify fastq format
354 | $ PERF -i my_seq.fastq --format fastq
355 | ```
356 | 
357 | #### Generate Analysis Report
358 | ``` bash
359 | # Find all monomer to hexamer repeats of >=12nt length and generate an HTML report
360 | $ PERF -i my_seq.fa -a
361 | # Specify output filename
362 | $ PERF -i my_seq.fa -o PERF_out.tsv -a # HTML file is called PERF_out.html
363 | ```
364 | 
365 | #### Annotate Repeats
366 | ``` bash
367 | # Find all monomer to hexamer repeats of >=12nt length and annotate them
368 | $ PERF -i my_seq.fa -g my_seq.gff
369 | # Specify feature file format and set downstream promoter region to 500bp
370 | $ PERF -i my_seq.fa -g my_seq.gtf --anno-format gtf --down-promoter 500
371 | ```
372 | 
373 | #### Set Cut-off Criteria
374 | ```bash
375 | # Find all monomer to hexamer repeats of >=15nt length
376 | $ PERF -i my_seq.fa -l 15
377 | # Find SSRs with at least 6 repeating motifs (for all motif lengths)
378 | $ PERF -i my_seq.fa -u 6
379 | ```
380 | 
381 | #### Identify Specific Repeats
382 | ``` bash
383 | $ cat my_repeats.txt
384 | AG  AG  2   +
385 | CT  AG  2   -
386 | GA  AG  2   +
387 | TC  AG  2   -
388 | # Find all AG repeats and generate an HTML report
389 | $ PERF -i my_seq.fa -rep my_repeats.txt -a
390 | ```
391 | 
392 | #### Change Motif Length Cut-offs
393 | ```bash
394 | # Ignore monomer and dimer repeats, and repeats with <4 repeating units
395 | $ PERF -i my_seq.fa -m 3 -u 4
396 | # Report only tetramer repeats of >=16nt length, and generate HTML report
397 | $ PERF -i my_seq.fa -m 4 -M 4 -l 16 -a
398 | 
399 | ```
400 | 
401 | In all the above examples, the output of PERF is saved to `my_seq_perf.tsv` and the HTML report is saved to `my_seq_perf.html` unless `-o` is specified.
402 | 
403 | 
404 | ## Citation
405 | 
406 | If you find PERF useful for your research, please cite it as follows:
407 | 
408 | PERF: an exhaustive algorithm for ultra-fast and efficient identification of microsatellites from large DNA sequences<br>
409 | *Akshay Kumar Avvaru, Divya Tej Sowpati, Rakesh Kumar Mishra*<br>
410 | Bioinformatics, btx721<br>
411 | [doi](https://doi.org/10.1093/bioinformatics/btx721): 10.1093/bioinformatics/btx721
412 | 
413 | ## Contact
414 | For queries or suggestions, please contact:
415 | 
416 | Divya Tej Sowpati - <tej@ccmb.res.in><br>
417 | Akshay Kumar Avvaru - <avvaru@ccmb.res.in>
418 | 
419 | 


--------------------------------------------------------------------------------
/README.rst:
--------------------------------------------------------------------------------
 1 | PERF
 2 | ====
 3 | 
 4 | Introduction
 5 | ------------
 6 | 
 7 | PERF is a Python package developed for fast and accurate identification of microsatellites from DNA sequences. Microsatellites or Simple Sequence Repeats (SSRs) are short tandem repeats of 1-6nt motifs. They are present in all genomes, and have a wide range of uses and functional roles. The existing tools for SSR identification have one or more caveats in terms of speed, comprehensiveness, 
 8 | accuracy, ease-of-use, flexibility and memory usage. PERF was designed to address all these problems.
 9 | 
10 | PERF is a recursive acronym that stands for “PERF is an Exhaustive Repeat Finder”. It is compatible with both Python 2 (tested on Python 2.7) and 3 (tested on Python 3.5). Its key features are:
11 | 
12 |  - Fast run time, despite being a single-threaded application. As an example, identification of all SSRs from the entire human genome takes less than 7 minutes. The speed can be further improved ~3 to 4 fold using PyPy (human genome finishes in less than 2 minutes using PyPy v5.8.0)
13 |  - Linear time and space complexity (O(n))
14 |  - Identifies perfect SSRs
15 |  - 100% accurate and comprehensive
16 |  - Does not miss any repeats or pick any incorrect ones
17 |  - Easy to use
18 |  - The only required argument is the input DNA sequence in FASTA format
19 |  - Flexible
20 |  - Most of the parameters are customizable by the user at runtime
21 |  - Repeat cutoffs can be specified either in terms of the total repeat length or in terms of number of repeating units
22 |  - TSV output and HTML report
23 | 
24 | The default output is an easily parseable and exportable tab-separated format. Optionally, PERF also generates an interactive HTML report that depicts trends in repeat data as concise charts and tables.
25 | 
26 | Installation
27 | ------------
28 | 
29 | PERF can be directly installed using pip with the package name
30 | ``perf_ssr``.
31 | 
32 | .. code:: bash
33 | 
34 |     $ pip install perf_ssr
35 | 
36 | This name was chosen for the package so as not to clash with the existing ``perf`` package.
37 | 
38 | Alternatively, it can also be installed from the source code:
39 | 
40 | .. code:: bash
41 | 
42 |     # Download the git repo
43 |     $ git clone https://github.com/RKMlab/perf.git
44 | 
45 |     # Install
46 |     $ cd perf
47 |     $ python setup.py install
48 | 
49 | Both of the methods add a console command ``PERF``, which can be executed from any directory. It can also be used without installation by running the ``core.py`` file in the ``PERF`` subfolder:
50 | 
51 | .. code:: bash
52 | 
53 |     $ git clone https://github.com/RKMlab/perf.git
54 |     $ cd perf/PERF
55 |     $ python core.py -h # Print the help message of PERF (see below)
56 | 
57 | Usage
58 | -----
59 | 
60 | The help message and available options can be accessed using
61 | 
62 | .. code:: bash
63 | 
64 |     $ PERF -h # Short option
65 |     $ PERF --help # Long option
66 | 
67 | which gives the following output
68 | 
69 | ::
70 | 
71 |     usage: PERF [-h] -i <FILE> [-o <FILE>] [-a] [-l <INT> | -u INT or FILE]
72 |                 [-rep <FILE>] [-m <INT>] [-M <INT>] [--version]
73 | 
74 |     Required arguments:
75 |       -i <FILE>, --input <FILE>
76 |                             Input file in FASTA format
77 | 
78 |     Optional arguments:
79 |       -o <FILE>, --output <FILE>
80 |                             Output file name. Default: Input file name + _perf.tsv
81 |       -a, --analyse         Generate a summary HTML report.
82 |       -l <INT>, --min-length <INT>
83 |                             Minimum length cutoff of repeat
84 |       -u INT or FILE, --min-units INT or FILE
85 |                             Minimum number of repeating units to be considered.
86 |                             Can be an integer or a file specifying cutoffs for
87 |                             different motif sizes.
88 |       -rep <FILE>, --repeats <FILE>
89 |                             File with list of repeats (Not allowed with -m and/or
90 |                             -M)
91 |       -m <INT>, --min-motif-size <INT>
92 |                             Minimum size of a repeat motif in bp (Not allowed with
93 |                             -rep)
94 |       -M <INT>, --max-motif-size <INT>
95 |                             Maximum size of a repeat motif in bp (Not allowed with
96 |                             -rep)
97 |       --version             show program's version number and exit


--------------------------------------------------------------------------------
/pylint.rc:
--------------------------------------------------------------------------------
  1 | [MASTER]
  2 | 
  3 | # Specify a configuration file.
  4 | #rcfile=
  5 | 
  6 | # Python code to execute, usually for sys.path manipulation such as
  7 | # pygtk.require().
  8 | #init-hook=
  9 | 
 10 | # Add files or directories to the blacklist. They should be base names, not
 11 | # paths.
 12 | ignore=CVS
 13 | 
 14 | # Add files or directories matching the regex patterns to the blacklist. The
 15 | # regex matches against base names, not paths.
 16 | ignore-patterns=
 17 | 
 18 | # Pickle collected data for later comparisons.
 19 | persistent=yes
 20 | 
 21 | # List of plugins (as comma separated values of python modules names) to load,
 22 | # usually to register additional checkers.
 23 | load-plugins=
 24 | 
 25 | # Use multiple processes to speed up Pylint.
 26 | jobs=1
 27 | 
 28 | # Allow loading of arbitrary C extensions. Extensions are imported into the
 29 | # active Python interpreter and may run arbitrary code.
 30 | unsafe-load-any-extension=no
 31 | 
 32 | # A comma-separated list of package or module names from where C extensions may
 33 | # be loaded. Extensions are loading into the active Python interpreter and may
 34 | # run arbitrary code
 35 | extension-pkg-whitelist=
 36 | 
 37 | # Allow optimization of some AST trees. This will activate a peephole AST
 38 | # optimizer, which will apply various small optimizations. For instance, it can
 39 | # be used to obtain the result of joining multiple strings with the addition
 40 | # operator. Joining a lot of strings can lead to a maximum recursion error in
 41 | # Pylint and this flag can prevent that. It has one side effect, the resulting
 42 | # AST will be different than the one from reality. This option is deprecated
 43 | # and it will be removed in Pylint 2.0.
 44 | optimize-ast=no
 45 | 
 46 | 
 47 | [MESSAGES CONTROL]
 48 | 
 49 | # Only show warnings with the listed confidence levels. Leave empty to show
 50 | # all. Valid levels: HIGH, INFERENCE, INFERENCE_FAILURE, UNDEFINED
 51 | confidence=
 52 | 
 53 | # Enable the message, report, category or checker with the given id(s). You can
 54 | # either give multiple identifier separated by comma (,) or put this option
 55 | # multiple time (only on the command line, not in the configuration file where
 56 | # it should appear only once). See also the "--disable" option for examples.
 57 | #enable=
 58 | 
 59 | # Disable the message, report, category or checker with the given id(s). You
 60 | # can either give multiple identifiers separated by comma (,) or put this
 61 | # option multiple times (only on the command line, not in the configuration
 62 | # file where it should appear only once).You can also use "--disable=all" to
 63 | # disable everything first and then reenable specific checks. For example, if
 64 | # you want to run only the similarities checker, you can use "--disable=all
 65 | # --enable=similarities". If you want to run only the classes checker, but have
 66 | # no Warning level messages displayed, use"--disable=all --enable=classes
 67 | # --disable=W"
 68 | disable=zip-builtin-not-iterating,oct-method,indexing-exception,file-builtin,parameter-unpacking,coerce-builtin,map-builtin-not-iterating,range-builtin-not-iterating,useless-suppression,raising-string,old-ne-operator,reload-builtin,long-suffix,filter-builtin-not-iterating,using-cmp-argument,old-octal-literal,print-statement,xrange-builtin,buffer-builtin,raw_input-builtin,coerce-method,reduce-builtin,cmp-builtin,getslice-method,unpacking-in-except,import-star-module-level,delslice-method,suppressed-message,cmp-method,unichr-builtin,basestring-builtin,old-division,unicode-builtin,nonzero-method,metaclass-assignment,old-raise-syntax,input-builtin,long-builtin,dict-view-method,apply-builtin,round-builtin,setslice-method,next-method-called,intern-builtin,hex-method,dict-iter-method,execfile-builtin,backtick,no-absolute-import,standarderror-builtin,locally-disabled
 69 | 
 70 | 
 71 | [REPORTS]
 72 | 
 73 | # Set the output format. Available formats are text, parseable, colorized, msvs
 74 | # (visual studio) and html. You can also give a reporter class, eg
 75 | # mypackage.mymodule.MyReporterClass.
 76 | output-format=text
 77 | 
 78 | # Put messages in a separate file for each module / package specified on the
 79 | # command line instead of printing them on stdout. Reports (if any) will be
 80 | # written in a file name "pylint_global.[txt|html]". This option is deprecated
 81 | # and it will be removed in Pylint 2.0.
 82 | files-output=no
 83 | 
 84 | # Tells whether to display a full report or only the messages
 85 | reports=yes
 86 | 
 87 | # Python expression which should return a note less than 10 (10 is the highest
 88 | # note). You have access to the variables errors warning, statement which
 89 | # respectively contain the number of errors / warnings messages and the total
 90 | # number of statements analyzed. This is used by the global evaluation report
 91 | # (RP0004).
 92 | evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
 93 | 
 94 | # Template used to display messages. This is a python new-style format string
 95 | # used to format the message information. See doc for all details
 96 | #msg-template=
 97 | 
 98 | 
 99 | [SPELLING]
100 | 
101 | # Spelling dictionary name. Available dictionaries: none. To make it working
102 | # install python-enchant package.
103 | spelling-dict=
104 | 
105 | # List of comma separated words that should not be checked.
106 | spelling-ignore-words=
107 | 
108 | # A path to a file that contains private dictionary; one word per line.
109 | spelling-private-dict-file=
110 | 
111 | # Tells whether to store unknown words to indicated private dictionary in
112 | # --spelling-private-dict-file option instead of raising a message.
113 | spelling-store-unknown-words=no
114 | 
115 | 
116 | [TYPECHECK]
117 | 
118 | # Tells whether missing members accessed in mixin class should be ignored. A
119 | # mixin class is detected if its name ends with "mixin" (case insensitive).
120 | ignore-mixin-members=yes
121 | 
122 | # List of module names for which member attributes should not be checked
123 | # (useful for modules/projects where namespaces are manipulated during runtime
124 | # and thus existing member attributes cannot be deduced by static analysis. It
125 | # supports qualified module names, as well as Unix pattern matching.
126 | ignored-modules=
127 | 
128 | # List of class names for which member attributes should not be checked (useful
129 | # for classes with dynamically set attributes). This supports the use of
130 | # qualified names.
131 | ignored-classes=optparse.Values,thread._local,_thread._local
132 | 
133 | # List of members which are set dynamically and missed by pylint inference
134 | # system, and so shouldn't trigger E1101 when accessed. Python regular
135 | # expressions are accepted.
136 | generated-members=
137 | 
138 | # List of decorators that produce context managers, such as
139 | # contextlib.contextmanager. Add to this list to register other decorators that
140 | # produce valid context managers.
141 | contextmanager-decorators=contextlib.contextmanager
142 | 
143 | 
144 | [SIMILARITIES]
145 | 
146 | # Minimum lines number of a similarity.
147 | min-similarity-lines=4
148 | 
149 | # Ignore comments when computing similarities.
150 | ignore-comments=yes
151 | 
152 | # Ignore docstrings when computing similarities.
153 | ignore-docstrings=yes
154 | 
155 | # Ignore imports when computing similarities.
156 | ignore-imports=no
157 | 
158 | 
159 | [BASIC]
160 | 
161 | # Good variable names which should always be accepted, separated by a comma
162 | good-names=i,j,k,ex,Run,_
163 | 
164 | # Bad variable names which should always be refused, separated by a comma
165 | bad-names=foo,bar,baz,toto,tutu,tata
166 | 
167 | # Colon-delimited sets of names that determine each other's naming style when
168 | # the name regexes allow several styles.
169 | name-group=
170 | 
171 | # Include a hint for the correct naming format with invalid-name
172 | include-naming-hint=no
173 | 
174 | # List of decorators that produce properties, such as abc.abstractproperty. Add
175 | # to this list to register other decorators that produce valid properties.
176 | property-classes=abc.abstractproperty
177 | 
178 | # Regular expression matching correct class attribute names
179 | class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
180 | 
181 | # Naming hint for class attribute names
182 | class-attribute-name-hint=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
183 | 
184 | # Regular expression matching correct method names
185 | method-rgx=[a-z_][a-z0-9_]{2,30}$
186 | 
187 | # Naming hint for method names
188 | method-name-hint=[a-z_][a-z0-9_]{2,30}$
189 | 
190 | # Regular expression matching correct constant names
191 | const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
192 | 
193 | # Naming hint for constant names
194 | const-name-hint=(([A-Z_][A-Z0-9_]*)|(__.*__))$
195 | 
196 | # Regular expression matching correct argument names
197 | argument-rgx=[a-z_][a-z0-9_]{2,30}$
198 | 
199 | # Naming hint for argument names
200 | argument-name-hint=[a-z_][a-z0-9_]{2,30}$
201 | 
202 | # Regular expression matching correct variable names
203 | variable-rgx=[a-z_][a-z0-9_]{2,30}$
204 | 
205 | # Naming hint for variable names
206 | variable-name-hint=[a-z_][a-z0-9_]{2,30}$
207 | 
208 | # Regular expression matching correct module names
209 | module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
210 | 
211 | # Naming hint for module names
212 | module-name-hint=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
213 | 
214 | # Regular expression matching correct class names
215 | class-rgx=[A-Z_][a-zA-Z0-9]+$
216 | 
217 | # Naming hint for class names
218 | class-name-hint=[A-Z_][a-zA-Z0-9]+$
219 | 
220 | # Regular expression matching correct function names
221 | function-rgx=[a-z_][a-z0-9_]{2,30}$
222 | 
223 | # Naming hint for function names
224 | function-name-hint=[a-z_][a-z0-9_]{2,30}$
225 | 
226 | # Regular expression matching correct attribute names
227 | attr-rgx=[a-z_][a-z0-9_]{2,30}$
228 | 
229 | # Naming hint for attribute names
230 | attr-name-hint=[a-z_][a-z0-9_]{2,30}$
231 | 
232 | # Regular expression matching correct inline iteration names
233 | inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
234 | 
235 | # Naming hint for inline iteration names
236 | inlinevar-name-hint=[A-Za-z_][A-Za-z0-9_]*$
237 | 
238 | # Regular expression which should only match function or class names that do
239 | # not require a docstring.
240 | no-docstring-rgx=^_
241 | 
242 | # Minimum line length for functions/classes that require docstrings, shorter
243 | # ones are exempt.
244 | docstring-min-length=-1
245 | 
246 | 
247 | [ELIF]
248 | 
249 | # Maximum number of nested blocks for function / method body
250 | max-nested-blocks=5
251 | 
252 | 
253 | [VARIABLES]
254 | 
255 | # Tells whether we should check for unused import in __init__ files.
256 | init-import=no
257 | 
258 | # A regular expression matching the name of dummy variables (i.e. expectedly
259 | # not used).
260 | dummy-variables-rgx=(_+[a-zA-Z0-9]*?$)|dummy
261 | 
262 | # List of additional names supposed to be defined in builtins. Remember that
263 | # you should avoid to define new builtins when possible.
264 | additional-builtins=
265 | 
266 | # List of strings which can identify a callback function by name. A callback
267 | # name must start or end with one of those strings.
268 | callbacks=cb_,_cb
269 | 
270 | # List of qualified module names which can have objects that can redefine
271 | # builtins.
272 | redefining-builtins-modules=six.moves,future.builtins
273 | 
274 | 
275 | [LOGGING]
276 | 
277 | # Logging modules to check that the string format arguments are in logging
278 | # function parameter format
279 | logging-modules=logging
280 | 
281 | 
282 | [FORMAT]
283 | 
284 | # Maximum number of characters on a single line.
285 | max-line-length=100
286 | 
287 | # Regexp for a line that is allowed to be longer than the limit.
288 | ignore-long-lines=^\s*(# )?<?https?://\S+>?$
289 | 
290 | # Allow the body of an if to be on the same line as the test if there is no
291 | # else.
292 | single-line-if-stmt=no
293 | 
294 | # List of optional constructs for which whitespace checking is disabled. `dict-
295 | # separator` is used to allow tabulation in dicts, etc.: {1  : 1,\n222: 2}.
296 | # `trailing-comma` allows a space between comma and closing bracket: (a, ).
297 | # `empty-line` allows space-only lines.
298 | no-space-check=trailing-comma,dict-separator
299 | 
300 | # Maximum number of lines in a module
301 | max-module-lines=1000
302 | 
303 | # String used as indentation unit. This is usually "    " (4 spaces) or "\t" (1
304 | # tab).
305 | indent-string='    '
306 | 
307 | # Number of spaces of indent required inside a hanging  or continued line.
308 | indent-after-paren=4
309 | 
310 | # Expected format of line ending, e.g. empty (any line ending), LF or CRLF.
311 | expected-line-ending-format=
312 | 
313 | 
314 | [MISCELLANEOUS]
315 | 
316 | # List of note tags to take in consideration, separated by a comma.
317 | notes=FIXME,XXX,TODO
318 | 
319 | 
320 | [CLASSES]
321 | 
322 | # List of method names used to declare (i.e. assign) instance attributes.
323 | defining-attr-methods=__init__,__new__,setUp
324 | 
325 | # List of valid names for the first argument in a class method.
326 | valid-classmethod-first-arg=cls
327 | 
328 | # List of valid names for the first argument in a metaclass class method.
329 | valid-metaclass-classmethod-first-arg=mcs
330 | 
331 | # List of member names, which should be excluded from the protected access
332 | # warning.
333 | exclude-protected=_asdict,_fields,_replace,_source,_make
334 | 
335 | 
336 | [DESIGN]
337 | 
338 | # Maximum number of arguments for function / method
339 | max-args=5
340 | 
341 | # Argument names that match this expression will be ignored. Default to name
342 | # with leading underscore
343 | ignored-argument-names=_.*
344 | 
345 | # Maximum number of locals for function / method body
346 | max-locals=15
347 | 
348 | # Maximum number of return / yield for function / method body
349 | max-returns=6
350 | 
351 | # Maximum number of branch for function / method body
352 | max-branches=12
353 | 
354 | # Maximum number of statements in function / method body
355 | max-statements=50
356 | 
357 | # Maximum number of parents for a class (see R0901).
358 | max-parents=7
359 | 
360 | # Maximum number of attributes for a class (see R0902).
361 | max-attributes=7
362 | 
363 | # Minimum number of public methods for a class (see R0903).
364 | min-public-methods=2
365 | 
366 | # Maximum number of public methods for a class (see R0904).
367 | max-public-methods=20
368 | 
369 | # Maximum number of boolean expressions in a if statement
370 | max-bool-expr=5
371 | 
372 | 
373 | [IMPORTS]
374 | 
375 | # Deprecated modules which should not be used, separated by a comma
376 | deprecated-modules=optparse
377 | 
378 | # Create a graph of every (i.e. internal and external) dependencies in the
379 | # given file (report RP0402 must not be disabled)
380 | import-graph=
381 | 
382 | # Create a graph of external dependencies in the given file (report RP0402 must
383 | # not be disabled)
384 | ext-import-graph=
385 | 
386 | # Create a graph of internal dependencies in the given file (report RP0402 must
387 | # not be disabled)
388 | int-import-graph=
389 | 
390 | # Force import order to recognize a module as part of the standard
391 | # compatibility libraries.
392 | known-standard-library=
393 | 
394 | # Force import order to recognize a module as part of a third party library.
395 | known-third-party=enchant
396 | 
397 | # Analyse import fallback blocks. This can be used to support both Python 2 and
398 | # 3 compatible code, which means that the block might have code that exists
399 | # only in one or another interpreter, leading to false positives when analysed.
400 | analyse-fallback-blocks=no
401 | 
402 | 
403 | [EXCEPTIONS]
404 | 
405 | # Exceptions that will emit a warning when being caught. Defaults to
406 | # "Exception"
407 | overgeneral-exceptions=Exception
408 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | biopython>=1.67
2 | regex>=2016.8.27
3 | tqdm>=4.8.4
4 | 


--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [metadata]
2 | description-file = README.rst


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | 
 3 | from setuptools import setup, find_packages
 4 | 
 5 | setup(
 6 |   name='perf_ssr',
 7 |   version='0.4.6',
 8 |   description='PERF is an exhaustive repeat finder',
 9 |   url='https://github.com/rkmlab/perf',
10 |   keywords='ssr microsatellites',
11 |   author='Divya Tej Sowpati',
12 |   author_email='tej@ccmb.res.in',
13 |   license='MIT',
14 |   packages=find_packages(),
15 |   install_requires=['biopython==1.69', 'tqdm>=4'], # biopython version 1.69 installs numpy
16 |   entry_points={
17 |     'console_scripts': ['PERF=PERF.core:main']
18 |   },
19 |   include_package_data=True, # change path according to package name in MANIFEST.in
20 | )


--------------------------------------------------------------------------------
/test_data/repeat_options.txt:
--------------------------------------------------------------------------------
1 | A	A	1	+
2 | T	A	1	-
3 | AG	AG	2	+
4 | CT	AG	2	-
5 | GA	AG	2	+
6 | TC	AG	2	-
7 | 


--------------------------------------------------------------------------------
/test_data/test_input.fastq.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/RKMlab/perf/e343a1fa437033afce5b5a794079230530619983/test_data/test_input.fastq.gz


--------------------------------------------------------------------------------
/test_data/unit_options.txt:
--------------------------------------------------------------------------------
1 | 1	12
2 | 2	6
3 | 3	4
4 | 4	3
5 | 5	2
6 | 6	2
7 | 


--------------------------------------------------------------------------------
/utils/repeat_generator.py:
--------------------------------------------------------------------------------
 1 | #! /usr/bin/env python
 2 | 
 3 | # pylint: disable=C0103
 4 | from __future__ import print_function
 5 | import sys
 6 | import argparse
 7 | from itertools import product
 8 | 
 9 | parser = argparse.ArgumentParser()
10 | parser.add_argument('-m', '--min-motif-size', type=int, metavar='<INT>', default=1, help='Minimum size of a repeat motif in bp')
11 | parser.add_argument('-M', '--max-motif-size', type=int, metavar='<INT>', default=6, help='Maximum size of a repeat motif in bp')
12 | parser.add_argument('-fo', '--out', type=argparse.FileType('w'), metavar='<FILE>', default=sys.stdout, help='Output file')
13 | args = parser.parse_args()
14 | 
def rev_comp(string):
    """Return the reverse complement of a DNA string.

    Only the uppercase bases A, C, G and T are complemented; every other
    character is kept as it is and simply ends up in reversed position.
    """
    pairing = str.maketrans('ACGT', 'TGCA')
    backwards = string[::-1]
    return backwards.translate(pairing)
18 | 
19 | 
def expand_repeat(string, size):
    """Repeat `string` cyclically until the result is `size` characters long.

    Characters are taken from `string` one at a time, wrapping back to its
    first character after the last one. A `size` of zero or less yields an
    empty string. An empty `string` with a positive `size` raises IndexError.
    """
    pieces = []
    position = 0
    while len(pieces) < size:
        pieces.append(string[position])
        # Wrap around to the start once the end of the motif is reached.
        position = (position + 1) % len(string)
    return ''.join(pieces)
29 | 
30 | 
def get_cycles(string):
    """Return every cyclic rotation of `string`, starting with `string` itself.

    The result has one entry per character of `string` (duplicates are kept),
    ordered by increasing left-rotation offset. An empty input gives an empty
    list.
    """
    return [string[shift:] + string[:shift] for shift in range(len(string))]
36 | 
37 | 
def generate_repeats(min_size, max_size, output_file):
    """Write all repeat motifs of length `min_size`..`max_size` to `output_file`.

    Motifs are enumerated over the alphabet A, C, G, T in `product` order.
    A motif whose expansion to `max_size` characters has already been seen is
    skipped. For every other motif, each cyclic rotation of the motif ('+'
    strand) and of its reverse complement ('-' strand, skipped when the motif
    equals its own reverse complement) is written once as a tab-separated
    line: rotation, originating motif, rotation length, strand.
    """
    alphabet = ['A', 'C', 'G', 'T']
    seen_expansions = set()
    emitted_motifs = set()
    for motif_size in range(min_size, max_size + 1):
        for letters in product(alphabet, repeat=motif_size):
            repeat = ''.join(letters)
            if expand_repeat(repeat, max_size) in seen_expansions:
                # Same sequence as an earlier motif once expanded.
                continue

            # Forward strand always; reverse strand only when it differs.
            stranded_motifs = [(repeat, '+')]
            repeat_revcomp = rev_comp(repeat)
            if repeat_revcomp != repeat:
                stranded_motifs.append((repeat_revcomp, '-'))

            for motif, strand in stranded_motifs:
                for cycle in get_cycles(motif):
                    seen_expansions.add(expand_repeat(cycle, max_size))
                    if cycle in emitted_motifs:
                        continue
                    emitted_motifs.add(cycle)
                    print(cycle, repeat, str(len(cycle)), strand, sep='\t', file=output_file)
68 | 
# Run the generator with the motif-size bounds given on the command line.
# The output option is declared as '-fo'/'--out', so argparse stores the
# opened file under `args.out`; `args.output` does not exist and raised
# AttributeError before anything was written.
min_motif_size = args.min_motif_size
max_motif_size = args.max_motif_size
generate_repeats(min_motif_size, max_motif_size, args.out)
72 | 


--------------------------------------------------------------------------------