├── .gitignore
├── LICENSE
├── Nanopore_SumStatQC_Tutorial.Rmd
├── README.md
├── RawData
│   ├── .gitattributes
│   ├── lambda_barcoding_summary.txt.bz2
│   └── lambda_sequencing_summary.txt.bz2
├── Static
│   ├── Bibliography.bib
│   ├── Images
│   │   ├── KnitIt.png
│   │   ├── ONT_logo.png
│   │   ├── ONT_logo_deprecated.png
│   │   └── sumstatEditParams.png
│   ├── TutorialPostamble.md
│   ├── TutorialPreamble.md
│   └── ont_tutorial.css
├── config.yaml
└── environment.yaml
/.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | ont_tutorial_basicqc.Rproj 6 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. 
"Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. "You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. 
For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 
115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. 
You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. 
Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. 
Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. 
Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. * 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. 
This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. 
Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 
374 | -------------------------------------------------------------------------------- /Nanopore_SumStatQC_Tutorial.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Nanopore summary statistics and basic QC tutorial" 3 | date: "Report created: `r Sys.Date()`" 4 | output: 5 | html_document: 6 | keep_md: no 7 | number_sections: yes 8 | self_contained: yes 9 | theme: default 10 | highlight: null 11 | css: Static/ont_tutorial.css 12 | toc: yes 13 | toc_depth: 2 14 | toc_float: 15 | collapsed: yes 16 | smooth_scroll: yes 17 | df_print: paged 18 | link-citations: yes 19 | bibliography: Static/Bibliography.bib 20 | --- 21 | 22 |
23 | ```{r, echo=FALSE} 24 | knitr::include_graphics("https://nanoporetech.com/themes/custom/nanopore/images/ont-logo.svg?tutorial=basicqc") 25 | ``` 26 |
27 | 28 | ```{r setup, include=FALSE} 29 | 30 | knitr::opts_chunk$set(fig.width=9, fig.height=6, warning=FALSE, message=FALSE, tidy = FALSE, cache.extra = packageVersion('tufte'), cache.path = "Results/KnitR") 31 | options(htmltools.dir.version = FALSE) 32 | 33 | library(data.table) 34 | library(digest) 35 | library(dplyr) 36 | library(plyr) 37 | library(emojifont) 38 | library(extrafont) 39 | library(ggplot2) 40 | library(knitr) 41 | library(RColorBrewer) 42 | library(tufte) 43 | library(caTools) 44 | library(yaml) 45 | library(fastmatch) # added to accommodate the guppy barcode content 46 | 47 | config <- yaml.load_file("config.yaml") 48 | inputFile <- config$inputFile 49 | flowcellId <- config$flowcellId 50 | basecaller <- config$basecaller 51 | tutorialText <- config$tutorialText 52 | 53 | # core parameters used for the presentation and configuration of report 54 | # not to be moved to config.yaml (yet) 55 | qcThreshold <- 7 56 | binFilter <- 5 57 | scaling <- 1 58 | reportDPI <- 90 59 | 60 | # make a results directory 61 | dir.create("Results", showWarnings = FALSE) 62 | 63 | slurpContent <- function(filename) { 64 | include = as.logical(tutorialText) 65 | if (include) { 66 | paste(readLines(filename),collapse="\n") 67 | } 68 | } 69 | 70 | ``` 71 | 72 | 73 | `r slurpContent("Static/TutorialPreamble.md")` 74 | 75 | 76 | # Executive summary 77 | 78 | 79 | ```{r executivesummary, fig.fullwidth = TRUE, echo=FALSE} 80 | 81 | # Using fread for fast and friendly import of sequence_summary file 82 | # no definition of column types to speed import and allow for merged files 83 | # could be worthwhile to fread(select=) to select only a subset of columns - this could 84 | # preclude e.g. barcode data or different versions? 
85 | sequencedata <- data.table::fread(inputFile, stringsAsFactors=FALSE) 86 | 87 | # remove the redundant headers from merged files 88 | if (length(which(sequencedata[,1]=="filename")) > 0) { 89 | sequencedata <- sequencedata[-which(sequencedata[,1]=="filename"),] 90 | } 91 | 92 | # coerce the columns used in analytics into more appropriate data-types 93 | sequencedata$channel<-as.numeric(sequencedata$channel) 94 | sequencedata$start_time<-as.numeric(sequencedata$start_time) 95 | sequencedata$duration<-as.numeric(sequencedata$duration) 96 | sequencedata$num_events<-as.numeric(sequencedata$num_events) 97 | sequencedata$sequence_length_template<-as.numeric(sequencedata$sequence_length_template) 98 | sequencedata$mean_qscore_template<-as.numeric(sequencedata$mean_qscore_template) 99 | 100 | # passes_filtering is a useful flag; but there are examples of sequencing_summary.txt where this 101 | # is not present - https://github.com/a-slide/pycoQC/blob/master/pycoQC/data/sequencing_summary_1D_DNA_Albacore_1.2.1.txt 102 | if (! "passes_filtering" %in% colnames(sequencedata)) { 103 | # set all of the reads to pass? apply a cutoff? 104 | sequencedata$passes_filtering <- TRUE 105 | } else { 106 | sequencedata$passes_filtering <- as.logical(sequencedata$passes_filtering) 107 | } 108 | # create a convenient separation of pass and fail ... 
109 | passedSeqs <- sequencedata[which(sequencedata$passes_filtering), ] 110 | failedSeqs <- sequencedata[which(!sequencedata$passes_filtering), ] 111 | 112 | ``` 113 | 114 | 115 | 116 | 117 | 118 | 119 | ```{r pseudoValueBox, include=TRUE, echo=FALSE, fig.fullwidth = TRUE, dpi=360, fig.width=9, fig.height=2.5} 120 | 121 | # calculate some basic, but key, metrics 122 | readCount <- formatC(nrow(sequencedata), big.mark=",") 123 | totalBases = sum(sequencedata$sequence_length_template,na.rm=T)/10^9 124 | passedBases = sum(passedSeqs$sequence_length_template,na.rm=T)/10^9 125 | gigabases <- round(totalBases,2) 126 | 127 | # prepare a data object to render a summary graphic 128 | figures <- 3 129 | df <- data.frame( 130 | x = cumsum(c(2, rep(6.5, figures-1))), 131 | y = rep(2, figures), 132 | h = rep(4, figures), 133 | w = rep(6, figures)) 134 | 135 | df$info <- c("flowcell", readCount, gigabases) 136 | df$key <- c(flowcellId,"Reads produced", "gigabases called") 137 | df$icon <- fontawesome(c('fa-qrcode', 'fa-filter', 'fa-file-text-o')) 138 | df$colour <- rep("steelblue", figures) 139 | 140 | 141 | # and display the plot 142 | ExecutiveSummaryValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) + 143 | geom_tile(fill = brewer.pal(9,"Blues")[7]) + 144 | geom_text(color = brewer.pal(9,"Blues")[3], hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=5) + 145 | geom_text(label=df$info, size=10, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") + 146 | geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=23, hjust="right", nudge_x=2.85, nudge_y=0.8) + 147 | coord_fixed() + 148 | theme_void() + 149 | guides(fill = F) 150 | 151 | # some juggling here to prepare a plot that can be rendered across platforms - windows/linux/osx 152 | ggsave(file.path("Results", "ExecutiveSummaryValueBoxes.png"), plot=ExecutiveSummaryValueBoxes, device="png", units="cm", width=25, height=5, 
dpi=reportDPI) 153 | knitr::include_graphics(file.path("Results", "ExecutiveSummaryValueBoxes.png")) 154 | 155 | ``` 156 | 157 | Basecalling was performed using the **`r config$basecaller`** software. Called reads were classified as either pass or fail depending on their mean quality score. For this analysis, a total of `r formatC(nrow(sequencedata), big.mark=",")` reads were basecalled and of these `r formatC(nrow(passedSeqs), big.mark=",")` (`r round(nrow(passedSeqs) / nrow(sequencedata) * 100, 1)`%) were passed as satisfying the quality metric. The passed reads contain a total of `r round(passedBases, 2)` Gb of DNA sequence. This passed fraction amounts to `r round(passedBases / totalBases * 100, 1)`% of the total DNA nucleotide bases sequenced. 158 | 159 | 160 | 161 | ```{r qcPassGauge, echo=FALSE, include=TRUE, fig.margin=TRUE} 162 | df <- data.frame(matrix(nrow=1, ncol = 3)) 163 | 164 | names(df) <- c("variable", "percentage","label") 165 | df$variable <- c("pass") 166 | df$percentage <- c(round(length(which(sequencedata$passes_filtering==TRUE)) / nrow(sequencedata), 3)) 167 | 168 | df <- df %>% mutate(group=ifelse(percentage <0.6, "red", 169 | ifelse(percentage>=0.6 & percentage<0.8, "orange","green")), 170 | label=paste0(df$percentage*100, "%")) 171 | 172 | title="Percentage of reads\npassing QC filter" 173 | 174 | ggplot(df, aes(fill = group, ymax = percentage, ymin = 0, xmax = 2, xmin = 1)) + 175 | geom_rect(aes(ymax=1, ymin=0, xmax=2, xmin=1), fill ="#ece8bd") + 176 | geom_rect() + 177 | coord_polar(theta = "y",start=-pi/2) + xlim(c(0, 2)) + ylim(c(0,2)) + 178 | guides(fill=FALSE) + 179 | guides(colour=FALSE) + 180 | theme_void() + 181 | theme(strip.background = element_blank(), strip.text.x = element_blank()) + 182 | geom_text(aes(x = 0, y = 0, label = label), size=13) + 183 | geom_text(aes(x=1.5, y=1.5, label=title), size=11) + 184 | scale_fill_manual(values = c("red"="#C9146C", "orange"="#DA9112", "green"="#129188")) + 185 | scale_colour_manual(values 
= c("red"="#C9146C", "orange"="#DA9112", "green"="#129188")) 186 | 187 | ``` 188 | 189 | \newpage 190 | 191 | 192 | # Sequencing channel activity plot 193 | 194 | The nanopores through which DNA is passed, and signal collected, are arrayed as a 2-dimensional matrix. A heatmap can be plotted showing channel productivity against spatial position on the matrix. Such a plot enables the identification of spatial artifacts that could result from membrane damage through e.g. the introduction of an air-bubble. This heatmap representation of spatial activity shows only gross spatial aberrations. Since each channel can address four different pores (Mux), the activity plot below shows the number of sequences produced per channel, not per pore. 195 | 196 | 197 | 198 | ```{r channel_plot, include=TRUE, echo=FALSE, fig.fullwidth = TRUE} 199 | # create an empty read count container ... MinION or PromethION?? 200 | 201 | # https://gist.github.com/roblanf/df47b9748c3aae00809cc675aca79989 202 | # build the map for R9.5 flowcell, as a long-form dataframe that translates 203 | # channels into rows and columns on the flowcell. Good for plotting in R. 
204 | p1 = data.frame(channel=33:64, row=rep(1:4, each=8), col=rep(1:8, 4)) 205 | p2 = data.frame(channel=481:512, row=rep(5:8, each=8), col=rep(1:8, 4)) 206 | p3 = data.frame(channel=417:448, row=rep(9:12, each=8), col=rep(1:8, 4)) 207 | p4 = data.frame(channel=353:384, row=rep(13:16, each=8), col=rep(1:8, 4)) 208 | p5 = data.frame(channel=289:320, row=rep(17:20, each=8), col=rep(1:8, 4)) 209 | p6 = data.frame(channel=225:256, row=rep(21:24, each=8), col=rep(1:8, 4)) 210 | p7 = data.frame(channel=161:192, row=rep(25:28, each=8), col=rep(1:8, 4)) 211 | p8 = data.frame(channel=97:128, row=rep(29:32, each=8), col=rep(1:8, 4)) 212 | 213 | q1 = data.frame(channel=1:32, row=rep(1:4, each=8), col=rep(16:9, 4)) 214 | q2 = data.frame(channel=449:480, row=rep(5:8, each=8), col=rep(16:9, 4)) 215 | q3 = data.frame(channel=385:416, row=rep(9:12, each=8), col=rep(16:9, 4)) 216 | q4 = data.frame(channel=321:352, row=rep(13:16, each=8), col=rep(16:9, 4)) 217 | q5 = data.frame(channel=257:288, row=rep(17:20, each=8), col=rep(16:9, 4)) 218 | q6 = data.frame(channel=193:224, row=rep(21:24, each=8), col=rep(16:9, 4)) 219 | q7 = data.frame(channel=129:160, row=rep(25:28, each=8), col=rep(16:9, 4)) 220 | q8 = data.frame(channel=65:96, row=rep(29:32, each=8), col=rep(16:9, 4)) 221 | 222 | # long form as a data frame, i.e. 
map$channel[[1]] returns 33 223 | channelMap = rbind(p1, p2, p3, p4, p5, p6, p7, p8, q1, q2, q3, q4, q5, q6, q7, q8) 224 | 225 | hm.palette <- colorRampPalette(brewer.pal(9, 'Blues'), space='Lab') #RdPu, Oranges, Greens, YlOrRd, Purples 226 | 227 | channelCounts <- as.data.frame(matrix(rep(0, 512), ncol=1)) 228 | channelCountRaw <- as.data.frame(table(unlist(sequencedata[, "channel"])), row.names=1) 229 | channelCounts[row.names(channelCountRaw),] <- channelCountRaw[,1] 230 | #channelMap <- cbind(channelMap[channelMap$channel,], frequency=channelCounts[channelMap$channel,]) 231 | channelMap <- merge(channelMap, channelCounts, by.x="channel", by.y=0) 232 | colnames(channelMap)[4]<-"count" 233 | channelMapMatrix <- reshape2::acast(channelMap, col ~ row, value.var = "count") 234 | 235 | theme_update(plot.title = element_text(hjust = 0.5)) 236 | 237 | ggplot(channelMap, aes(x = row, y = col, fill = count)) + 238 | geom_tile() + 239 | geom_text(data=channelMap,aes(x=row, y=col,label=count,color=count),show.legend = F, size=2.5) + 240 | scale_x_discrete(breaks=NULL) + 241 | scale_y_discrete(breaks=NULL) + 242 | coord_equal() + 243 | scale_fill_gradientn(colours = hm.palette(100)) + 244 | scale_color_gradient2(low = hm.palette(100), high = hm.palette(1)) + 245 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 246 | labs(title="Channel activity plot showing number of reads per flowcell channel") + 247 | theme(panel.border = element_blank(), panel.grid.major = element_blank(), 248 | panel.grid.minor = element_blank(), 249 | axis.title.x = element_blank(), 250 | axis.title.y = element_blank(), 251 | legend.position="bottom", 252 | legend.key.width=unit(5.6,"cm")) 253 | 254 | ``` 255 | 256 | 257 | 258 | 259 | \pagebreak 260 | 261 | # Quality and length 262 | 263 | The distribution of base-called DNA sequence lengths and their accompanying qualities are key metrics for the review of a sequencing library. 
This section of the QC review tutorial assesses the length and quality distributions for reads from this flowcell. We will review the total collection of sequences, including those that fail the mean quality filter. The information plot below summarises key metrics from the sequence library. Mean read length, N50, mean read quality and longest read are calculated from the reads that pass the QC threshold. Mean failed QC is calculated from only the reads that failed QC. 264 | 265 | 266 | ```{r summaryStatMeasures, include=FALSE, echo=FALSE} 267 | 268 | lenSorted <- rev(sort(passedSeqs$sequence_length_template)) 269 | passedMeanLength = round(mean(lenSorted), digits = 0) 270 | N50 <- lenSorted[cumsum(lenSorted) >= sum(lenSorted)*0.5][1] 271 | passedMeanQ = round(mean(passedSeqs$mean_qscore_template), digits = 1) 272 | failedMeanQ = round(mean(failedSeqs$mean_qscore_template), digits = 1) 273 | 274 | #N50 length is the length of the shortest contig such that the sum of contigs of equal length or longer is at least 50% of the total length of all contigs 275 | ``` 276 | 277 | 278 | ```{r seqInfoPlot, include=TRUE, echo=FALSE, fig.fullwidth = TRUE, dpi=360, fig.width=9, fig.height=2} 279 | 280 | figures <- 5 281 | 282 | df <- data.frame( 283 | x = cumsum(c(2, rep(6.5, figures-1))), 284 | y = rep(2, figures), 285 | h = rep(4, figures), 286 | w = rep(6, figures)) 287 | 288 | df$info <- c(passedMeanLength, N50, passedMeanQ, failedMeanQ, prettyNum(max(passedSeqs$sequence_length_template), big.mark=",")) 289 | df$key <- c("Mean Read Length (nt)","N50","Mean Read Quality (QV)","Mean Failed QV","Longest Read") 290 | df$icon <- fontawesome(c("fa-bar-chart", "fa-play", "fa-area-chart", "fa-bug", "fa-sort")) 291 | 292 | df$colour <- rep("steelblue", figures) 293 | 294 | ReadCharacteristicsValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) + 295 | geom_tile(fill = brewer.pal(9,"Blues")[7]) + 296 | geom_text(color = brewer.pal(9,"Blues")[3], 
hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=3.5) + 297 | geom_text(label=df$info, size=5.5, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") + 298 | geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=13.3, hjust="right", nudge_x=2.85, nudge_y=0.8) + 299 | coord_fixed() + 300 | scale_fill_brewer(type = "qual",palette = "Dark2") + 301 | theme_void() + 302 | guides(fill = F) 303 | 304 | 305 | 306 | ggsave(file.path("Results", "ReadCharacteristicsValueBoxes.png"), plot=ReadCharacteristicsValueBoxes, device="png", units="cm", width=25, height=5, dpi=reportDPI) 307 | 308 | knitr::include_graphics(file.path("Results", "ReadCharacteristicsValueBoxes.png")) 309 | 310 | ``` 311 | 312 | 313 | The distribution of sequence lengths will be dependent on the protocols that have been used to extract and/or prepare the starting DNA. Sequences from amplicon DNA will have a tight distribution of read lengths, while sequences from genomic DNA will have a broader distribution. The distribution will be further influenced if a size-selection step has been used, and will also be dependent on the choice of sequencing library preparation kits. The read-length distribution should be assessed to see if the distribution is concordant with that expected. 314 | 315 | \hfill\break 316 | 317 | 318 | 319 | ```{r weightedreadlength, echo=FALSE, include=TRUE} 320 | 321 | # https://stackoverflow.com/questions/6461209/how-to-round-up-to-the-nearest-10-or-100-or-x 322 | roundUpNice <- function(x, nice=seq(from=1, to=10, by=0.25)) { 323 | if(length(x) != 1) stop("'x' must be of length 1") 324 | 10^floor(log10(x)) * nice[[which(x <= 10^floor(log10(x)) * nice)[[1]]]] 325 | } 326 | 327 | # pick a friendly upper limit to render sequence lengths into a histogram 328 | # here we're aiming for a robustly rounded up 97.5 quantile of the data (skip a few outliers ...) 
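# worked example (hypothetical input value): roundUpNice(8432) returns 8500,
# since 10^floor(log10(8432)) = 1000 and 8.5 is the first "nice" multiplier
# for which 1000 * 8.5 >= 8432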
329 | upperLimit <- roundUpNice(as.numeric(quantile(x=sequencedata$sequence_length_template, probs=c(0.975)))) 330 | 331 | # an ideal histogram will have 40 or so bins 332 | histogramBinCount <- 40 333 | breakVal = roundUpNice(upperLimit / histogramBinCount) 334 | 335 | breaks <- seq(0, to=upperLimit, by=breakVal) 336 | 337 | binAssignments <- cut(sequencedata$sequence_length_template, breaks, include.lowest=TRUE, right=FALSE) 338 | 339 | scrapeBinnedBases <- function(level, qcpass) { 340 | sum(subset(sequencedata[which(binAssignments == level), ], passes_filtering==qcpass)$sequence_length_template) 341 | } 342 | 343 | passedBinnedBases <- unlist(lapply(levels(binAssignments), scrapeBinnedBases, qcpass=TRUE)) 344 | failedBinnedBases <- unlist(lapply(levels(binAssignments), scrapeBinnedBases, qcpass=FALSE)) 345 | 346 | binnedBaseDist <- data.frame(length=head(breaks, -1), pass=passedBinnedBases, fail=failedBinnedBases) 347 | binnedBaseMelt <- reshape2::melt(binnedBaseDist, id.vars=c("length")) 348 | 349 | ggplot(binnedBaseMelt, aes(x=length, fill=variable, y=value)) + 350 | geom_bar(stat="identity") + 351 | xlab("Read length\n") + ylab("Number of bases sequenced\n") + 352 | scale_fill_manual("QC", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) + 353 | scale_x_continuous(limits=c(-breakVal,upperLimit), breaks=pretty(passedSeqs$sequence_length_template,n=40)) + 354 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 355 | labs(title="Histogram showing the number of sequenced bases against sequence length", fill="QV filter")+ 356 | geom_vline(xintercept = N50, size = 1) + 357 | annotate("text", x=N50, y=max(passedBinnedBases + failedBinnedBases), label = " N50", hjust=0, colour="SteelBlue") + 358 | geom_vline(xintercept = passedMeanLength, size = 1) + 359 | annotate("text", x=passedMeanLength, y=max(passedBinnedBases + failedBinnedBases), label = " Mean", hjust=0, colour="SteelBlue") 360 | 361 | ``` 362 | 363 | 364 | The 
weighted read length histogram above shows the binned distribution of sequence length against the number of sequence nucleotides contained within each bin. This plot will show clear peaks if, for example, amplicons are sequenced or if a size-selection step has been performed. The histogram includes annotations for N50 and mean sequence sizes. N50 describes the sequence length where 50% of the sequenced bases are contained within reads of this length, or longer. The mean sequence length is the average sequence length across the whole sequence collection. The N50 and mean sequence lengths plotted are calculated from only the sequence reads that pass QC. 365 | 366 | 367 | ```{r lengthdistribution, include=TRUE, cache=FALSE, fig.fullwidth=FALSE, echo=FALSE} 368 | 369 | scrapeBinnedReads <- function(level, qcpass) { 370 | length(subset(sequencedata[which(binAssignments == level), ], passes_filtering==qcpass)$sequence_length_template) 371 | } 372 | 373 | passedBinnedReads <- unlist(lapply(levels(binAssignments), scrapeBinnedReads, qcpass=TRUE)) 374 | failedBinnedReads <- unlist(lapply(levels(binAssignments), scrapeBinnedReads, qcpass=FALSE)) 375 | 376 | binnedReadDist <- data.frame(length=head(breaks, -1), pass=passedBinnedReads, fail=failedBinnedReads) 377 | binnedReadMelt <- reshape2::melt(binnedReadDist, id.vars=c("length")) 378 | 379 | ggplot(binnedReadMelt, aes(x=length, fill=variable, y=value)) + 380 | geom_bar(stat="identity") + 381 | xlab("Read length\n") + ylab("Number of reads\n") + 382 | scale_fill_manual("QC", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) + 383 | scale_x_continuous(limits=c(-breakVal,upperLimit), breaks=pretty(passedSeqs$sequence_length_template,n=40)) + 384 | theme(axis.text.x = element_text(angle = 90, hjust = 1)) + 385 | labs(title="Histogram showing distribution of read lengths across quality passing sequences", fill="QV filter")+ 386 | geom_vline(xintercept = N50, size = 1) + 387 | annotate("text", x=N50,
y=max(passedBinnedReads + failedBinnedReads), label = " N50", hjust=0, colour="SteelBlue") + 388 | geom_vline(xintercept = passedMeanLength, size = 1) + 389 | annotate("text", x=passedMeanLength, y=max(passedBinnedReads + failedBinnedReads), label = " Mean", hjust=0, colour="SteelBlue") 390 | 391 | ``` 392 | 393 | 394 | \hfill\break 395 | A histogram of mean QV scores reveals the relative abundance of sequences of different qualities. The distribution of sequence qualities is shaded by the QV filter pass status. This QV filter is applied during the base-calling process as a modifiable parameter. For downstream analyses we recommend processing only the higher-quality sequence reads. 396 | \hfill\break 397 | 398 | ```{r QvalueDistributions, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=FALSE} 399 | 400 | ggplot(sequencedata, aes(x=mean_qscore_template, fill=passes_filtering)) + 401 | geom_histogram(breaks=seq(from=0, to=15, by=0.1)) + 402 | scale_fill_manual(name="QC", values=c("TRUE"=brewer.pal(6, "Paired")[2], "FALSE"=brewer.pal(6, "Paired")[1]), labels=c( "pass", "fail"), breaks=c("TRUE", "FALSE")) + 403 | labs(title="Plot showing distribution of quality scores across all reads") + 404 | xlab("Mean Q score of read") + 405 | ylab("Number of reads") 406 | ``` 407 | 408 | 409 | 410 | \hfill\break 411 | The plot above shows the distribution of mean read quality scores across the whole sequence collection. The distribution has been shaded for the sequence reads that have passed or failed the base-caller's quality filter.
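The QV values discussed above are Phred-scaled scores, and the relationship between a mean Q score and its implied base-call error rate can be sketched with two small helper functions. This is an illustrative aside rather than part of the report's calculations, and the QV 7 threshold below is an example value only:

```r
# Phred scaling: QV = -10 * log10(p), where p is the per-base error probability
qv_to_error <- function(qv) 10^(-qv / 10)
error_to_qv <- function(p) -10 * log10(p)

qv_to_error(10)           # a mean QV of 10 implies a 10% expected error rate
round(qv_to_error(7), 2)  # an example threshold of QV 7 implies a ~20% error rate
error_to_qv(0.05)         # a 5% error rate corresponds to QV ~13
```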
412 | \hfill\break 413 | 414 | 415 | ```{r lengthQualityDensityPlots, include=TRUE, cache=FALSE, echo=FALSE, warning=FALSE} 416 | # prepare the density plot, but do not render 417 | lq_dens <- ggplot(sequencedata, aes(log10(sequence_length_template), mean_qscore_template)) + geom_bin2d(bins=100) 418 | # extract the density map from the plot 419 | lq_dens_counts <- ggplot_build(lq_dens)$data[[1]] 420 | if (binFilter > 0) { 421 | # keep only the density-map bins with a sequence count above the threshold 422 | lq_dens_counts <- lq_dens_counts[lq_dens_counts$count > binFilter, ] 423 | } 424 | # directly plot this modified density map (stat="identity") 425 | ggplot(lq_dens_counts) + 426 | geom_bin2d(aes(x,y,fill=count), stat="identity") + 427 | scale_fill_distiller(palette="Blues", trans="reverse") + 428 | geom_hline(yintercept = qcThreshold, size = 1) + 429 | xlab("log10(read length)") + 430 | ylab("read mean quality") + 431 | scale_x_continuous(breaks = c(1,2,3,4,5), labels = c("10", "100", "1000", "10,000", "100,000")) + 432 | annotation_logticks(base = 10, sides = "b", scaled = TRUE) + 433 | labs(title="Contour Plot showing distribution of quality scores against log10 read lengths (all reads)") 434 | 435 | 436 | ``` 437 | 438 | \hfill\break 439 | The density plot of mean sequence quality against log10 sequence length is a useful graphic for revealing patterns within the broader sequence collection. The density plot shown in the figure above has been de-speckled by omitting the rarer bins that contain `r binFilter` reads or fewer. This is mainly aesthetic and masks some speckle around the periphery of the main density map. 440 | \hfill\break 441 | 442 | 443 | \pagebreak 444 | 445 | # Time/duty performance 446 | 447 | Another key metric in the quality review of a sequencing run is an analysis of the temporal performance of the run.
During a run, each sequencing channel will address a number of different pores (the mux) and the individual pores may become temporarily or permanently blocked. It is therefore expected that sequencing productivity will decrease during a run. It is useful to consider whether the observed productivity decline is normal or whether it happens more rapidly than expected. A rapid pore decline could be indicative of contaminants within the sequencing library. 448 | 449 | \hfill\break 450 | 451 | Plotting the number of bases generated per unit time over the course of a sequencing run can reveal unexpected behaviours. In an ideal experiment there should not be any sudden decreases in performance. 452 | 453 | 454 | ```{r basesPerTimeBin, include=TRUE, cache=FALSE, echo=FALSE} 455 | 456 | sequencedata$start_time <- sequencedata$start_time - min(sequencedata$start_time) 457 | sequencedata$start_time <- sequencedata$start_time / scaling 458 | 459 | # assuming a 48 hour run, binned at hourly (60 minute) intervals 460 | sampleHours = 48 461 | sampleIntervalMinutes = 60 462 | 463 | breaks = seq(0, sampleHours*60*60, by=60*sampleIntervalMinutes) 464 | binass <- findInterval(sequencedata$start_time, breaks) 465 | 466 | mergeItPerHour <- function(interval, binnedAssignments, filter) { 467 | totalbases = 0 468 | if (length(which(binnedAssignments==interval))>0) { 469 | subset <- sequencedata[which(binnedAssignments==interval), ] 470 | if (length(which(subset$passes_filtering == filter)) > 0) { 471 | totalbases = sum(subset[which(subset$passes_filtering == filter), "sequence_length_template"]) 472 | } 473 | } 474 | # scale the returned value - totalbases is the total bases within one interval (sampleIntervalMinutes) - to gigabases per hour 475 | return(totalbases / 1e9 / sampleIntervalMinutes * 60) 476 | } 477 | 478 | binnedTemporalDataPerHour <- data.frame( 479 | cbind( 480 | time=breaks, 481 | pass=unlist(lapply(seq(breaks), mergeItPerHour, binnedAssignments=binass, filter=TRUE)), 482 | fail=unlist(lapply(seq(breaks),
mergeItPerHour, binnedAssignments=binass, filter=FALSE)) 483 | ) 484 | ) 485 | 486 | binnedTemporalDataPerHour$time <- binnedTemporalDataPerHour$time / 60 / 60 487 | 488 | ggplot(binnedTemporalDataPerHour, aes(time)) + 489 | geom_line(aes(y = fail, colour = "fail"), size=1) + 490 | geom_line(aes(y = pass, colour = "pass"), size=1) + 491 | scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) + 492 | xlab("Time (hours)") + 493 | ylab("Gigabases sequenced per hour") + 494 | labs(title="Plot showing sequence throughput against time") 495 | 496 | ``` 497 | 498 | 499 | The temporal data presented in the figure above has been scaled to gigabases of sequence produced per hour. For a finer resolution of performance the **`R`** code that prepares this report could be modified for a more frequent sample interval. 500 | 501 | 502 | ```{r temporalPerformance, echo=FALSE} 503 | 504 | # binnedTemporalDataPerHour is scaled to Gbp per hour - rescale to raw for cumulative plotting 505 | binnedTemporalDataPerHour$pass <- binnedTemporalDataPerHour$pass / 60 * sampleIntervalMinutes 506 | binnedTemporalDataPerHour$fail <- binnedTemporalDataPerHour$fail / 60 * sampleIntervalMinutes 507 | 508 | # https://stackoverflow.com/questions/31404679/can-ggplot2-find-the-intersections-or-is-there-any-other-neat-way 509 | acquireTimePoints <- which(binnedTemporalDataPerHour$pass > 0) 510 | targetInterpolate <- approxfun(x=binnedTemporalDataPerHour[acquireTimePoints, "time"], y=cumsum(binnedTemporalDataPerHour[acquireTimePoints, "pass"])) 511 | 512 | base50 <- sum(passedSeqs$sequence_length_template)/1e9*0.5 513 | base90 <- sum(passedSeqs$sequence_length_template)/1e9*0.9 514 | 515 | T50 <- optimize(function(t0) abs(targetInterpolate(t0) - base50), 516 | interval = range(binnedTemporalDataPerHour[acquireTimePoints, "time"])) 517 | T90 <- optimize(function(t0) abs(targetInterpolate(t0) - base90), 518 | interval = 
range(binnedTemporalDataPerHour[acquireTimePoints, "time"])) 519 | ``` 520 | 521 | 522 | 523 | 524 | ```{r cumulativeSequencePlotBP, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=FALSE} 525 | 526 | ggplot(binnedTemporalDataPerHour, aes(time)) + 527 | geom_line(aes(y = cumsum(fail), colour = "fail"), size=1) + 528 | geom_line(aes(y = cumsum(pass), colour = "pass"), size=1) + 529 | scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) + 530 | geom_segment(x=T50$minimum, y=0, xend=T50$minimum, yend=base50, colour="darkgray", size=1) + 531 | geom_segment(x=0, y=base50, xend=T50$minimum, yend=base50, colour="darkgray", size=1) + 532 | annotate("text", x=T50$minimum, y=base50, label=" T50", vjust=1, hjust=0, colour="SteelBlue") + 533 | geom_segment(x=T90$minimum, y=0, xend=T90$minimum, yend=base90, colour="darkgray", size=1) + 534 | geom_segment(x=0, y=base90, xend=T90$minimum, yend=base90, colour="darkgray", size=1) + 535 | annotate("text", x=T90$minimum, y=base90, label=" T90", vjust=1, hjust=0, colour="SteelBlue") + 536 | xlab("Time (hours)") + 537 | ylab("Number of bases sequenced (Gigabases)") + 538 | labs(title="Plot showing cumulative bases sequenced against time") 539 | 540 | ``` 541 | 542 | In addition to plotting the temporal production of data, the cumulative plot above shows how data accumulates during the run. From this dataset, we have measured a total of `r round(sum(passedSeqs$sequence_length_template)/1e9, 2)` Gb of quality-passing sequence. We can identify the timepoint T50 - the time by which 50% of the sequenced bases had been collected, or `r round(T50$minimum, 2)` hours in this example. This is displayed on the graph along with T90, the time at which 90% of the sequenced bases had been acquired.
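The T50/T90 calculation above can be illustrated with a self-contained sketch using toy cumulative-yield values (not data from this run): interpolate the cumulative curve with `approxfun`, then minimise the distance to the target yield with `optimize`.

```r
# toy cumulative yield: hours elapsed against cumulative gigabases collected
time  <- c(0, 6, 12, 24, 48)
yield <- c(0, 4, 6, 7.5, 8)

interp <- approxfun(x = time, y = yield)   # piecewise-linear interpolation
T50 <- optimize(function(t0) abs(interp(t0) - max(yield) * 0.5),
                interval = range(time))$minimum
# half of the 8 Gb total had been collected by ~6 hours, so T50 is ~6
```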
543 | 544 | 545 | ```{r cumulativeSequencePlotReads, include=TRUE, cache=FALSE, echo=FALSE, fig.margin=TRUE} 546 | 547 | mergeItReadsPerHour <- function(interval, binnedAssignments, filter) { 548 | totalreads = 0 549 | if (length(which(binnedAssignments==interval))>0) { 550 | subset <- sequencedata[which(binnedAssignments==interval), ] 551 | if (length(which(subset$passes_filtering == filter)) > 0) { 552 | totalreads = nrow(subset[which(subset$passes_filtering == filter),]) 553 | } 554 | } 555 | # scale results to mean millions of reads per hour 556 | return(totalreads / 1e6 / sampleIntervalMinutes * 60) 557 | } 558 | 559 | binnedTemporalDataReadsPerHour <- data.frame( 560 | cbind(time=breaks, 561 | pass=unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments=binass, filter=TRUE)), 562 | fail=unlist(lapply(seq(breaks), mergeItReadsPerHour, binnedAssignments=binass, filter=FALSE)) 563 | ) 564 | ) 565 | 566 | binnedTemporalDataReadsPerHour$time <- binnedTemporalDataReadsPerHour$time / 60 / 60 567 | # binnedTemporalDataReadsPerHour is scaled to millions of reads per hour - rescale to raw counts for cumulative plotting 568 | binnedTemporalDataReadsPerHour$pass <- binnedTemporalDataReadsPerHour$pass / 60 * sampleIntervalMinutes 569 | binnedTemporalDataReadsPerHour$fail <- binnedTemporalDataReadsPerHour$fail / 60 * sampleIntervalMinutes 570 | 571 | ggplot(binnedTemporalDataReadsPerHour, aes(time)) + 572 | geom_line(aes(y = cumsum(fail), colour = "fail"), size=1) + 573 | geom_line(aes(y = cumsum(pass), colour = "pass"), size=1) + 574 | scale_color_manual(name="QV", values=c("fail"=brewer.pal(6, "Paired")[1], "pass"=brewer.pal(6, "Paired")[2])) + 575 | xlab("Time (hours)") + 576 | ylab("Number of reads sequenced (Millions)") + 577 | labs(title="Plot showing cumulative reads sequenced against time") 578 | 579 | ``` 580 | 581 | In addition to the cumulative plot of sequenced bases, an equivalent cumulative plot can be prepared for the number of sequenced reads - this is shown in the figure above.
Its structure is broadly similar to that of the cumulative base plot. 582 | 583 | \hfill\break 584 | 585 | The speed/time plot is a useful tool to observe any substantial changes in sequencing speed. A marked slow-down in sequencing speed can indicate challenges within the sequencing chemistry that may have been caused by the method of DNA isolation or by an abundance of short DNA fragments. Please contact our technical team if you see a profound slowdown in your sequencing. 586 | 587 | 588 | ```{r timeSpeedPlot, include=TRUE, cache=FALSE, echo=FALSE} 589 | 590 | speedTime <- data.frame(segment=binass, rate=sequencedata$sequence_length_template / (sequencedata$duration/scaling)) 591 | 592 | ggplot(speedTime, aes(x=segment, y=rate, group=segment)) + geom_boxplot(fill="steelblue", outlier.shape=NA) + scale_x_continuous(name="Time (hours)") + ylab("Sequencing rate (bases per second)") + labs(title="Boxplot showing distribution of translocation speed against time") 593 | 594 | ``` 595 | 596 | 597 | 598 | The data points in the speed-time plot above have been filtered to mask outlying sequences (those beyond the 95% range). The distribution of the boxplots and their 'whiskers' is unchanged.
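The temporal plots in this section all reuse the same hourly binning: each read's start time (in seconds) is assigned to a time window with base R's `findInterval`. A minimal sketch with made-up start times:

```r
start_time <- c(10, 3700, 7300, 7400)         # toy read start times, in seconds
breaks <- seq(0, 48 * 60 * 60, by = 60 * 60)  # hourly boundaries across a 48 hour run
findInterval(start_time, breaks)              # bins 1, 2, 3, 3 - hours 0-1, 1-2, 2-3
```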
599 | 600 | 601 | 602 | 603 | 604 | ```{r activeChannelsPlot, include=TRUE, cache=FALSE, echo=FALSE} 605 | 606 | mergeActiveChannels <- function(interval, binnedAssignments) { 607 | totalChannels = 0 608 | if (length(which(binnedAssignments==interval))>0) { 609 | subset <- sequencedata[which(binnedAssignments==interval), ] 610 | totalChannels = length(unique(subset$channel)) 611 | } 612 | return(totalChannels) 613 | } 614 | 615 | 616 | binnedTemporalChannels <- data.frame(time=breaks, 617 | channels=unlist(lapply(seq(breaks), mergeActiveChannels, binnedAssignments=binass) 618 | ) 619 | ) 620 | 621 | binnedTemporalChannels$time <- binnedTemporalChannels$time / 60 / 60 622 | 623 | ggplot(binnedTemporalChannels, aes(time)) + 624 | geom_step(aes(y = channels), size=1, colour = "Steelblue") + 625 | xlab("Time (hours)") + 626 | ylab("Number of channels producing data") + 627 | labs(title="Plot showing number of functional channels against time") 628 | 629 | 630 | ``` 631 | 632 | The graph presented above shows the number of sequencing channels that are actively producing data across time. A channel is defined as being active if one or more sequence reads are observed per time window (one hour for the default graph). It is expected that over the course of the run pores will block and the number of channels producing data will decrease. Changing the pore used by the channel (mux) and strategies to unblock pores mean that the number of functional channels may increase or decrease at a given timepoint but generally the number of channels producing data will decrease over time. 633 | 634 | 635 | \pagebreak 636 | 637 | # Demultiplexing 638 | 639 | 640 | ```{r demultiplex, cache=FALSE, echo=FALSE} 641 | barcodes = 0 642 | barcodeUnass = 0 643 | barcodeRange <- c(0,0) 644 | 645 | # if barcode_arrangement is lacking this could still be guppy called sequence? 
646 | if (!"barcode_arrangement" %in% colnames(passedSeqs)) { 647 | 648 | if ("barcodeFile" %in% names(config) && file.exists(config$barcodeFile)) { 649 | barcodedata <- data.table::fread(config$barcodeFile, select=c("read_id", "barcode_arrangement"), showProgress=TRUE, stringsAsFactors=FALSE) 650 | pso <- order(passedSeqs$read_id, method="radix") 651 | passedSeqs <- passedSeqs[pso, ] 652 | 653 | bco <- order(barcodedata$read_id, method="radix") 654 | barcodedata <- barcodedata[bco, ] 655 | 656 | barcodeMapping <- fmatch(passedSeqs$read_id, barcodedata$read_id) 657 | passedSeqs$barcode_arrangement <- barcodedata[barcodeMapping,c("barcode_arrangement")] 658 | } 659 | 660 | } 661 | ``` 662 | 663 | 664 | ```{r demultiplexB, cache=FALSE, echo=FALSE} 665 | if ("barcode_arrangement" %in% names(passedSeqs)) { 666 | barcodedata=plyr::count(passedSeqs$barcode_arrangement) 667 | barcodedata=subset(barcodedata, freq > 150) 668 | names(barcodedata) <- gsub("x", "barcode", names(barcodedata)) 669 | if ("unclassified" %in% barcodedata$barcode) { 670 | barcodes <- nrow(barcodedata[-which(barcodedata$barcode=="unclassified"),]) 671 | barcodeUnass <- sum(barcodedata[-which(barcodedata$barcode=="unclassified"),"freq"]) / sum(barcodedata$freq) * 100 672 | barcodeRange <- range(subset(barcodedata, barcode!="unclassified")$freq) 673 | } else { 674 | barcodes <- nrow(barcodedata) 675 | barcodeUnass = 100 676 | barcodeRange <- range(barcodedata$freq) 677 | } 678 | } 679 | figures <- 3 680 | 681 | df <- data.frame( 682 | x = cumsum(c(2, rep(6.5, figures-1))), 683 | y = rep(2, figures), 684 | h = rep(4, figures), 685 | w = rep(6, figures)) 686 | 687 | df$info <- c(round(barcodeUnass, digits = 1), barcodes, paste(barcodeRange,collapse="\n")) 688 | df$key <- c("Reads with barcode (%)","Barcoded libraries", "barcode variance") 689 | df$icon <- fontawesome(c('fa-pie-chart', 'fa-barcode', 'fa-sliders')) 690 | df$colour <- rep("steelblue", figures) 691 | 692 | 693 | ``` 694 | 695 | ```{r 
pseudoValueBarcodes, include=TRUE, echo=FALSE, fig.fullwidth = TRUE, dpi=360, fig.width=9, fig.height=2.5} 696 | if (barcodes > 0) { 697 | 698 | MultiplexCharacteristicsValueBoxes <- ggplot(df, aes(x, y, height = h, width = w, label = key, fill = colour)) + 699 | geom_tile(fill = brewer.pal(9,"Blues")[7]) + 700 | geom_text(color = brewer.pal(9,"Blues")[3], hjust="left", nudge_y=-1.5, nudge_x=-2.6, size=5) + 701 | geom_text(label=df$info, size=10, color = brewer.pal(9,"Blues")[2], fontface = "bold", nudge_x=-2.6, hjust="left") + 702 | geom_text(label=df$icon, family='fontawesome-webfont', colour=brewer.pal(9,"Blues")[5], size=23, hjust="right", nudge_x=2.85, nudge_y=0.8) + 703 | coord_fixed() + 704 | scale_fill_brewer(type = "qual",palette = "Dark2") + 705 | theme_void() + 706 | guides(fill = F) 707 | 708 | 709 | 710 | ggsave(file.path("Results", "MultiplexCharacteristicsValueBoxes.png"), plot=MultiplexCharacteristicsValueBoxes, device="png", units="cm", width=25, height=5, dpi=reportDPI) 711 | 712 | knitr::include_graphics(file.path("Results", "MultiplexCharacteristicsValueBoxes.png")) 713 | 714 | } 715 | ``` 716 | 717 | 718 | 719 | 720 | ```{r convenienceScoringLibraries, echo=FALSE} 721 | if (barcodes > 0) { 722 | 723 | # https://www.ncbi.nlm.nih.gov/assembly/help/ 724 | ncalc <- function(len.vector, n) { 725 | # N50 - length such that scaffolds of this length or longer include half the bases of the assembly 726 | len.sorted <- rev(sort(len.vector)) 727 | len.sorted[cumsum(len.sorted) >= sum(len.sorted)*n][1] 728 | } 729 | 730 | lcalc <- function(len.vector, n) { 731 | # L50 - number of scaffolds that are longer than, or equal to, the N50 length and therefore include half the bases of the assembly 732 | len.sorted <- rev(sort(len.vector)) 733 | which(cumsum(len.sorted) >= sum(len.sorted)*n)[1] 734 | } 735 | 736 | 737 | N50 <- ncalc(passedSeqs$sequence_length_template, 0.5) 738 | N90 <- ncalc(passedSeqs$sequence_length_template, 0.9) 739 | L50 <- 
lcalc(passedSeqs$sequence_length_template, 0.5) 740 | L90 <- lcalc(passedSeqs$sequence_length_template, 0.9) 741 | 742 | seqSummary <- function(barcodeId, myBarcode, myVector, myMethod, xlist=NA) { 743 | subVector <- myVector[which(myBarcode == barcodeId)] 744 | params <- list(subVector) 745 | if (!is.na(xlist)) { 746 | params <- append(params, xlist) 747 | } 748 | do.call(myMethod, params) 749 | } 750 | 751 | barcodedata <- cbind(barcodedata, "%"=round(barcodedata$freq / sum(barcodedata$freq) * 100, digits=1)) 752 | 753 | barcodedata <- cbind(barcodedata, Mb=round(unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="sum")) / 1e06, digits=0)) 754 | 755 | barcodedata <- cbind(barcodedata, min=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="min"))) 756 | 757 | barcodedata <- cbind(barcodedata, max=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="max"))) 758 | 759 | barcodedata <- cbind(barcodedata, mean=round(unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="mean")), digits=0)) 760 | 761 | barcodedata <- cbind(barcodedata, N50=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="ncalc", xlist=list(n=0.5)))) 762 | 763 | barcodedata <- cbind(barcodedata, L50=unlist(lapply(as.character(barcodedata$barcode), seqSummary, myBarcode=passedSeqs$barcode_arrangement, myVector=passedSeqs$sequence_length_template, myMethod="lcalc", xlist=list(n=0.5)))) 764 | 765 | kable(barcodedata, format="markdown", caption="Table summarising barcode content") 
766 | 767 | } 768 | ``` 769 | 770 | `r if (barcodes > 0) {"The table above shows summary statistics for the barcode assignments within this sequence collection. The annotated barcode is presented along with the number of sequence reads assigned to it (freq), the percentage of reads assigned to the barcode (%), the megabases of DNA sequence (Mb), shortest read in nucleotides (min), longest read in nucleotides (max), mean sequence length in nucleotides (mean) and N50 and L50 values, again in nucleotides."}` 771 | 772 | 773 | 774 | ```{r demultiplex_plot, include=TRUE, echo=FALSE, cache=FALSE, fig.margin=TRUE} 775 | if (barcodes > 0) { 776 | if ("barcode_arrangement" %in% names(passedSeqs)) { 777 | # it's a run that used the --barcoding flag 778 | 779 | #library(extrafont) 780 | #loadfonts(device = "win") 781 | ggplot(barcodedata, aes(barcode, freq, fill=barcode)) + 782 | geom_bar(stat="identity", width=0.5, fill="#9ecae1") + 783 | xlab("\nDemultiplexed barcodes") + 784 | ylab("\nFrequency") + 785 | scale_y_continuous(expand = c(0,0)) + 786 | labs(title="Histogram showing abundance of different barcodes") + 787 | theme(axis.text.x = element_text(angle=45, hjust=1)) 788 | } 789 | } 790 | ``` 791 | 792 | `r if (barcodes > 0) {"The histogram above shows the abundance of different barcodes within the sequence collection. The size of the bar corresponds to the frequency of the observation - this is the number of sequence reads observed."}` 793 | 794 | 795 | 796 | 797 | \pagebreak 798 | 799 | 800 | 801 | # Glossary of terms 802 | 803 | * __knit__ is the command to render a Rmarkdown file. 
The knitr package is used to embed code, the results of R analyses, and their figures within the typeset text of the document. 804 | 805 | * __L50__ describes the number of sequences (or contigs) that are longer than, or equal to, the N50 length and therefore include half the bases of the assembly 806 | 807 | * __N50__ describes the length (read length, contig length, etc.) where half the bases of the sequence collection are contained within reads/contigs of this length or longer 808 | 809 | * __Rmarkdown__ is an extension to markdown. Functional R code can be embedded in a plain-text document and subsequently rendered to other formats including the PDF format of this report. 810 | 811 | * __QV__ is the Phred-scaled quality value, -10 log10(p), where p is the probability that a given base call is incorrect. QV may be reported at the individual base level, or averaged across whole sequences 812 | 813 | * __sequencing_summary.txt__ is a summary file describing sequence characteristics following base calling with the Guppy software. 814 | 815 | 816 | 817 | # Reproducible research - produce your own report 818 | 819 | This report has been created using **`Rmarkdown`**, publicly available **`R`** packages, and the \LaTeX document typesetting software for reproducibility. For clarity, the **`R`** packages used, and their versions, are listed below.
820 | 821 | \fontsize{8}{12} 822 | 823 | ```{r sessionInfo, eval=TRUE, echo=FALSE, comment=NA} 824 | options(width = 100) 825 | utils:::print.sessionInfo(sessionInfo()[-7], locale=FALSE) 826 | ``` 827 | 828 | \fontsize{10}{14} 829 | 830 | 831 | `r slurpContent("Static/TutorialPostamble.md")` 832 | 833 | 834 | 835 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![.](Static/Images/ONT_logo_deprecated.png "This Oxford Nanopore Technologies repo is deprecated") 2 | 3 | ****************** 4 | The BasicQC tutorial functionality has been moved to our new EPI2ME Labs product. Please see the EPI2ME Labs documentation at [https://labs.epi2me.io] and have a look at the product's GitHub pages at [https://github.com/epi2me-labs]. 5 | 6 | The EPI2ME Labs product provides a collection of tutorials and best-practice guidelines for processing Nanopore sequence data. The product is provided in a maintained Docker container and interactive tutorials are provided through Jupyter notebooks. The Jupyter experience has been customised and provides exciting new material through interactive menus, genome browsers and more. 7 | 8 | This repository is now unsupported and we do not recommend its use. Please contact Oxford Nanopore: support@nanoporetech.com for help with your application if it is not possible to upgrade to our new resources, or we are missing key features. 9 | ****************** 10 | 11 | # 1. Introduction: 12 | 13 | 14 | ### Overview: 15 | 16 | The **Summary Statistics and QC tutorial** is intended as a functional guide to help assess the quality characteristics of a single Nanopore sequence run. This tutorial aims to enable an objective assessment of the performance of a Nanopore flowcell run and to benchmark the characteristics of the sequences produced.
17 | 18 | ### Features: 19 | 20 | Sufficient information is provided in the tutorial such that the workflow can be tested, validated, and replicated. The tutorial is provided with an example dataset from a barcoded sequence library. The tutorial is intended to address important questions: 21 | 22 | * how many reads (and how many gigabases) were sequenced? 23 | * what fraction of my sequence collection is good quality? 24 | * how are longer sequence reads represented in my sample? 25 | * how uniform is the representation of different barcodes? 26 | 27 | ****************** 28 | 29 | # 2. Getting Started: 30 | 31 | 32 | ### Input and Output: 33 | 34 | This tutorial uses the R markdown contained within this GitHub repository, a **`sequence_summary.txt`** file from the Guppy base-calling software, and optionally a **`barcoding_summary.txt`** file from Guppy barcoding as input. Example summary files are included within the repository. The result of the tutorial will be a report document in **`html`** format. This workflow can also process the `sequence_summary.txt` file prepared by the **`albacore`** base-calling software. 35 | 36 | ### Dependencies: 37 | 38 | This tutorial requires a computer running Linux (Centos7, Ubuntu 18_10, Fedora 29) - 8 GB of memory is recommended. The tutorial has been tested on minimal server installs of these operating systems. 39 | 40 | Other dependencies include: 41 | 42 | * **`Conda`** orchestrates and manages the installation of the other required software 43 | * **`R`** is statistical analysis software and is used for the analysis and reporting of the sequence summary data 44 | * **`Rstudio`** is a graphical user interface to **`R`** and provides much of the required reporting framework 45 | * **`git`** is used to download the tutorial from the GitHub repository. 46 | * **`git-lfs`** is required to download the sequence and metadata files provided with the tutorial.
47 | 48 | 49 | 50 | ### Installation: 51 | 52 | 1. Most software dependencies are managed through **`conda`**; install it as described at
[https://conda.io/docs/install/quick.html](https://conda.io/docs/install/quick.html). 53 | ``` 54 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 55 | bash Miniconda3-latest-Linux-x86_64.sh 56 | bash 57 | ``` 58 | 2. Download the Nanopore QC tutorial and example files into a folder named `QCTutorial`. This tutorial requires the **`git-lfs`** large file support capabilities; this should be installed first through **`conda`** 59 | ``` 60 | conda install -c conda-forge git-lfs 61 | git lfs install 62 | git clone https://github.com/nanoporetech/ont_tutorial_basicqc.git QCTutorial 63 | ``` 64 | 3. Change working directory into the new `QCTutorial` folder 65 | ``` 66 | cd QCTutorial 67 | ``` 68 | 4. Install conda software dependencies with 69 | ``` 70 | conda env create --name BasicQC --file environment.yaml 71 | ``` 72 | 5. Initialise the conda environment with 73 | ``` 74 | source activate BasicQC 75 | ``` 76 | 77 | 78 | #### Compilation From Source 79 | 80 | This tutorial does not contain software that requires compilation. 81 | 82 | 83 | 84 | ### Usage: 85 | 86 | In your Conda environment, and in the tutorial working directory, 87 | 88 | 1. *Optional:* edit the provided **`config.yaml`** file to match your own study design 89 | 2. Render the tutorial report using the command 90 | ``` 91 | R --slave -e 'rmarkdown::render("Nanopore_SumStatQC_Tutorial.Rmd", "html_document")' 92 | ``` 93 | 94 | The provided Rmarkdown tutorial script can also be opened directly in Rstudio 95 | 96 | ``` 97 | rstudio Nanopore_SumStatQC_Tutorial.Rmd 98 | ``` 99 | 100 | The report can then be prepared using **knit** from the GUI, as shown in the figure below 101 | 102 | ![.](Static/Images/KnitIt.png "Prepare a report using Knit") 103 | 104 | 105 | ****************** 106 | 107 | # 3. Results 108 | 109 | This tutorial workflow will produce a rich description of your sequence characteristics as observed from the starting **`sequencing_summary.txt`** file.
Please visit the tutorial page at [https://community.nanoporetech.com/knowledge/bioinformatics](https://community.nanoporetech.com/knowledge/bioinformatics) for more information. 110 | 111 | ****************** 112 | 113 | # 4. Help: 114 | 115 | ### Licence and Copyright: 116 | 117 | © 2019 Oxford Nanopore Technologies Ltd. 118 | 119 | Bioinformatics-Tutorials are distributed by Oxford Nanopore Technologies under the terms of the MPL-2.0 license. 120 | 121 | ### FAQs: 122 | 123 | 124 | 125 | ### Abbreviations: 126 | 127 | 128 | * __knit__ is the command to render an Rmarkdown file. The knitr package is used to embed code, the results of R analyses and their figures within the typeset text from the document. 129 | 130 | * __L50__ the number of sequences (or contigs etc.) that are longer than, or equal to, the N50 length and that therefore include half the bases of the assembly 131 | 132 | * __N50__ the length such that sequences (or contigs etc.) of this length or longer include half the bases of the sequence collection 133 | 134 | * __Rmarkdown__ is an extension to markdown. Functional R code can be embedded in a plain-text document and subsequently rendered to other formats, including the HTML format of this report. 135 | 136 | * __QV__ the quality value, -10log10(p), where p is the probability that any given base call is incorrect. QV may be reported at the individual base level, or averaged across whole sequences 137 | 138 | * __sequencing_summary.txt__ a summary file describing sequence characteristics following base-calling with the Guppy / Albacore software.
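The __N50__ and __L50__ definitions above can be illustrated with standard command-line tools. A minimal sketch using five toy read lengths (the values are purely illustrative; a real summary file would supply the lengths):

```
# Sort five toy read lengths (total 5200 bases) in descending order, then walk
# the sorted list until the cumulative length reaches half of the total
printf '%s\n' 400 1000 200 3000 600 | sort -rn | awk '
  { len[NR] = $1; total += $1 }
  END {
    cum = 0
    for (i = 1; i <= NR; i++) {
      cum += len[i]
      if (cum >= total / 2) { print "N50=" len[i], "L50=" i; exit }
    }
  }'
# prints: N50=3000 L50=1
```

Here the single longest read already contains more than half of the 5200 bases, so N50 is 3000 and L50 is 1.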
139 | 140 | 141 | ### References and Supporting Information: 142 | 143 | * https://community.nanoporetech.com/knowledge/bioinformatics 144 | * https://www.r-project.org/ 145 | * https://snakemake.readthedocs.io/en/stable/ 146 | * https://bioconda.github.io/ 147 | 148 | -------------------------------------------------------------------------------- /RawData/.gitattributes: -------------------------------------------------------------------------------- 1 | *.bz2 filter=lfs diff=lfs merge=lfs -text 2 | -------------------------------------------------------------------------------- /RawData/lambda_barcoding_summary.txt.bz2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:cc3e19aceecf50f4e971f6d5102eed29a94c51b1b2abf371af5a5beceeafbf22 3 | size 43359311 4 | -------------------------------------------------------------------------------- /RawData/lambda_sequencing_summary.txt.bz2: -------------------------------------------------------------------------------- 1 | version https://git-lfs.github.com/spec/v1 2 | oid sha256:4373b735ab6074215adb03502281656f378aa9102fbfd84c193573855b964c5f 3 | size 58994233 4 | -------------------------------------------------------------------------------- /Static/Bibliography.bib: -------------------------------------------------------------------------------- 1 | @article{minimap22018, 2 | author = {Li, Heng}, 3 | title = {Minimap2: pairwise alignment for nucleotide sequences}, 4 | journal = {Bioinformatics}, 5 | volume = {34}, 6 | number = {18}, 7 | pages = {3094-3100}, 8 | year = {2018}, 9 | doi = {10.1093/bioinformatics/bty191}, 10 | URL = {http://dx.doi.org/10.1093/bioinformatics/bty191}, 11 | eprint = {/oup/backfile/content_public/journal/bioinformatics/34/18/10.1093_bioinformatics_bty191/1/bty191.pdf} 12 | } 13 | 14 | @article{samtools2009, 15 | author = {Li, Heng and Handsaker, Bob and Wysoker, Alec and Fennell, Tim and Ruan, Jue and 
Homer, Nils and Marth, Gabor and Abecasis, Goncalo and Durbin, Richard and 1000 Genome Project Data Processing Subgroup}, 16 | title = {The Sequence Alignment/Map format and SAMtools}, 17 | journal = {Bioinformatics}, 18 | volume = {25}, 19 | number = {16}, 20 | pages = {2078-2079}, 21 | year = {2009}, 22 | doi = {10.1093/bioinformatics/btp352}, 23 | URL = {http://dx.doi.org/10.1093/bioinformatics/btp352}, 24 | eprint = {/oup/backfile/content_public/journal/bioinformatics/25/16/10.1093/bioinformatics/btp352/2/btp352.pdf} 25 | } 26 | 27 | @article{snakemake2012, 28 | author = {Köster, Johannes and Rahmann, Sven}, 29 | title = {Snakemake—a scalable bioinformatics workflow engine}, 30 | journal = {Bioinformatics}, 31 | volume = {28}, 32 | number = {19}, 33 | pages = {2520-2522}, 34 | year = {2012}, 35 | doi = {10.1093/bioinformatics/bts480}, 36 | URL = {http://dx.doi.org/10.1093/bioinformatics/bts480}, 37 | eprint = {/oup/backfile/content_public/journal/bioinformatics/28/19/10.1093/bioinformatics/bts480/2/bts480.pdf} 38 | } 39 | 40 | @article{BH1995, 41 | ISSN = {00359246}, 42 | URL = {http://www.jstor.org/stable/2346101}, 43 | abstract = {The common approach to the multiplicity problem calls for controlling the familywise error rate (FWER). This approach, though, has faults, and we point out a few. A different approach to problems of multiple significance testing is presented. It calls for controlling the expected proportion of falsely rejected hypotheses-the false discovery rate. This error rate is equivalent to the FWER when all hypotheses are true but is smaller otherwise. Therefore, in problems where the control of the false discovery rate rather than that of the FWER is desired, there is potential for a gain in power. A simple sequential Bonferroni-type procedure is proved to control the false discovery rate for independent test statistics, and a simulation study shows that the gain in power is substantial. 
The use of the new procedure and the appropriateness of the criterion are illustrated with examples.}, 44 | author = {Yoav Benjamini and Yosef Hochberg}, 45 | journal = {Journal of the Royal Statistical Society. Series B (Methodological)}, 46 | number = {1}, 47 | pages = {289--300}, 48 | publisher = {[Royal Statistical Society, Wiley]}, 49 | title = {Controlling the False Discovery Rate: A Practical and Powerful Approach to Multiple Testing}, 50 | volume = {57}, 51 | year = {1995} 52 | } 53 | 54 | 55 | 56 | @Manual{R-apeglm, 57 | title = {apeglm: Approximate posterior estimation for GLM coefficients}, 58 | author = {Anqi Zhu and Joseph G. Ibrahim and Michael I. Love}, 59 | year = {2018}, 60 | note = {R package version 1.4.0}, 61 | } 62 | @Manual{R-base, 63 | title = {R: A Language and Environment for Statistical Computing}, 64 | author = {{R Core Team}}, 65 | organization = {R Foundation for Statistical Computing}, 66 | address = {Vienna, Austria}, 67 | year = {2018}, 68 | url = {https://www.R-project.org/}, 69 | } 70 | @Manual{R-DESeq2, 71 | title = {DESeq2: Differential gene expression analysis based on the negative 72 | binomial distribution}, 73 | author = {Michael Love and Simon Anders and Wolfgang Huber}, 74 | year = {2018}, 75 | note = {R package version 1.22.0}, 76 | url = {https://github.com/mikelove/DESeq2}, 77 | } 78 | @Manual{R-pcaMethods, 79 | title = {pcaMethods: A collection of PCA methods}, 80 | author = {Wolfram Stacklies and Henning Redestig and Kevin Wright}, 81 | year = {2018}, 82 | note = {R package version 1.74.0}, 83 | url = {https://github.com/hredestig/pcamethods}, 84 | } 85 | @Manual{R-rmarkdown, 86 | title = {rmarkdown: Dynamic Documents for R}, 87 | author = {JJ Allaire and Yihui Xie and Jonathan McPherson and Javier Luraschi and Kevin Ushey and Aron Atkins and Hadley Wickham and Joe Cheng and Winston Chang}, 88 | year = {2018}, 89 | note = {R package version 1.10}, 90 | url = {https://CRAN.R-project.org/package=rmarkdown}, 91 | } 92 | 
@Manual{R-Rsubread, 93 | title = {Rsubread: Subread sequence alignment and counting for R}, 94 | author = {Wei Shi and Yang Liao with contributions from Gordon K Smyth and Jenny Dai and Timothy {Triche, Jr.}}, 95 | year = {2018}, 96 | note = {R package version 1.32.0}, 97 | url = {http://bioconductor.org/packages/release/bioc/html/Rsubread.html}, 98 | } 99 | @Manual{R-ShortRead, 100 | title = {ShortRead: FASTQ input and manipulation}, 101 | author = {Martin Morgan and Michael Lawrence and Simon Anders}, 102 | year = {2018}, 103 | note = {R package version 1.40.0}, 104 | } 105 | 106 | -------------------------------------------------------------------------------- /Static/Images/KnitIt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_tutorial_basicqc/3e736bfafc5b6584161938dce3b412f3a1f608a8/Static/Images/KnitIt.png -------------------------------------------------------------------------------- /Static/Images/ONT_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_tutorial_basicqc/3e736bfafc5b6584161938dce3b412f3a1f608a8/Static/Images/ONT_logo.png -------------------------------------------------------------------------------- /Static/Images/ONT_logo_deprecated.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_tutorial_basicqc/3e736bfafc5b6584161938dce3b412f3a1f608a8/Static/Images/ONT_logo_deprecated.png -------------------------------------------------------------------------------- /Static/Images/sumstatEditParams.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nanoporetech/ont_tutorial_basicqc/3e736bfafc5b6584161938dce3b412f3a1f608a8/Static/Images/sumstatEditParams.png 
-------------------------------------------------------------------------------- /Static/TutorialPostamble.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Customise the tutorial and explore your own data 4 | 5 | Final thoughts: behind this **`Rmarkdown`** file is a modest amount of **`R`** code. Please explore the **`Rmarkdown`** template, modify it, and run it with your own samples. 6 | 7 | The **`Nanopore_SumStatQC_Tutorial.Rmd`** script contains the R code to perform the QC analysis. This Rmarkdown script will first import the **`config.yaml`** configuration file to load the appropriate sequence summary file. Please edit the **`config.yaml`** file to point to your own summary file. As a recommended best practice, place the summary file into the **`RawData`** folder within your project directory. The summary file may be compressed with either **`gzip`** (`.gz`) or **`bzip2`** (`.bz2`). 8 | 9 | The **`config.yaml`** and **`Nanopore_SumStatQC_Tutorial.Rmd`** files can both be edited directly in Rstudio. The code below shows the command to open the configuration file with the **`Rstudio`** software. 10 | 11 | ``` 12 | rstudio config.yaml 13 | ``` 14 | 15 | The figure below shows a screenshot of the **`config.yaml`** file. Change the values in the fields **`inputFile`**, **`barcodeFile`**, and **`flowcellId`** to reflect the details of your own sequencing run. **`inputFile`** should correspond to the `sequencing_summary.txt` file from the basecaller. **`barcodeFile`** should contain the barcode assignments from the demultiplexing step - if you do not have a `barcodeFile` (your library was not multiplexed, or demultiplexing was performed using e.g. Albacore) then please leave this field blank. 16 | 17 | Change the value of **`tutorialText`** to `FALSE` to hide the text relating to the installation, configuration and customisation of the tutorial.
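The compression step mentioned above can be sketched as follows (the file name is illustrative; substitute your own summary file):

```
# Create a tiny stand-in summary file, gzip-compress it, and confirm that the
# compressed copy can still be inspected without decompressing to disk
printf 'read_id\tsequence_length_template\nr1\t1000\n' > sequencing_summary.txt
gzip -f sequencing_summary.txt                 # writes sequencing_summary.txt.gz
gzip -dc sequencing_summary.txt.gz | head -n 1 # prints the header line
```

Point `inputFile` in your `config.yaml` at the resulting `.gz` (or `.bz2`) file.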
18 | 19 | If base-calling was performed in MinKNOW and you have hundreds of numbered sequencing_summary files, copy all of the files into your **`RawData`** folder and concatenate them into a single file with the command 20 | 21 | ``` 22 | cat sequencing_summary_*.txt > merged_sequence_summary.txt 23 | ``` 24 | 25 | Use (for example) this `merged_sequence_summary.txt` filename for `inputFile` in your `config.yaml`. 26 | 27 | ![](Static/Images/sumstatEditParams.png) 28 | 29 | 30 | To extract the whole set of **`R`** code from the **`Rmarkdown`**, use the **`purl`** command - this will write the R code into its own file. 31 | 32 | ``` 33 | knitr::purl("Nanopore_SumStatQC_Tutorial.Rmd", quiet=TRUE) 34 | ``` 35 | 36 | 37 | \pagebreak 38 | 39 | 40 | 41 | # References and citations 42 | 43 | -------------------------------------------------------------------------------- /Static/TutorialPreamble.md: -------------------------------------------------------------------------------- 1 | # Tutorial objectives 2 | 3 | The **Summary statistics and QC tutorial** is intended as a functional guide to help assess the quality characteristics of a single Nanopore sequencing run. This tutorial aims to enable an objective assessment of the performance of a Nanopore flowcell run and to benchmark the quality characteristics of the resulting sequences. 4 | 5 | Sufficient information is provided in the tutorial such that the workflow can be tested, validated and replicated. The tutorial is provided with an example dataset from a barcoded sequence library. The tutorial is intended to address important questions: 6 | 7 | * how many reads (and how many gigabases) were sequenced? 8 | * what fraction of my sequence collection is good quality? 9 | * how are longer sequence reads represented in my sample? 10 | * how uniform is the representation of different barcodes?
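The first question above reduces to a simple aggregation over the summary table. A toy sketch (the column layout here is illustrative - real Guppy summaries contain many more columns, and the position of the length column can vary between versions):

```
# Build a three-read toy summary (tab-separated), then count reads and bases;
# the read length is assumed to be in column 2 of this toy file
printf 'read_id\tsequence_length_template\tmean_qscore_template\n' >  mini_summary.txt
printf 'r1\t1000\t9.5\nr2\t2500\t11.0\nr3\t500\t6.8\n'             >> mini_summary.txt

awk 'NR > 1 { n++; bases += $2 } END { print n " reads, " bases " bases" }' mini_summary.txt
# prints: 3 reads, 4000 bases
```

The tutorial report answers this (and the remaining questions) for the full summary file, with supporting plots.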
11 | 12 | **Methods used** in this tutorial include 13 | 14 | * **`R`** for statistical analysis and reporting 15 | * **`sequencing_summary.txt`** as the data source for parsing 16 | 17 | **Computational requirements** for this tutorial include 18 | 19 | * Computer running Linux (CentOS 7, Ubuntu 18.10, Fedora 29) 20 | * At least 8 GB RAM 21 | * Runtime with provided example data - approximately 10 minutes 22 | 23 | 24 | \pagebreak 25 | 26 | # Quick start 27 | 28 | 1. Most software dependencies are managed through **`conda`**; install it as described at
[https://conda.io/docs/install/quick.html](https://conda.io/docs/install/quick.html). Accept the license agreement during installation; we recommend allowing the Conda installer to prepend its path to your `.bashrc` file when asked. 29 | ``` 30 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 31 | bash Miniconda3-latest-Linux-x86_64.sh 32 | bash 33 | ``` 34 | 2. Download the Nanopore QC tutorial and example files into a folder named `QCTutorial`. This tutorial requires the **`git-lfs`** large file support capabilities; this should be installed first through **`conda`** 35 | ``` 36 | conda install -c conda-forge git-lfs 37 | git lfs install 38 | git clone https://github.com/nanoporetech/ont_tutorial_basicqc.git QCTutorial 39 | ``` 40 | 3. Change working directory into the new `QCTutorial` folder 41 | ``` 42 | cd QCTutorial 43 | ``` 44 | 4. Install conda software dependencies into a discrete conda environment 45 | ``` 46 | conda env create --name BasicQC --file environment.yaml 47 | ``` 48 | 5. Initialise the new conda environment with 49 | ``` 50 | source activate BasicQC 51 | ``` 52 | 6. *Optional:* edit the provided **`config.yaml`** file to match your own study design 53 | 7. Render the report using results from the analysis above 54 | ``` 55 | R --slave -e 'rmarkdown::render("Nanopore_SumStatQC_Tutorial.Rmd", "html_document")' 56 | ``` 57 | 58 | This workflow will create an HTML file in the working directory called **`Nanopore_SumStatQC_Tutorial.html`**. Open this file in your preferred web browser to view the summary statistics and a graphical review of your sequence collection. 59 | 60 | 61 | \pagebreak 62 | 63 | 64 | 65 | 66 | # Introduction 67 | 68 | This tutorial aims to summarise the data characteristics from an Oxford Nanopore Technologies sequencing run. Observations from basecalled reads and their quality characteristics, temporal performance, and barcoded content are presented.
The information presented is derived solely from the **`sequencing_summary.txt`** file produced during basecalling with the Guppy software. A compatible file will also be generated by MinKNOW in upcoming releases. 69 | 70 | This tutorial document has been produced from an **`Rmarkdown`** template. This template is intended as the starting point for your exploration and assessment of Oxford Nanopore DNA sequence data. The goals of this tutorial include 71 | 72 | 1. To introduce a literate framework for analysing base-calling summary statistics to evaluate the relative performance of runs 73 | 1. To provide basic QC metrics so that a review and consideration of experimental data can be undertaken 74 | 1. To provide training as to which QC metrics are of most interest and to encourage an understanding of how different aspects of sequence data quality can be attributed to sample characteristics from DNA isolation and library preparation. 75 | 76 | Several of the plots included in this report have been replicated from publicly available projects such as POREquality [^1], minion_qc [^2], and pycoQC [^3]. 77 | 78 | [^1]: [carsweshau/POREquality](https://github.com/carsweshau/POREquality) 79 | [^2]: [roblanf/minion_qc](https://github.com/roblanf/minion_qc) 80 | [^3]: [a-slide/pycoQC](https://github.com/a-slide/pycoQC) 81 | 82 | The **`sequencing_summary.txt`** file is automatically produced during base-calling with the Guppy software. This summary file contains rich metadata for each sequence read produced during a run. These data include timestamp, pore duration, read quality, and channel information, in addition to the characteristics of the resulting DNA sequence. This tutorial uses this summary file for performance reasons. 83 | 84 | Tools such as wub [^4] utilise the **`fastq`** files for quality metrics, and other tools make extensive use of the **`fast5`** files.
Parsing the **`fast5`** files provides additional analytical context but is much more demanding in terms of compute resources and time. This tutorial is lightweight and is intended to run within a few minutes on a desktop computer. 85 | 86 | [^4]: [nanoporetech/wub](https://github.com/nanoporetech/wub) 87 | 88 | 89 | # Set up a computational environment for your QC analysis 90 | 91 | This tutorial is intended to be simple to install, run and customise. The analysis is performed using the **`R`** statistical software and further functionality is provided by a number of **`R packages`** and the **`RStudio`** graphical user interface. The following steps describe a simple approach to installing the tutorial and its dependencies. 92 | 93 | ## Conda package management software 94 | 95 | **`Conda`** provides simple software package management. A wide variety of bioinformatics and scientific computing software has been deployed within Conda and it provides a streamlined way to install both software packages and their required dependencies without the requirement for administrative rights. These installation instructions assume that you are using the **`BASH`** shell interface to your computer. 96 | 97 | Install the Conda software on your computer using the instructions provided at [https://conda.io/docs/install/quick.html](https://conda.io/docs/install/quick.html) 98 | 99 | A recommended **`Conda`** installation can be performed on Linux with the following commands 100 | 101 | ``` 102 | # download Python3 version of the Miniconda installer 103 | wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh 104 | 105 | bash Miniconda3-latest-Linux-x86_64.sh 106 | 107 | bash 108 | ``` 109 | 110 | The commands here are used to (1) download the Conda installer, (2) install Conda, and (3) reload your bash shell to make the new Conda installation available.
111 | 112 | We recommend that the Conda installer be allowed to prepend the Conda path to your **`.bashrc`** file. This makes Conda available each time you log in to the system. If Conda does not add this path, you should either edit your own `.bashrc` file or initialise your Conda environment manually with the instructions provided. 113 | 114 | Check that Conda has been installed with the commands 115 | 116 | ``` 117 | echo $PATH 118 | conda --version 119 | ``` 120 | 121 | 122 | ## Download the tutorial files 123 | 124 | The tutorial documents and example data are contained on the [Oxford Nanopore Technologies GitHub site](https://github.com/nanoporetech). This Basic QC tutorial is contained within a project called **`ont_tutorial_basicqc`**. The tutorial download requires the **`large file support (lfs)`** extensions to **`git`** - the summary files from **`Guppy`** base-calling and de-multiplexing that are distributed with the tutorial are relatively large. 125 | 126 | The installation of the **`git-lfs`** extensions and download of the tutorial files and accompanying dataset can be performed with the commands 127 | 128 | ``` 129 | conda install -c conda-forge git-lfs 130 | git lfs install 131 | git clone https://github.com/nanoporetech/ont_tutorial_basicqc.git QCTutorial 132 | 133 | # Change into the created `QCTutorial` directory 134 | cd QCTutorial 135 | 136 | ``` 137 | 138 | This will download a collection of files into a new folder called `QCTutorial`. The files downloaded include 139 | 140 | 1. The Rmarkdown file, **`Nanopore_SumStatQC_Tutorial.Rmd`**, includes this documentation and performs the analysis. 141 | 1. **`RawData/lambda_sequencing_summary.txt.bz2`** is a summary statistic file describing approximately 1 million sequence reads from a barcoded lambda DNA sequencing run. These sequence reads have been base-called using the Guppy software. This file has been bzip2 compressed. 142 | 1.
**`RawData/lambda_barcoding_summary.txt.bz2`** is a barcode summary statistic file describing the barcode assignments for the sequence reads. This barcode assignment has been performed using the Guppy software. 143 | 1. **`Static/`** is a folder containing some of the descriptive images used in the tutorial, text for the tutorial, bibliography and style-sheets. 144 | 1. **`environment.yaml`** is a text file (in yaml format) that describes the software packages and computational environment that will be used to build a working compute environment using Conda. 145 | 1. **`config.yaml`** is a text file (in yaml format) that describes the sequence summary statistic files to be analysed. This is the main file to be edited to describe your own analysis. 146 | 147 | 148 | ## Build a Conda environment 149 | 150 | In the previous section we downloaded the project files. This download includes the file `environment.yaml`, which can be used to initialise a **`Conda`** working environment. To create a Conda environment arbitrarily named `BasicQC`, use the commands 151 | 152 | ``` 153 | conda env create --name BasicQC --file environment.yaml 154 | source activate BasicQC 155 | ``` 156 | 157 | 158 | # Run the analysis 159 | 160 | 161 | Analysis of the sequences specified within the **`Rmarkdown`** file will be performed as part of the **`knit`** process. This will load the summary statistics file, perform the sequence analysis, render figures, and prepare the report. To start the analysis, it is only necessary to click the **`knit`** button in the **`Rstudio`** software - please see figure \ref{fig:KnitIt} below.
The **`Rstudio`** software can be opened with the tutorial markdown document using the command 162 | 163 | ``` 164 | rstudio Nanopore_SumStatQC_Tutorial.Rmd 165 | ``` 166 | 167 | ![](Static/Images/KnitIt.png) 168 | 169 | 170 | It is possible to perform the QC analysis using **`knit`** from the command line; the command below will knit the template document to produce an HTML-format report. 171 | 172 | ``` 173 | R --slave -e 'rmarkdown::render("Nanopore_SumStatQC_Tutorial.Rmd", "html_document")' 174 | ``` 175 | 176 | This command will build the HTML document that contains the analysis and exploration of the sequence collection. The file **`Nanopore_SumStatQC_Tutorial.html`** should be opened in a web browser. 177 | 178 | \pagebreak 179 | -------------------------------------------------------------------------------- /Static/ont_tutorial.css: -------------------------------------------------------------------------------- 1 | /*! 2 | * bootswatch v3.3.6 3 | * Homepage: http://bootswatch.com 4 | * Copyright 2012-2015 Thomas Park 5 | * Licensed under MIT 6 | * Based on Bootstrap 7 | *//*! 8 | * Bootstrap v3.3.6 (http://getbootstrap.com) 9 | * Copyright 2011-2015 Twitter, Inc. 10 | * Licensed under MIT (https://github.com/twbs/bootstrap/blob/master/LICENSE) 11 | *//*!
normalize.css v3.0.3 | MIT License | github.com/necolas/normalize.css */ 12 | 13 | /* @sagrudd brewer.pal(6, "Paired") "#A6CEE3" "#1F78B4" "#B2DF8A" "#33A02C" "#FB9A99" "#E31A1C" */ 14 | 15 | html{font-family:arial;-ms-text-size-adjust:100%;-webkit-text-size-adjust:100%} 16 | body{margin:0; font-family:arial;} 17 | article,aside,details,figcaption,figure,footer,header,hgroup,main,menu,nav,section,summary{display:block} 18 | a{background-color:transparent} 19 | a:active,a:hover{outline:0} 20 | abbr[title]{border-bottom:1px dotted} 21 | b,strong{font-weight:bold} 22 | dfn{font-style:italic} 23 | .title{margin-top: 115px;} 24 | h1{font-size:2em;margin:0.67em 0;color:#1F78B4} 25 | h2{font-size:1.5em;margin:0.67em 0;color:#A6CEE3} 26 | mark{background:#ff0;color:#000} 27 | small{font-size:80%} 28 | sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline} 29 | sup{top:-0.5em} 30 | sub{bottom:-0.25em} 31 | img{border:0} 32 | svg:not(:root){overflow:hidden} 33 | figure{margin:1em 40px} 34 | hr{-webkit-box-sizing:content-box;-moz-box-sizing:content-box;box-sizing:content-box;height:0} 35 | pre{overflow:auto;border:1px solid #999;page-break-inside:avoid} 36 | code,kbd,pre,samp{background-color:gray; color:white; font-family:monospace, monospace;font-size:1em} 37 | button,input,optgroup,select,textarea{color:inherit;font:inherit;margin:0} 38 | button{overflow:visible} 39 | button,select{text-transform:none} 40 | button,html input[type="button"],input[type="reset"],input[type="submit"]{-webkit-appearance:button;cursor:pointer} 41 | button[disabled],html input[disabled]{cursor:default} 42 | button::-moz-focus-inner,input::-moz-focus-inner{border:0;padding:0} 43 | fieldset{border:1px solid #c0c0c0;margin:0 2px;padding:0.35em 0.625em 0.75em} 44 | legend{border:0;padding:0} 45 | textarea{overflow:auto} 46 | optgroup{font-weight:bold} 47 | table{border-collapse:collapse;border-spacing:0} 48 | td,th{padding:0}/*!
Source: https://github.com/h5bp/html5-boilerplate/blob/master/src/css/main.css */ 49 | 50 | -------------------------------------------------------------------------------- /config.yaml: -------------------------------------------------------------------------------- 1 | 2 | inputFile: "RawData/lambda_sequencing_summary.txt.bz2" 3 | barcodeFile: "RawData/lambda_barcoding_summary.txt.bz2" 4 | basecaller: "Guppy 2.1.3" 5 | flowcellId: "FAK41706" 6 | tutorialText: FALSE 7 | 8 | # **inputFile** - this should be the sequencing_summary.txt from Guppy etc 9 | # move your own sequencing_summary.txt file (or concatenation thereof) to the 10 | # RawData folder to run analysis on your own sequence collection 11 | 12 | # **barcodeFile** - if Guppy_barcoder has been used to demultiplex the library, 13 | # move the barcoding_summary.txt file to the RawData folder and update this variable 14 | 15 | # **basecaller** and **flowcellId** - are used for presentation in the report 16 | # please update these to correspond to your sequence analysis 17 | 18 | # change the **tutorialText** value to FALSE to mask the tutorial instructions -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | channels: 2 | - bioconda 3 | - conda-forge 4 | - r 5 | - sagrudd 6 | dependencies: 7 | - snakemake-minimal =5.2.4 8 | - python =3.6 9 | - r-essentials 10 | - r =3.5.1 11 | - r-base 12 | - rstudio 13 | - nanopore-r-basicqc 14 | - r-fastmatch --------------------------------------------------------------------------------