├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE.md ├── Makefile ├── NAMESPACE ├── NEWS.md ├── R ├── filterResults.R ├── plot.R ├── primerTree.R ├── search.R ├── sequence.R └── taxonomy.R ├── README.md ├── TODO.md ├── data ├── bryophytes_trnL.RData └── mammals_16S.RData ├── man ├── accession2taxid.Rd ├── bryophytes_trnL.Rd ├── calc_rank_dist_ave.Rd ├── clustalo.Rd ├── filter_seqs.Rd ├── get_sequence.Rd ├── get_sequences.Rd ├── get_taxonomy.Rd ├── identify.primerTree_plot.Rd ├── layout_tree_ape.Rd ├── mammals_16S.Rd ├── parse_primer_hits.Rd ├── plot.primerTree.Rd ├── plot_tree.Rd ├── plot_tree_ranks.Rd ├── primerTree.Rd ├── primer_search.Rd ├── search_primer_pair.Rd ├── seq_lengths.Rd ├── seq_lengths.primerTree.Rd ├── summary.primerTree.Rd └── tree_from_alignment.Rd ├── paper.Rmd ├── primerTree.Rproj └── src ├── init.c └── read_dna.c /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^CRAN-RELEASE$ 2 | ^.*\.Rproj$ 3 | ^\.Rproj\.user$ 4 | ^Makefile$ 5 | ^.*docx$ 6 | ^.*.md$ 7 | ^figure$ 8 | ^citations$ 9 | ^.*\.bib$ 10 | ^test\.fa$ 11 | ^.*\.html$ 12 | ^\.travis\.yml$ 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | src/*.o 5 | src/*.so 6 | src/*.dll 7 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r 2 | 3 | language: R 4 | sudo: false 5 | cache: packages 6 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: primerTree 2 | Title: Visually Assessing the Specificity and Informativeness of 3 | Primer Pairs 4 | Version: 1.0.7 5 | Authors@R: 6 | c(person(given = "Jim", 7 | family = "Hester", 8 | role = "aut", 9 | email = "james.hester@gmail.com"), 10 | person(given = "Matt", 11 | family = "Cannon", 12 | role = c("aut", "cre"), 13 | email = "matthewvc1@gmail.com")) 14 | Description: Identifies potential target sequences for a given 15 | set of primers and generates phylogenetic trees annotated with the 16 | taxonomies of the predicted amplification products. 17 | License: GPL-2 18 | Depends: 19 | directlabels, 20 | gridExtra, 21 | R (>= 3.0.0) 22 | Imports: 23 | ape, 24 | foreach, 25 | ggplot2, 26 | grid, 27 | httr, 28 | lubridate, 29 | plyr, 30 | reshape2, 31 | scales, 32 | stringr, 33 | XML 34 | Encoding: UTF-8 35 | LazyData: true 36 | RoxygenNote: 7.2.3 37 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | PrimerTree 2 | Copyright (C) 2013 Jim Hester 3 | 4 | GNU GENERAL PUBLIC LICENSE 5 | Version 2, June 1991 6 | 7 | Copyright (C) 1989, 1991 Free Software Foundation, Inc., [http://fsf.org/] 8 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 9 | Everyone is permitted to copy and distribute verbatim copies 10 | of this license document, but changing it is not allowed. 11 | 12 | Preamble 13 | 14 | The licenses for most software are designed to take away your 15 | freedom to share and change it. By contrast, the GNU General Public 16 | License is intended to guarantee your freedom to share and change free 17 | software--to make sure the software is free for all its users. This 18 | General Public License applies to most of the Free Software 19 | Foundation's software and to any other program whose authors commit to 20 | using it. (Some other Free Software Foundation software is covered by 21 | the GNU Lesser General Public License instead.) You can apply it to 22 | your programs, too. 23 | 24 | When we speak of free software, we are referring to freedom, not 25 | price. Our General Public Licenses are designed to make sure that you 26 | have the freedom to distribute copies of free software (and charge for 27 | this service if you wish), that you receive source code or can get it 28 | if you want it, that you can change the software or use pieces of it 29 | in new free programs; and that you know you can do these things. 30 | 31 | To protect your rights, we need to make restrictions that forbid 32 | anyone to deny you these rights or to ask you to surrender the rights. 33 | These restrictions translate to certain responsibilities for you if you 34 | distribute copies of the software, or if you modify it. 35 | 36 | For example, if you distribute copies of such a program, whether 37 | gratis or for a fee, you must give the recipients all the rights that 38 | you have. You must make sure that they, too, receive or can get the 39 | source code. And you must show them these terms so they know their 40 | rights. 41 | 42 | We protect your rights with two steps: (1) copyright the software, and 43 | (2) offer you this license which gives you legal permission to copy, 44 | distribute and/or modify the software. 45 | 46 | Also, for each author's protection and ours, we want to make certain 47 | that everyone understands that there is no warranty for this free 48 | software. If the software is modified by someone else and passed on, we 49 | want its recipients to know that what they have is not the original, so 50 | that any problems introduced by others will not reflect on the original 51 | authors' reputations. 52 | 53 | Finally, any free program is threatened constantly by software 54 | patents. We wish to avoid the danger that redistributors of a free 55 | program will individually obtain patent licenses, in effect making the 56 | program proprietary. To prevent this, we have made it clear that any 57 | patent must be licensed for everyone's free use or not licensed at all. 58 | 59 | The precise terms and conditions for copying, distribution and 60 | modification follow. 61 | 62 | GNU GENERAL PUBLIC LICENSE 63 | TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION 64 | 65 | 0. This License applies to any program or other work which contains 66 | a notice placed by the copyright holder saying it may be distributed 67 | under the terms of this General Public License. The "Program", below, 68 | refers to any such program or work, and a "work based on the Program" 69 | means either the Program or any derivative work under copyright law: 70 | that is to say, a work containing the Program or a portion of it, 71 | either verbatim or with modifications and/or translated into another 72 | language. (Hereinafter, translation is included without limitation in 73 | the term "modification".) Each licensee is addressed as "you". 74 | 75 | Activities other than copying, distribution and modification are not 76 | covered by this License; they are outside its scope. The act of 77 | running the Program is not restricted, and the output from the Program 78 | is covered only if its contents constitute a work based on the 79 | Program (independent of having been made by running the Program). 80 | Whether that is true depends on what the Program does. 81 | 82 | 1. You may copy and distribute verbatim copies of the Program's 83 | source code as you receive it, in any medium, provided that you 84 | conspicuously and appropriately publish on each copy an appropriate 85 | copyright notice and disclaimer of warranty; keep intact all the 86 | notices that refer to this License and to the absence of any warranty; 87 | and give any other recipients of the Program a copy of this License 88 | along with the Program. 89 | 90 | You may charge a fee for the physical act of transferring a copy, and 91 | you may at your option offer warranty protection in exchange for a fee. 92 | 93 | 2. You may modify your copy or copies of the Program or any portion 94 | of it, thus forming a work based on the Program, and copy and 95 | distribute such modifications or work under the terms of Section 1 96 | above, provided that you also meet all of these conditions: 97 | 98 | a) You must cause the modified files to carry prominent notices 99 | stating that you changed the files and the date of any change. 100 | 101 | b) You must cause any work that you distribute or publish, that in 102 | whole or in part contains or is derived from the Program or any 103 | part thereof, to be licensed as a whole at no charge to all third 104 | parties under the terms of this License. 105 | 106 | c) If the modified program normally reads commands interactively 107 | when run, you must cause it, when started running for such 108 | interactive use in the most ordinary way, to print or display an 109 | announcement including an appropriate copyright notice and a 110 | notice that there is no warranty (or else, saying that you provide 111 | a warranty) and that users may redistribute the program under 112 | these conditions, and telling the user how to view a copy of this 113 | License. (Exception: if the Program itself is interactive but 114 | does not normally print such an announcement, your work based on 115 | the Program is not required to print an announcement.) 116 | 117 | These requirements apply to the modified work as a whole. If 118 | identifiable sections of that work are not derived from the Program, 119 | and can be reasonably considered independent and separate works in 120 | themselves, then this License, and its terms, do not apply to those 121 | sections when you distribute them as separate works. But when you 122 | distribute the same sections as part of a whole which is a work based 123 | on the Program, the distribution of the whole must be on the terms of 124 | this License, whose permissions for other licensees extend to the 125 | entire whole, and thus to each and every part regardless of who wrote it. 126 | 127 | Thus, it is not the intent of this section to claim rights or contest 128 | your rights to work written entirely by you; rather, the intent is to 129 | exercise the right to control the distribution of derivative or 130 | collective works based on the Program. 131 | 132 | In addition, mere aggregation of another work not based on the Program 133 | with the Program (or with a work based on the Program) on a volume of 134 | a storage or distribution medium does not bring the other work under 135 | the scope of this License. 136 | 137 | 3. You may copy and distribute the Program (or a work based on it, 138 | under Section 2) in object code or executable form under the terms of 139 | Sections 1 and 2 above provided that you also do one of the following: 140 | 141 | a) Accompany it with the complete corresponding machine-readable 142 | source code, which must be distributed under the terms of Sections 143 | 1 and 2 above on a medium customarily used for software interchange; or, 144 | 145 | b) Accompany it with a written offer, valid for at least three 146 | years, to give any third party, for a charge no more than your 147 | cost of physically performing source distribution, a complete 148 | machine-readable copy of the corresponding source code, to be 149 | distributed under the terms of Sections 1 and 2 above on a medium 150 | customarily used for software interchange; or, 151 | 152 | c) Accompany it with the information you received as to the offer 153 | to distribute corresponding source code. (This alternative is 154 | allowed only for noncommercial distribution and only if you 155 | received the program in object code or executable form with such 156 | an offer, in accord with Subsection b above.) 157 | 158 | The source code for a work means the preferred form of the work for 159 | making modifications to it. For an executable work, complete source 160 | code means all the source code for all modules it contains, plus any 161 | associated interface definition files, plus the scripts used to 162 | control compilation and installation of the executable. However, as a 163 | special exception, the source code distributed need not include 164 | anything that is normally distributed (in either source or binary 165 | form) with the major components (compiler, kernel, and so on) of the 166 | operating system on which the executable runs, unless that component 167 | itself accompanies the executable. 168 | 169 | If distribution of executable or object code is made by offering 170 | access to copy from a designated place, then offering equivalent 171 | access to copy the source code from the same place counts as 172 | distribution of the source code, even though third parties are not 173 | compelled to copy the source along with the object code. 174 | 175 | 4. You may not copy, modify, sublicense, or distribute the Program 176 | except as expressly provided under this License. Any attempt 177 | otherwise to copy, modify, sublicense or distribute the Program is 178 | void, and will automatically terminate your rights under this License. 179 | However, parties who have received copies, or rights, from you under 180 | this License will not have their licenses terminated so long as such 181 | parties remain in full compliance. 182 | 183 | 5. You are not required to accept this License, since you have not 184 | signed it. However, nothing else grants you permission to modify or 185 | distribute the Program or its derivative works. These actions are 186 | prohibited by law if you do not accept this License. Therefore, by 187 | modifying or distributing the Program (or any work based on the 188 | Program), you indicate your acceptance of this License to do so, and 189 | all its terms and conditions for copying, distributing or modifying 190 | the Program or works based on it. 191 | 192 | 6. Each time you redistribute the Program (or any work based on the 193 | Program), the recipient automatically receives a license from the 194 | original licensor to copy, distribute or modify the Program subject to 195 | these terms and conditions. You may not impose any further 196 | restrictions on the recipients' exercise of the rights granted herein. 197 | You are not responsible for enforcing compliance by third parties to 198 | this License. 199 | 200 | 7. If, as a consequence of a court judgment or allegation of patent 201 | infringement or for any other reason (not limited to patent issues), 202 | conditions are imposed on you (whether by court order, agreement or 203 | otherwise) that contradict the conditions of this License, they do not 204 | excuse you from the conditions of this License. If you cannot 205 | distribute so as to satisfy simultaneously your obligations under this 206 | License and any other pertinent obligations, then as a consequence you 207 | may not distribute the Program at all. For example, if a patent 208 | license would not permit royalty-free redistribution of the Program by 209 | all those who receive copies directly or indirectly through you, then 210 | the only way you could satisfy both it and this License would be to 211 | refrain entirely from distribution of the Program. 212 | 213 | If any portion of this section is held invalid or unenforceable under 214 | any particular circumstance, the balance of the section is intended to 215 | apply and the section as a whole is intended to apply in other 216 | circumstances. 217 | 218 | It is not the purpose of this section to induce you to infringe any 219 | patents or other property right claims or to contest validity of any 220 | such claims; this section has the sole purpose of protecting the 221 | integrity of the free software distribution system, which is 222 | implemented by public license practices. Many people have made 223 | generous contributions to the wide range of software distributed 224 | through that system in reliance on consistent application of that 225 | system; it is up to the author/donor to decide if he or she is willing 226 | to distribute software through any other system and a licensee cannot 227 | impose that choice. 228 | 229 | This section is intended to make thoroughly clear what is believed to 230 | be a consequence of the rest of this License. 231 | 232 | 8. If the distribution and/or use of the Program is restricted in 233 | certain countries either by patents or by copyrighted interfaces, the 234 | original copyright holder who places the Program under this License 235 | may add an explicit geographical distribution limitation excluding 236 | those countries, so that distribution is permitted only in or among 237 | countries not thus excluded. In such case, this License incorporates 238 | the limitation as if written in the body of this License. 239 | 240 | 9. The Free Software Foundation may publish revised and/or new versions 241 | of the General Public License from time to time. Such new versions will 242 | be similar in spirit to the present version, but may differ in detail to 243 | address new problems or concerns. 244 | 245 | Each version is given a distinguishing version number. If the Program 246 | specifies a version number of this License which applies to it and "any 247 | later version", you have the option of following the terms and conditions 248 | either of that version or of any later version published by the Free 249 | Software Foundation. If the Program does not specify a version number of 250 | this License, you may choose any version ever published by the Free Software 251 | Foundation. 252 | 253 | 10. If you wish to incorporate parts of the Program into other free 254 | programs whose distribution conditions are different, write to the author 255 | to ask for permission. For software which is copyrighted by the Free 256 | Software Foundation, write to the Free Software Foundation; we sometimes 257 | make exceptions for this. Our decision will be guided by the two goals 258 | of preserving the free status of all derivatives of our free software and 259 | of promoting the sharing and reuse of software generally. 260 | 261 | NO WARRANTY 262 | 263 | 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY 264 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN 265 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES 266 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED 267 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 268 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS 269 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE 270 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, 271 | REPAIR OR CORRECTION. 272 | 273 | 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING 274 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR 275 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, 276 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING 277 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED 278 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY 279 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER 280 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE 281 | POSSIBILITY OF SUCH DAMAGES. 282 | 283 | END OF TERMS AND CONDITIONS 284 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | BASE=$(wildcard R/*R) 2 | all: install 3 | 4 | install: R_package 5 | 6 | R_package: $(BASE) 7 | Rscript -e 'library(devtools);install(".")' 8 | touch R_package 9 | 10 | make clean: 11 | rm -f inst/doc/*.html inst/doc/*.md 12 | 13 | #from yihui's knitr Makefile 14 | PKGNAME := $(shell sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION) 15 | PKGVERS := $(shell sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION) 16 | PKGSRC := $(shell basename `pwd`) 17 | 18 | docs: 19 | Rscript -e 'library(devtools);library(methods);library(utils);document(".")' 20 | 21 | build: 22 | cd ..;\ 23 | R CMD build $(PKGSRC) 24 | 25 | check: build 26 | cd ..;\ 27 | R CMD check $(PKGNAME)_$(PKGVERS).tar.gz --as-cran 28 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(filter_seqs,primerTree) 4 | S3method(identify,primerTree_plot) 5 | S3method(plot,primerTree) 6 | S3method(print,primerTree) 7 | S3method(print,primerTree_plot_multi) 8 | S3method(seq_lengths,primerTree) 9 | S3method(summary,primerTree) 10 | export(accession2taxid) 11 | export(calc_rank_dist_ave) 12 | export(clustalo) 13 | export(filter_seqs) 14 | export(get_sequence) 15 | export(get_sequences) 16 | export(get_taxonomy) 17 | export(parse_primer_hits) 18 | export(plot_tree) 19 | export(plot_tree_ranks) 20 | export(primer_search) 21 | export(search_primer_pair) 22 | export(seq_lengths) 23 | export(tree_from_alignment) 24 | import(XML) 25 | import(ape) 26 | import(directlabels) 27 | import(foreach) 28 | import(ggplot2) 29 | import(gridExtra) 30 | import(httr) 31 | import(plyr) 32 | import(stringr) 33 | importFrom(grDevices,dev.cur) 34 | importFrom(grDevices,dev.off) 35 | importFrom(grDevices,dev.set) 36 | importFrom(grDevices,pdf) 37 | importFrom(grid,grid.locator) 38 | importFrom(lubridate,"%--%") 39 | importFrom(lubridate,now) 40 | importFrom(lubridate,seconds) 41 | importFrom(reshape2,melt) 42 | importFrom(scales,expand_range) 43 | importFrom(stats,na.omit) 44 | importFrom(stats,quantile) 45 | importFrom(utils,capture.output) 46 | useDynLib(primerTree,rawStreamToDNAbin) 47 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # primerTree 1.0.7 2 | 3 | * Couple of internal changes to fix CRAN issue 4 | * Also updated documentation for filter_seqs 5 | 6 | # primerTree 1.0.6 7 | 8 | * Changed how matches are handled internally to use accession numbers instead of 9 | gi numbers. 10 | 11 | # primerTree 1.0.5 12 | 13 | * `search_primer_pair()` gains a `api_key` parameter, or you can set `NCBI_API_KEY` as an environment variable (@MVesuviusC, #36) 14 | * Add RCurl to Imports, it is an implicit dependency due to a S4 Class being 15 | included in the data for the package and CRAN is now checking that packages 16 | load without Suggested packages. 17 | 18 | # primerTree 1.0.4 19 | 20 | * Add RCurl to Suggests, it is an implicit dependency due to a S4 Class being 21 | included in the data for the package. 22 | 23 | * Added a `NEWS.md` file to track changes to the package. 24 | -------------------------------------------------------------------------------- /R/filterResults.R: -------------------------------------------------------------------------------- 1 | #PrimerTree 2 | #Copyright (C) 2013 Jim Hester 3 | 4 | 5 | #' Filter out sequences retrieved by search_primer_pair() that are either too 6 | #' short or too long. The alignment and tree will be recalculated after removing 7 | #' unwanted reads. 8 | #' @param x a primerTree object 9 | #' @param min_length the minimum sequence length to keep 10 | #' @param max_length the maximum sequence length to keep 11 | #' @param ... additional arguments passed to methods. 12 | #' @return a primerTree object 13 | #' @examples 14 | #' \dontrun{ 15 | #' # filter out sequences longer or shorter than desired: 16 | #' mammals_16S_filtered <- filter_seqs(mammals_16S, min_length=131, max_length=156) 17 | #' } 18 | #' @export 19 | filter_seqs = function(x, ...) UseMethod("filter_seqs") 20 | 21 | #' @describeIn filter_seqs Method for primerTree objects 22 | #' @export 23 | filter_seqs.primerTree = function(x, min_length = 0, max_length = Inf, ...) { 24 | #if(is.null(x$sequence)) { 25 | # print "No sequences in primerTree object" 26 | #} 27 | 28 | # calculate how many removed and print out to screen 29 | lengths <- vapply(x$sequence, length, integer(1)) 30 | above_max <- lengths >= max_length 31 | below_min <- lengths <= min_length 32 | message(sum(below_min), " sequences below ", min_length) # same for min 33 | message(sum(above_max), " sequences above ", max_length) # same for min 34 | 35 | #filter the sequences 36 | x$sequence <- x$sequence[lengths >= min_length & lengths <= max_length] 37 | 38 | # realign filtered sequences 39 | x$alignment <- clustalo(x$sequence) 40 | 41 | # remake tree 42 | x$tree <- tree_from_alignment(x$alignment) 43 | x 44 | } 45 | 46 | #' Get a summary of sequence lengths from a primerTree object 47 | #' @param x a primerTree object. 48 | #' @param summarize a logical indicating if a summary should be displayed 49 | #' @return a table of sequence length frequencies 50 | #' @examples 51 | #' 52 | #' # Show the counts for each length 53 | #' seq_lengths(mammals_16S) 54 | #' 55 | #' # Plot the distribution of lengths 56 | #' seqLengths <- seq_lengths(mammals_16S) 57 | #' barplot(seqLengths, 58 | #' main = "Frequency of sequence lengths for 16S mammal primers", 59 | #' xlab="Amplicon length (in bp)", 60 | #' ylab=("Frequency")) 61 | #' @export 62 | seq_lengths <- function(x,summarize = TRUE) UseMethod("seq_lengths") 63 | 64 | #' Method for primerTree objects 65 | #' @inheritParams seq_lengths 66 | #' @export 67 | seq_lengths.primerTree <- function(x, summarize = TRUE) { 68 | lengths <- vapply(x$sequence, length, integer(1)) 69 | message("sequence length distribution: ") 70 | table(as.factor(lengths)) 71 | } 72 | -------------------------------------------------------------------------------- /R/plot.R: -------------------------------------------------------------------------------- 1 | #PrimerTree 2 | #Copyright (C) 2013 Jim Hester 3 | 4 | #' plots a tree along with a series of taxonomic ranks 5 | #' @param ranks The ranks to include, defaults to all common ranks, if null 6 | #' print all ranks. 7 | #' @inheritParams plot_tree 8 | #' @seealso \code{\link{plot_tree}} to plot only a single rank or the just the 9 | #' tree layout. 10 | #' @export 11 | #' @examples 12 | #' library(gridExtra) 13 | #' library(directlabels) 14 | #' #plot all the common ranks 15 | #' plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy) 16 | #' #plot specific ranks, with a larger dot size 17 | #' plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy, 18 | #' ranks=c('kingdom', 'class', 'family'), size=3) 19 | 20 | plot_tree_ranks = function(tree, taxonomy, main=NULL, type='unrooted', 21 | ranks=common_ranks, size=2, 22 | guide_size=NULL, legend_cutoff=25, ...){ 23 | if(is.null(ranks)) 24 | ranks = setdiff(names(taxonomy), c('accession', 'taxId')) 25 | 26 | plots = list() 27 | plots$structure = plot_tree(tree, main=main, guide_size=guide_size, type=type, ...) 28 | 29 | for(rank in intersect(ranks, names(taxonomy))){ 30 | if(length(na.omit(taxonomy[rank])) > 0){ 31 | plots[[rank]]= plot_tree(tree, guide_size=guide_size, type=type, rank=rank, taxonomy=taxonomy, size=size, legend_cutoff=legend_cutoff, ...) 32 | } 33 | } 34 | p = do.call(arrangeGrob, plots) 35 | class(p) <- c("primerTree_plot_multi", class(p)) 36 | 37 | p 38 | } 39 | common_ranks = c("kingdom", "phylum", "class", "order", "family", "genus", "species") 40 | 41 | #' plots a tree, optionally with colored and labeled points by taxonomic rank 42 | #' 43 | #' @param tree to be plotted, use layout_tree to layout tree. 44 | #' @param taxonomy A data.frame with an accession field corresponding to the 45 | #' tree tip labels. 46 | #' @param main An optional title for the plot 47 | #' @param type The type of tree to plot, default unrooted. 48 | #' @param rank The rank to include, if null only the tree is plotted 49 | #' @param size The size of the colored points 50 | #' @param guide_size The size of the length guide. If NULL auto detects a 51 | #' reasonable size. 52 | #' @param legend_cutoff The number of different taxa names after which the 53 | #' names are no longer printed. 54 | #' @param ... additional arguments passed to \code{\link{layout_tree_ape}} 55 | #' @return plot to be printed. 56 | #' @export 57 | 58 | plot_tree = function(tree, type='unrooted', main=NULL, guide_size=NULL, 59 | rank=NULL, taxonomy=NULL, size=2, legend_cutoff=25, ...){ 60 | 61 | x = layout_tree_ape(tree, type=type, ...) 62 | 63 | range_x = range(x$edge$x, x$tip$x) 64 | range_y = expand_range(range(x$edqe$y, x$tip$y), mul=.1) 65 | 66 | if(is.null(guide_size)){ 67 | guide_size = 10**(round_any(log10(range_x[2]-range_x[1]), 1)-1) 68 | } 69 | p = ggplot() + 70 | geom_segment(data=x$edge, aes_string(x='x', y='y', xend='xend', yend='yend')) + 71 | theme_noaxis() + 72 | annotate('segment', x=range_x[1], xend=range_x[1]+guide_size, 73 | y=range_y[1], yend=range_y[1], 74 | arrow=arrow(ends="both",angle=90, length=unit(.2,"cm"))) + 75 | annotate('text', x=range_x[1]+(guide_size/2), y=range_y[1], label=guide_size, vjust=-.5) 76 | 77 | if(!is.null(rank)){ 78 | if(is.null(taxonomy)) 79 | stop('Must provide a taxonomy if plotting a rank') 80 | 81 | if(is.null(main)) 82 | main = rank 83 | 84 | x$tip = merge(x$tip, taxonomy, by.x='label', by.y='accession', all.x=T) 85 | 86 | rows = na.omit(x$tip[, c('x','y',rank)]) 87 | p = p+geom_point(data=rows, aes_string(x='x', y='y', color=rank), 88 | size=size, na.rm=T) + theme(legend.position='none') 89 | 90 | if(length(unique(rows[[rank]])) < legend_cutoff){ 91 | smart.grid2 = list('get.means', 'calc.boxes', 'empty.grid') 92 | p = p + geom_dl(data=rows, method=smart.grid2, 93 | aes_string(x='x', y='y', color=rank, label=rank)) 94 | } 95 | } 96 | p = p + ggtitle(main) 97 | class(p) = c(class(p), 'primerTree_plot') 98 | p 99 | } 100 | 101 | #' layout a tree using ape, return an object to be plotted by 102 | #' \code{\link{plot_tree}} 103 | #' @param tree The \code{\link{phylo}} tree to be plotted 104 | #' @param ... additional arguments to \code{\link{plot.phylo}} 105 | #' @return \item{edge}{list of x, y and xend, yend coordinates 106 | #' as well as ids for the edges} 107 | #' \item{tips}{list of x, y, label and id for the tips} 108 | #' \item{nodes}{list of x, y and id for the nodes} 109 | layout_tree_ape = function(tree, ...){ 110 | #hack to write no output 111 | cur_dev = dev.cur() #store previous dev 112 | temp_file = tempfile() 113 | pdf(file=temp_file) 114 | plot.phylo(tree, plot=F, ...) 115 | dev.off() 116 | unlink(temp_file) 117 | dev.set(cur_dev) #restore previous dev 118 | 119 | last = .PlotPhyloEnv$last_plot.phylo 120 | new = list() 121 | new$edge$x = last$xx[last$edge[,1]] 122 | new$edge$xend = last$xx[last$edge[,2]] 123 | 124 | new$edge$y = last$yy[last$edge[,1]] 125 | new$edge$yend = last$yy[last$edge[,2]] 126 | new$edge$id = tree$edge 127 | 128 | new$edge = data.frame(new$edge, stringsAsFactors=F) 129 | 130 | new$tip$x = last$xx[1:last$Ntip] 131 | new$tip$y = last$yy[1:last$Ntip] 132 | new$tip$label = tree$tip.label 133 | new$tip$id = 1:last$Ntip 134 | 135 | new$tip = data.frame(new$tip, stringsAsFactors=F) 136 | 137 | new$node$x = last$xx[(last$Ntip + 1):length(last$xx)] 138 | new$node$y = last$yy[(last$Ntip + 1):length(last$yy)] 139 | new$node$label = tree$node.label 140 | new$node = data.frame(new$node, stringsAsFactors=F) 141 | 142 | new 143 | } 144 | theme_noaxis = function(){ 145 | theme(panel.border=element_blank(), panel.grid=element_blank(), 146 | axis.line=element_blank(), axis.text=element_blank(), 147 | axis.title=element_blank(), axis.ticks=element_blank(), 148 | plot.margin=unit(c(0, 0, -1, -1), 'lines')) 149 | } 150 | 151 | #' @export 152 | print.primerTree_plot_multi <- function(x, ...) { 153 | grid.arrange(x, ...) 154 | } 155 | -------------------------------------------------------------------------------- /R/primerTree.R: -------------------------------------------------------------------------------- 1 | #PrimerTree 2 | #Copyright (C) 2013 Jim Hester 3 | 4 | #' \pkg{primerTree} Visually Assessing the Specificity and Informativeness of Primer Pairs 5 | #' 6 | #' \code{primerTree} has two main commands: 7 | #' \code{\link{search_primer_pair}} which takes a primer pair and returns an 8 | #' primerTree object of the search results 9 | #' \code{\link{plot.primerTree}} a S3 method for plotting the primerTree object 10 | #' obtained using \code{\link{search_primer_pair}} 11 | #' @name primerTree 12 | #' @docType package 13 | #' @aliases primerTree-package 14 | #' @import ggplot2 XML ape httr plyr directlabels gridExtra 15 | #' stringr foreach 16 | #' @importFrom lubridate %--% seconds now 17 | #' @importFrom grid grid.locator 18 | #' @importFrom scales expand_range 19 | #' @importFrom grDevices dev.cur dev.off dev.set pdf 20 | #' @importFrom stats na.omit quantile 21 | #' @importFrom utils capture.output 22 | #' @useDynLib primerTree rawStreamToDNAbin 23 | NULL 24 | 25 | #' PrimerTree results for the mammalian 16S primers 26 | #' @name mammals_16S 27 | #' @docType data 28 | NULL 29 | 30 | #' PrimerTree results for the bryophyte trnL primers 31 | #' @name bryophytes_trnL 32 | #' @docType data 33 | NULL 34 | 35 | #' @export 36 | print.primerTree = function(x, ...){ 37 | cat("Name: ", x$name, "\n", 38 | " Arguments: ", paste(names(x$arguments), x$arguments, sep=":", collapse=' '), "\n") 39 | cat("\nHTTP Response\n") 40 | print(x$response[[1]]) 41 | cat("\nPrimer Products\n") 42 | print(x$sequence) 43 | cat("\nAligned Products\n") 44 | print(x$alignment) 45 | cat("\nPhylogenetic Tree\n") 46 | print(x$tree) 47 | } 48 | #' plot function for a primerTree object, calls plot_tree_ranks 49 | #' @param x primerTree object to plot 50 | #' @param ranks The ranks to include, defaults to all common ranks, if NULL 51 | #' print all ranks. If 'none' just print the layout. 52 | #' @param main an optional title to display, if NULL displays the name as the title 53 | #' @param ... additional arguments passed to plot_tree_ranks 54 | #' @export 55 | #' @seealso \code{\link{plot_tree_ranks}}, \code{\link{plot_tree}} 56 | #' @examples 57 | #' library(gridExtra) 58 | #' library(directlabels) 59 | #' #plot with all common ranks 60 | #' plot(mammals_16S) 61 | #' 62 | #' #plot only the class 63 | #' plot(mammals_16S, 'class') 64 | #' 65 | #' #plot the layout only 66 | #' plot(mammals_16S, 'none') 67 | plot.primerTree = function(x, ranks=NULL, main=NULL, ...){ 68 | if(is.null(ranks)){ 69 | if(is.null(main)) 70 | main = x$name 71 | plot_tree_ranks(x$tree, x$taxonomy, main=main, ...) 72 | } 73 | else if(length(ranks) > 1){ 74 | if(is.null(main)) 75 | main = x$name 76 | plot_tree_ranks(x$tree, x$taxonomy, ranks=ranks, main=main, ...) 77 | } 78 | else { 79 | if(ranks == 'none') { 80 | plot_tree(x$tree, main=main, ...) 81 | } 82 | else{ 83 | plot_tree(x$tree, taxonomy=x$taxonomy, rank=ranks, main=main, ...) 84 | } 85 | } 86 | } 87 | #' Automatic primer searching Search a given primer pair, retrieving the alignment 88 | #' results, their product sequences, the taxonomic information for the sequences, 89 | #' a multiple alignment of the products 90 | #' @param name name to give to the primer pair 91 | #' @param simplify use simple names for primer hit results or complex 92 | #' @param .progress name of the progress bar to use, see 93 | #' \code{\link{create_progress_bar}} 94 | #' @param clustal_options a list of options to pass to clustal omega, see 95 | #' \code{link{clustalo}} for a list of options 96 | #' @param distance_options a list of options to pass to dist.dna, see 97 | #' \code{link{dist.dna}} for a list of options 98 | #' @param api_key NCBI api-key to allow faster sequence retrieval 99 | #' @inheritParams primer_search 100 | #' @return A list with the following elements, 101 | #' \item{name}{name of the primer pair} 102 | #' \item{BLAST_result}{html blast results from Primer-BLAST as 103 | #' 'a \code{\link{response}}} object. 104 | #' \item{taxonomy}{taxonomy for the primer products from NCBI} 105 | #' \item{sequence}{sequence of the primer products} 106 | #' \item{alignment}{multiple alignment of the primer products} 107 | #' \item{tree}{phylogenetic tree of the reconstructed from the 108 | #' 'multiple alignment} 109 | #' @seealso \code{\link{primer_search}}, \code{\link{clustalo}} 110 | #' @export 111 | #' @examples 112 | #' \dontrun{ 113 | #' #simple search 114 | #' mammals_16S = search_primer_pair(name='Mammals 16S', 115 | #' 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT') 116 | #' #returning 1000 alignments, allow up to 3 mismatches in primer 117 | #' mammals_16S = search_primer_pair(name='Mammals 16S', 118 | #' 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', 119 | #' num_aligns=1000, total_primer_specificity_mismatch=3) 120 | #' } 121 | search_primer_pair = function(forward, reverse, name=NULL, num_aligns=500, 122 | num_permutations=25, simplify=TRUE, clustal_options=list(), 123 | distance_options=list(model="N", pairwise.deletion=T), api_key=Sys.getenv("NCBI_API_KEY"), 124 | ..., .parallel=FALSE, .progress='none'){ 125 | 126 | #HACK, primerTree is an environment rather than a list so we can treat it as 127 | #a pointer, I could make it a reference class, but that seems to be overkill 128 | #as I am converting to a list at the end of the function anyway... 129 | 130 | if(missing(forward) || missing(reverse)){ 131 | BLAST_primer() 132 | return() 133 | } 134 | 135 | primer_search = new.env(parent=globalenv()) 136 | #list all primers used to search 137 | primer_search = env2list( 138 | try_default({ 139 | primer_search$name = if(!is.null(name)) name 140 | else name=paste(forward, reverse, sep='_') 141 | 142 | primer_search$arguments = 143 | c(forward=forward, reverse=reverse, name=name, 144 | num_aligns=num_aligns, num_permutations = num_permutations, 145 | simplify=simplify, clustal_options=clustal_options, list(...)) 146 | 147 | primer_search$response = primer_search(forward, reverse, 148 | num_permutations=num_permutations, 149 | .progress=.progress, 150 | .parallel=.parallel, 151 | num_aligns=num_aligns, 152 | ...) 153 | start_time = now() 154 | primer_search$BLAST_result = 155 | filter_duplicates(ldply(primer_search$response, parse_primer_hits, .parallel=.parallel)) 156 | 157 | message(nrow(primer_search$BLAST_result), ' BLAST alignments parsed in ', seconds_elapsed_text(start_time)) 158 | 159 | start_time = now() 160 | primer_search$taxonomy = get_taxonomy(primer_search$BLAST_result$accession) 161 | message('taxonomy retrieved in ', seconds_elapsed_text(start_time)) 162 | 163 | start_time = now() 164 | primer_search$sequence = get_sequences(primer_search$BLAST_result$accession, 165 | primer_search$BLAST_result$product_start, 166 | primer_search$BLAST_result$product_stop, 167 | api_key=api_key, 168 | simplify=simplify, 169 | .parallel=.parallel) 170 | 171 | lengths = laply(primer_search$sequence, length) 172 | message(length(primer_search$sequence), ' sequences retrieved from NCBI', 173 | ' in ', seconds_elapsed_text(start_time), ', product length', 174 | ' min:', min(lengths), 175 | ' mean:', round(mean(lengths),2), 176 | ' max:', max(lengths)) 177 | 178 | start_time = now() 179 | primer_search$alignment = do.call(clustalo, c(list(primer_search$sequence, threads=getDoParWorkers()), clustal_options)) 180 | message(nrow(primer_search$alignment), ' sequences aligned in ', 181 | seconds_elapsed_text(start_time), 182 | ' length:', ncol(primer_search$alignment)) 183 | 184 | start_time = now() 185 | primer_search$distances = do.call(dist.dna, c(list(primer_search$alignment), distance_options)) 186 | message('pairwise DNA distances calculated in ', 187 | seconds_elapsed_text(start_time)) 188 | 189 | start_time = now() 190 | primer_search$tree = tree_from_alignment(primer_search$alignment) 191 | message('constructed neighbor joining tree in ', seconds_elapsed_text(start_time)) 192 | 193 | primer_search 194 | }, default=primer_search) 195 | ) 196 | class(primer_search) = 'primerTree' 197 | primer_search 198 | } 199 | 200 | #given a start time print out the number of seconds which have elapsed 201 | seconds_elapsed_text = function(start_time){ 202 | paste((start_time %--% now()) %/% seconds(1), 'seconds') 203 | } 204 | 205 | #fast way to convert a environment to a list 206 | env2list = function(env){ 207 | names = ls(env) 208 | mget(names, env) 209 | } 210 | #' identify the point closest to the mouse click 211 | #' only works on single ranks 212 | #' @param x the plot to identify 213 | #' @param ... additional arguments passed to annotate 214 | #' @export 215 | identify.primerTree_plot = function(x, ...) { 216 | point <- gglocator(x$layers[[4]]) 217 | distances <- distance(point, x$layers[[4]]$data[,c('x','y')]) 218 | closest <- which(distances == min(distances))[1] 219 | point$label <- x$layers[[4]]$data[closest,deparse(x$layers[[4]]$mapping$colour)] 220 | x + annotate("text", label=point$label, x=point$x, y=point$y, ...) 221 | } 222 | gglocator = function(object) { 223 | loc <- as.numeric(grid.locator("npc")) 224 | 225 | xrng <- with(object, range(data[,deparse(mapping$x)])) 226 | yrng <- with(object, range(data[,deparse(mapping$y)])) 227 | 228 | point <- data.frame(xrng[1] + loc[1]*diff(xrng), yrng[1] + loc[2]*diff(yrng)) 229 | names(point) <- with(object, c(deparse(mapping$x), deparse(mapping$y))) 230 | point 231 | } 232 | 233 | #returns the distance from a point in point to the points in points 234 | distance <- function(point,points){ 235 | sqrt((point$x-points$x)^2 + (point$y-points$y)^2) 236 | } 237 | 238 | #' Summarize a primerTree result, printing quantiles of sequence length and 239 | #' pairwise differences. 240 | 241 | #' @param object the primerTree object to summarise 242 | #' @param ... Ignored options 243 | #' @param probs quantile probabilities to compute, defaults to 0, 5, 50, 95, 244 | #' and 100 probabilities. 245 | #' @param ranks ranks to show unique counts for, defaults to the common ranks 246 | #' @return invisibly returns a list containing the printed results 247 | #' @export 248 | summary.primerTree <- function(object, ..., probs=c(0, .05, .5, .95, 1), ranks = common_ranks) { 249 | 250 | res = list() 251 | res[['lengths']] = t(data.frame('Sequence lengths'=labeled_quantile(laply(object$sequence, length), sprintf('%.0f%%', probs*100), probs=probs), check.names=F)) 252 | print(res[['lengths']]) 253 | 254 | res[['distances']] = t(data.frame('Pairwise differences'=labeled_quantile(object$distances, sprintf('%.0f%%', probs*100), probs=probs), check.names=F)) 255 | cat('\n') 256 | print(res[['distances']]) 257 | 258 | res[['rankDistances']] = calc_rank_dist_ave(object, common_ranks) 259 | print(res[['rankDistances']]) 260 | 261 | res[['ranks']] = laply(object$taxonomy[common_ranks], function(x) length(unique(x))) 262 | cat('\n', 'Unique taxa out of ', length(object$sequence), ' sequences\n', sep='') 263 | names(res[['ranks']]) = ranks 264 | print(res[['ranks']]) 265 | 266 | invisible(res) 267 | } 268 | 269 | labeled_quantile = function(x, labels, ...){ 270 | res = quantile(x, ...) 271 | names(res) = labels 272 | res 273 | } 274 | 275 | #' Summarize pairwise differences. 276 | 277 | #' @param x a primerTree object 278 | #' @param ranks ranks to show unique counts for, defaults to the common ranks 279 | #' @return returns a data frame of results 280 | #' @details 281 | #' The purpose of this function is to calculate the average number 282 | #' of nucleotide differences between species within each taxa of given taxonomic 283 | #' level. 284 | #' 285 | #' For example, at the genus level, the function calculates the average number 286 | #' of nucleotide differences between all species within each genus and reports 287 | #' the mean of those values. 288 | #' 289 | #' There are several key assumptions and calculations made in this 290 | #' function. 291 | #' 292 | #' First, the function randomly selects one sequence from each species 293 | #' in the primerTree results. This is to keep any one species (e.g. 294 | #' human, cow, etc.) with many hits from skewing the results. 295 | #' 296 | #' Second, for each taxonomic level tested, the function divides the 297 | #' sequences by each taxon at that level and calculates the mean 298 | #' number of nucleotide differences within that taxa, then returns the 299 | #' mean of those values. 300 | #' 301 | #' Third, when calculating the average distance, any taxa for which 302 | #' there is only one species is omitted, as the number of nucleotide 303 | #' differences will always be 0. 304 | #' 305 | #' @examples 306 | #' \dontrun{ 307 | #' calc_rank_dist_ave(mammals_16S) 308 | #' 309 | #' calc_rank_dist_ave(bryophytes_trnL) 310 | #' 311 | #' # Note that the differences between the results from these two primers 312 | #' # the mean nucleotide differences is much higher for the mammal primers 313 | #' # than the byrophyte primers. This suggests that the mammal primers have 314 | #' # better resolution to distinguish individual species. 315 | #' } 316 | #' @importFrom reshape2 melt 317 | #' @export 318 | 319 | # using tree data format info from http://www.phytools.org/eqg/Exercise_3.2/ 320 | calc_rank_dist_ave <- function(x, ranks = common_ranks) { 321 | used_ranks <- grep("species", ranks, invert = T, value = T) 322 | rank_dist_mean <- data.frame(matrix(nrow = 1, ncol = 0)) 323 | 324 | # Raw taxonomy data 325 | taxa <- as.data.frame(x$taxonomy) 326 | 327 | # Randomize the order of the taxa data frame 328 | taxa <- taxa[sample(nrow(taxa)), ] 329 | rownames(taxa) <- taxa$accession 330 | 331 | # Pick random example per species and add back in the taxa info 332 | unique_factors <- as.data.frame(unique(taxa$species)) 333 | colnames(unique_factors) <- "species" 334 | unique_factors <- join(unique_factors, taxa, type = "left", match = "first", by = "species") 335 | 336 | # Get sequences for randomly selected species 337 | seqs <- x$sequence 338 | seqs <- seqs[names(seqs) %in% unique_factors$accession] 339 | 340 | # Align and calculate pairwise distances and convert dists to dataframe 341 | align <- clustalo(seqs) 342 | dists <- as.data.frame(as.matrix(dist.dna(align, model = "N", pairwise.deletion = T))) 343 | dists$accession <- row.names(dists) 344 | 345 | # Melt the dists dataframe so I can drop any distance that isn't within the (rank) 346 | melted <- melt(dists, id = "accession", variable.name = "accession2", value.name = "dist") 347 | 348 | for(rank in used_ranks) { 349 | 350 | # Gather only the needed taxa data 351 | unique_factors_sub <- unique_factors[ , colnames(unique_factors) %in% c("accession", "species", rank)] 352 | 353 | # Drop any row in (rank) where there is only one species represented 354 | # Any instance of this leads to a distance within that rank of 0, skewing the results downward 355 | counts <- as.data.frame(table(unique_factors_sub[[rank]])) 356 | colnames(counts) <- c(rank, "count") 357 | unique_factors_sub <- join(unique_factors_sub, counts, by = rank) 358 | unique_factors_sub <- unique_factors_sub[unique_factors_sub$count > 1, ] 359 | 360 | # Pull the nucleotide distance data in 361 | # Replace the rank1 accession with the rank1 taxa 362 | melted_sub <- join(melted, unique_factors_sub,by = "accession") 363 | melted_sub$rank1 <- as.factor(melted_sub[[rank]]) 364 | 365 | # Drop all columns except the three needed so the next join doesn't get messed up 366 | melted_sub <- melted_sub[, colnames(melted_sub) %in% c("accession2", "dist", "rank1", "species")] 367 | colnames(melted_sub)[1] <- "accession" 368 | 369 | # Replace the rank2 accession with the rank2 taxa 370 | melted_sub <- join(melted_sub, unique_factors_sub, by = "accession") 371 | melted_sub$rank2 <- as.factor(melted_sub[[rank]]) 372 | 373 | # Drop all columns except the three needed 374 | melted_sub <- melted_sub[ , colnames(melted_sub) %in% c("rank2", "dist", "rank1", "species")] 375 | 376 | # Drop all rows with missing information 377 | melted_sub <- na.omit(melted_sub) 378 | 379 | # We only want distances within a taxa, so drop all comparisons between taxa 380 | # We also want to drop any comparisons of a species to itself, which will have dist == 0 381 | melted_sub <- melted_sub[melted_sub$rank1 == melted_sub$rank2 & melted_sub$species != melted_sub$species.1, ] 382 | 383 | # Calculate the mean distance for each taxa compared 384 | # We calculate each separately to avoid any one taxa with lots of hits (like human seqs) 385 | # from skewing the mean 386 | melted_sub$group <- paste(melted_sub$rank1, melted_sub$rank2) 387 | 388 | # Plug the means into the storage dataframe 389 | rank_dist_mean[[rank]] <- mean(melted_sub$dist) 390 | } 391 | message("\nAverage number of nucleotide differences between sequences within a given taxonomic group") 392 | message("See function description for further details") 393 | rank_dist_mean 394 | } 395 | -------------------------------------------------------------------------------- /R/search.R: -------------------------------------------------------------------------------- 1 | #PrimerTree 2 | #Copyright (C) 2013 Jim Hester 3 | 4 | #' Query a pair of primers using ncbi's Primer-BLAST, if primers contain iupac 5 | #' 6 | #' ambiguity codes, enumerate all possible combinations and combine the 7 | #' results. 8 | #' @param forward forward primer to search by 5'-3' on plus strand 9 | #' @param reverse reverse primer to search by 5'-3' on minus strand 10 | #' @param num_aligns number of alignment results to keep 11 | #' @param num_permutations the number of primer permutations to search, if the degenerate bases 12 | #' cause more than this number of permutations to exist, this number will be 13 | #' sampled from all possible permutations. 14 | #' @param ... additional arguments passed to Primer-Blast 15 | #' @param .parallel if 'TRUE', perform in parallel, using parallel backend 16 | #' provided by foreach 17 | #' @param .progress name of the progress bar to use, see 'create_progress_bar' 18 | #' @return httr response object of the query, pass to \code{\link{parse_primer_hits}} to parse the results. 19 | #' @export 20 | primer_search = function(forward, reverse, num_aligns=500, num_permutations=25, ..., .parallel=FALSE, .progress='none'){ 21 | if(missing(forward) || missing(reverse)) 22 | BLAST_primer() 23 | primers = enumerate_primers(forward, reverse) 24 | num_primers = nrow(primers) 25 | if(num_primers > num_permutations){ 26 | warning(immediate.=T, 'primers have ', num_primers, ' possible combinations due to degenerate bases, sampling ', num_permutations, " primers, use 'num_permutations' to change") 27 | primers = primers[ sample.int(num_primers, num_permutations, replace=F), ] 28 | } 29 | message('BLASTing ', nrow(primers), ' primer combinations') 30 | #enumerate all combinations to handle ambiguity codes 31 | alply(primers, .margins=1, .expand=F, .parallel=.parallel, .progress=.progress, 32 | function(row) BLAST_primer(row$forward, row$reverse, num_targets_with_primers=max(num_aligns %/% nrow(primers), 1), ...)) 33 | } 34 | iupac = list( "M" = list("A", "C"), 35 | "R" = list("A", "G"), 36 | "W" = list("A", "T"), 37 | "S" = list("C", "G"), 38 | "Y" = list("C", "T"), 39 | "K" = list("G", "T"), 40 | "V" = list("A", "C", "G"), 41 | "H" = list("A", "C", "T"), 42 | "D" = list("A", "G", "T"), 43 | "B" = list("C", "G", "T"), 44 | "N" = list("A", "C", "G", "T"), 45 | "I" = list("A", "T", "C")) 46 | 47 | enumerate_primers = function(forward, reverse){ 48 | forward_primers = enumerate_ambiguity(forward) 49 | data.frame(forward=forward_primers, 50 | reverse=rep(enumerate_ambiguity(reverse), 51 | each=length(forward_primers)), 52 | stringsAsFactors = FALSE) 53 | } 54 | enumerate_ambiguity = function(sequence){ 55 | search_regex = paste(names(iupac), collapse='|') 56 | locs = str_locate_all(sequence, search_regex) 57 | sequences = list() 58 | count = 1 59 | for (i in seq_len(nrow(locs[[1]]))){ 60 | loc = locs[[1]][i,] 61 | ambiguity = str_sub(sequence, loc[1], loc[2]) 62 | for(type in iupac[[ambiguity]]){ 63 | new_seq = sequence 64 | str_sub(new_seq, loc[1], loc[2]) <- type 65 | sequences[[count]] = enumerate_ambiguity(new_seq) 66 | count = count + 1 67 | } 68 | return(unlist(sequences)) 69 | } 70 | return(sequence) 71 | } 72 | 73 | print_options = function(options){ 74 | output = capture.output( print( 75 | options[ is.na(options$type) | options$type != 'hidden', 76 | c('name', 'type', 'defval') ]) 77 | ) 78 | 79 | message(paste(output, "\n", sep="")) 80 | } 81 | 82 | BLAST_primer = function(forward, reverse, ..., organism='', 83 | primer_specificity_database='nt', exclude_env='on'){ 84 | 85 | url = 'https://www.ncbi.nlm.nih.gov/tools/primer-blast/' 86 | form = GET_retry(url) 87 | 88 | content = parsable_html(form) 89 | 90 | all_options = get_options(content) 91 | 92 | if(missing(forward) || missing(reverse)){ 93 | print_options(all_options) 94 | stop('No primers specified') 95 | } 96 | 97 | options = list(..., primer_left_input=forward, primer_right_input=reverse, 98 | organism=organism, 99 | primer_specificity_database=primer_specificity_database, 100 | exclude_env=exclude_env, 101 | search_specific_primer='on') 102 | 103 | names(options) = toupper(names(options)) 104 | 105 | match_args = pmatch(names(options), all_options$name) 106 | bad_args = is.na(match_args) 107 | 108 | if(any(bad_args)){ 109 | print_options(all_options) 110 | stop(paste(names(options)[bad_args], collapse=','), ' not valid option\n') 111 | } 112 | 113 | options = get_defaults(options, all_options) 114 | 115 | start_time = now() 116 | 117 | message('Submitting Primer-BLAST query') 118 | response = POST_retry(paste(url, 'primertool.cgi', sep=''), body=options) 119 | 120 | values = get_refresh_from_meta(response) 121 | 122 | while(length(values) > 0){ 123 | message('BLAST alignment processing, refreshing in ', values[1], ' seconds...') 124 | Sys.sleep(values[1]) 125 | response = GET_retry(values[2]) 126 | 127 | values = get_refresh_from_meta(response) 128 | } 129 | total_time = start_time %--% now() 130 | message('BLAST alignment completed in ', total_time %/% seconds(1), ' seconds') 131 | response 132 | } 133 | 134 | #modify a function to check the status and retry until success 135 | retry = function(fun, num_retry=5, ...){ 136 | function(...){ 137 | res = fun(...) 138 | itr = 0 139 | status = http_status(res) 140 | while(itr < num_retry && inherits(res, 'response') && tolower(status$category) != 'success'){ 141 | warning('request failed, retry attempt ', itr+1) 142 | res = fun(...) 143 | status = http_status(res) 144 | itr = itr + 1 145 | #sleep to avoid hitting NCBI query rate limit :'-( 146 | Sys.sleep(0.4) 147 | } 148 | res 149 | } 150 | } 151 | 152 | GET_retry = retry(GET) 153 | POST_retry = retry(POST) 154 | 155 | #' Parse the primer hits 156 | #' 157 | #' @param response a httr response object obtained from \code{\link{primer_search}} 158 | #' @export 159 | parse_primer_hits = function(response){ 160 | content = parsable_html(response) 161 | rbind.fill(xpathApply(content, '//pre', parse_pre)) 162 | } 163 | parse_a = function(a){ 164 | #links like entrez/viewer.fcgi?db=nucleotide&id=452085006 165 | m = regexpr('id=\\d+', xmlAttrs(a)['href']) 166 | gi = gsub('id=', '', unlist(regmatches(xmlAttrs(a)['href'], m))) 167 | if(length(gi) == 0L){ 168 | #links like nucleotide/449036831?from=1107741&to=1107929&report=gbwithparts 169 | m = regexpr('nucleotide/\\d+', xmlAttrs(a)['href']) 170 | gi = gsub('nucleotide/', '', unlist(regmatches(xmlAttrs(a)['href'], m))) 171 | } 172 | if(length(gi) == 0L) gi = NA 173 | data.frame(gi=as.character(gi), accession=as.character(xmlValue(a))) 174 | } 175 | 176 | parse_pre = function(pre){ 177 | pre_text = xmlValue(pre) 178 | 179 | a = getNodeSet(pre, './preceding-sibling::a[1]') 180 | if(length(a) <= 0) 181 | stop('Parsing failed for ', pre_text) 182 | 183 | ids = parse_a(a[[1]]) 184 | 185 | product_length_regex = 'product length = (\\d+)' 186 | template_regex = 'Template[^\\d]+(\\d+)[^.ACGT]+([.ACGT]+)[^\\d]+(\\d+)' 187 | full_regex = paste('[\\S\\W]*', product_length_regex, '[\\S\\W]*?', 188 | template_regex, '[\\S\\W]*', template_regex, '[\\S\\W]*', sep='') 189 | values = str_split(gsub(full_regex, paste('\\', 1:8, sep='', collapse='|'), 190 | pre_text, perl=T), '[|]')[[1]] 191 | data.frame(ids, 192 | product_length=as.numeric(values[1]), 193 | mismatch_forward=str_count(values[3], '[ACGT]'), 194 | mismatch_reverse=str_count(values[6], '[ACGT]'), 195 | forward_start = as.numeric(values[2]), 196 | forward_stop = as.numeric(values[4]), 197 | reverse_start = as.numeric(values[5]), 198 | reverse_stop = as.numeric(values[7]), 199 | product_start=min(as.numeric(values[c(2,4,5,7)])), 200 | product_stop=max(as.numeric(values[c(2,4,5,7)])) 201 | ) 202 | } 203 | get_refresh_from_meta = function(response){ 204 | content = parsable_html(response) 205 | meta = content['//meta[@http-equiv="Refresh"]'] 206 | if(length(meta) > 0){ 207 | values = str_split(xmlAttrs(meta[[1]])['content'], '; URL=')[[1]] 208 | return(values) 209 | } 210 | return() 211 | } 212 | 213 | get_defaults = function(set_options, options){ 214 | #only look at values with set defaults 215 | options = options[ !is.na(options$defval), ] 216 | unchanged_options = setdiff(options$name, names(set_options)) 217 | default_values = as.character(options[ match(unchanged_options, options$name), 'defval' ]) 218 | names(default_values) = unchanged_options 219 | c(set_options, default_values) 220 | } 221 | 222 | get_options = function(content){ 223 | options = rbind.fill(xpathApply(content, '//form//input | //form//select', parse_attributes)) 224 | options$type = as.character(options$type) 225 | 226 | #add dropdown type if they are NA 227 | options$type[is.na(options$type)] <- 'dropdown' 228 | 229 | #make default values for checkboxes on or off rather than checked or unchecked 230 | options$defval = as.character(options$defval) 231 | check_map = c('checked' = 'on', 'unchecked' = '') 232 | checkboxes = which(options$type == 'checkbox') 233 | options$defval[ checkboxes ] = check_map[ options$defval[ checkboxes ] ] 234 | 235 | options[ options$type != 'hidden', c('name', 'type', 'defval') ] 236 | } 237 | 238 | parse_attributes = function(x){ 239 | as.data.frame(t(xmlAttrs(x))) 240 | } 241 | parsable_html = function(response){ 242 | txt <- content(response, as='text', encoding='UTF-8') 243 | 244 | # remove any unicode characters 245 | Encoding(txt) <- "UTF-8" 246 | txt <- iconv(txt, "UTF-8", "ASCII", sub = "") 247 | 248 | #this gsub regex is to remove the definition lines, some of which have 249 | # bracketed in them, which messes up the parsing 250 | txt <- gsub('("new_entrez".*?).*?
\n\n', '\\1\n
', txt)
251 |   htmlParse(txt)
252 | }
253 | filter_duplicates = function(hits){
254 |   location_columns = c('accession', 'forward_start', 'forward_stop', 'reverse_start', 'reverse_stop')
255 |   hits[!duplicated(t(apply(hits[location_columns], 1, range))),]
256 | }
257 | 


--------------------------------------------------------------------------------
/R/sequence.R:
--------------------------------------------------------------------------------
  1 | #PrimerTree
  2 | #Copyright (C) 2013 Jim Hester
  3 | 
  4 | #' Retrieves a fasta sequence from NCBI nucleotide database.
  5 | #'
  6 | #' @param accession nucleotide accession to retrieve.
  7 | #' @param start start base to retrieve, numbered beginning at 1.  If NULL the
  8 | #'        beginning of the sequence.
  9 | 
 10 | #' @param stop last base to retrieve, numbered beginning at 1. if NULL the end of
 11 | #'        the sequence.
 12 | #' @param api_key NCBI api-key to allow faster sequence retrieval.
 13 | #' @return an DNAbin object.
 14 | #' @seealso \code{\link{DNAbin}}
 15 | #' @export
 16 | 
 17 | get_sequence = function(accession, start=NULL, stop=NULL, api_key=Sys.getenv("NCBI_API_KEY")){
 18 | 
 19 |   fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
 20 | 
 21 |   query=list(db='nuccore', rettype='fasta', retmode='text', id=accession)
 22 | 
 23 |   if(!is.null(start))
 24 |     query$seq_start = start
 25 | 
 26 |   if(!is.null(stop))
 27 |     query$seq_stop = stop
 28 | 
 29 |   if(nzchar(api_key))
 30 |     query$api_key = api_key
 31 | 
 32 |   response = POST_retry(fetch_url, body=query)
 33 | 
 34 |   #stop if response failed
 35 |   stop_for_status(response)
 36 | 
 37 |   content = content(response, as='raw')
 38 | 
 39 |   #from ape package read.FASTA
 40 |   res <- .Call("rawStreamToDNAbin", content)
 41 |   names(res) <- sub("^ +", "", names(res))
 42 |   class(res) <- "DNAbin"
 43 |   res
 44 | }
 45 | 
 46 | #' Retrieves fasta sequences from NCBI nucleotide database.
 47 | #'
 48 | #' @param accession the accession number of the sequence to retrieve
 49 | #' @param start start bases to retrieve, numbered beginning at 1.  If NULL the
 50 | #'        beginning of the sequence.
 51 | 
 52 | #' @param stop stop bases to retrieve, numbered beginning at 1. if NULL the stop of
 53 | #'        the sequence.
 54 | #' @param api_key NCBI api-key to allow faster sequence retrieval.
 55 | #' @param simplify simplify the FASTA headers to include only the genbank
 56 | #'        accession.
 57 | #' @param .parallel if 'TRUE', perform in parallel, using parallel backend
 58 | #'        provided by foreach
 59 | #' @param .progress name of the progress bar to use, see 'create_progress_bar'
 60 | #' @return an DNAbin object.
 61 | #' @seealso \code{\link{DNAbin}}
 62 | #' @export
 63 | 
 64 | get_sequences = function(accession, start=NULL, stop=NULL, api_key=Sys.getenv("NCBI_API_KEY"), simplify=TRUE, .parallel=FALSE, .progress='none'){
 65 |   #expand arguments by recycling
 66 |   args = expand_arguments(accession=accession, start=start, stop=stop)
 67 |   #assign expanded arguments to actual arguments
 68 |   lapply(seq_along(args), function(i) names(args)[i] <<- args[i])
 69 | 
 70 |   #define rate to query NCBI servers with get_sequences
 71 |   query_rate <- 3; #queries per second
 72 | 
 73 |   if(nzchar(api_key)) {
 74 |     query_rate <- 10
 75 |   } else {
 76 |     warning("Sequence retrieval limited to 3 per second. Provide an api_key to increase this to 10. See:
 77 |   https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/", immediate. = TRUE)
 78 |   }
 79 | 
 80 |   size = length(accession)
 81 |   get_sequence_itr = function(i){
 82 |     start_time <- Sys.time()
 83 |     sequence = get_sequence(accession[i], start[i], stop[i], api_key)
 84 |     stop_time <- Sys.time()
 85 |     if((stop_time - start_time) < (1 / query_rate)) {
 86 |       #sleep to limit query rate :-(
 87 |       Sys.sleep((1/query_rate) - (stop_time - start_time))
 88 |     }
 89 |     sequence
 90 |   }
 91 |   sequences = alply(seq_along(accession), .margins=1, .parallel=.parallel, .progress=.progress, failwith(NA, f=get_sequence_itr))
 92 |   names = if(simplify) accession else laply(sequences, names)
 93 |   sequences = llply(sequences, `[[`, 1)
 94 |   names(sequences) = names
 95 |   class(sequences) = 'DNAbin'
 96 |   sequences
 97 | }
 98 | # from http://stackoverflow.com/questions/9335099/implementation-of-standard-recycling-rules
 99 | expand_arguments <- function(...){
100 |   dotList <- list(...)
101 |   max.length <- max(sapply(dotList, length))
102 |   suppressWarnings(lapply(dotList, rep, length=max.length))
103 | }
104 | #' Construct a neighbor joining tree from a dna alignment
105 | #'
106 | #' @param dna fasta dna object the tree is to be constructed from
107 | #' @param pairwise.deletion a logical indicating if the distance matrix should 
108 | #' be constructed using pairwise deletion
109 | #' @param ... furthur arguments to dist.dna
110 | #' @seealso \code{\link{dist.dna}}, \code{\link{nj}}
111 | #' @export
112 | tree_from_alignment = function(dna, pairwise.deletion=TRUE, ...){
113 |   nj(dist.dna(dna, model="N", pairwise.deletion=pairwise.deletion, ...))
114 | }
115 | #' Multiple sequence alignment with clustal omega
116 | #'
117 | #' Calls clustal omega to align a set of sequences of class DNAbin.  Run
118 | #' without any arguments to see all the options you can pass to the command
119 | #' line clustal omega.
120 | #' @param x an object of class 'DNAbin'
121 | #' @param exec a character string with the name or path to the program
122 | #' @param quiet whether to supress output to stderr or stdout
123 | #' @param original.ordering use the original ordering of the sequences
124 | #' @param ... additional arguments passed to the command line clustalo
125 | #' @export
126 | clustalo = function (x, exec = 'clustalo', quiet = TRUE, original.ordering = TRUE, ...)
127 | {
128 |     help_text = system(paste(exec, '--help'), intern=TRUE)
129 |     all_options = get_command_options(help_text)
130 | 
131 |     inf <- tempfile(fileext='.fas')
132 |     outf <- tempfile(fileext='.aln')
133 | 
134 |     options = c(infile=inf, outfile=outf, list(...))
135 |     match_args = pmatch(names(options), names(all_options), duplicates.ok=TRUE)
136 |     bad_args = is.na(match_args)
137 | 
138 |     if (missing(x)){
139 |         message(paste(help_text, collapse="\n"))
140 |         stop('No input')
141 |     }
142 |     if(any(bad_args)){
143 |       stop(paste(names(options)[bad_args], collapse=','), ' not valid option\n')
144 |     }
145 | 
146 |     write.dna(x, inf, "fasta")
147 |     args = paste(paste(all_options[match_args], options, collapse=' '))
148 |     system2(exec, args=args, stdout = ifelse(quiet, FALSE, ''), stderr = ifelse(quiet, FALSE, ''))
149 |     res <- read.dna(outf, "fasta")
150 |     if (original.ordering)
151 |         res <- res[labels(x), ]
152 |     res
153 | }
154 | 
155 | #parses the usage and enumerates the commands
156 | get_command_options = function(usage){
157 |   m = gregexpr('-+\\w+', usage)
158 |   arguments = unlist(regmatches(usage, m))
159 |   names(arguments) = gsub('-+', '', arguments)
160 |   arguments
161 | }
162 | 


--------------------------------------------------------------------------------
/R/taxonomy.R:
--------------------------------------------------------------------------------
 1 | #PrimerTree
 2 | #Copyright (C) 2013 Jim Hester
 3 | 
 4 | #' Retrieve the taxonomy information from NCBI for a set of nucleotide gis.
 5 | #'
 6 | #' @param accessions a character vector of the accessions to retrieve
 7 | #' @return data.frame of the 'accessions, taxIds, and taxonomy
 8 | #' @export
 9 | 
10 | get_taxonomy = function(accessions){
11 | 
12 |   accessions = unique(as.character(accessions))
13 |   taxids = accession2taxid(accessions)
14 | 
15 |   taxonomy=fetch_taxonomy(unique(taxids))
16 |   merge(
17 |     data.frame(accession=names(taxids), taxId=taxids, stringsAsFactors=FALSE),
18 |     taxonomy
19 |   )
20 | }
21 | #' Maps a nucleotide database accession to a taxonomy database taxId
22 | #'
23 | #' @param accessions accessions character vector to lookup.
24 | #' @return named vector of taxIds.
25 | #' @export
26 | 
27 | accession2taxid = function(accessions){
28 |   url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
29 | 
30 |   names(accessions) = rep('id', times=length(accessions))
31 |   query=c(list(db='taxonomy', dbfrom='nuccore', idtype='acc'), accessions)
32 | 
33 |   response=POST_retry(url, body=query)
34 | 
35 |   #stop if response failed
36 |   stop_for_status(response)
37 | 
38 |   parsed = xmlParse(content(response, as="text"))
39 | 
40 |   xpathSApply(parsed, '//LinkSet', parse_LinkSet)
41 | }
42 | parse_LinkSet = function(LinkSet){
43 |   gid = xpathSApply(LinkSet, './/IdList/Id', xmlValue)
44 |   taxid = xpathSApply(LinkSet, './/LinkSetDb/Link/Id', xmlValue)
45 |   if(length(taxid) != 1)
46 |     taxid = NA
47 |   names(taxid) = gid
48 |   taxid
49 | }
50 | 
51 | fetch_taxonomy = function(taxid) {
52 | 
53 |   fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
54 | 
55 |   query=list(db='taxonomy', rettype='null', retmode='xml', id=paste(taxid, collapse=','))
56 | 
57 |   response = POST_retry(fetch_url, body=query)
58 | 
59 |   #stop if response failed
60 |   stop_for_status(response)
61 | 
62 |   parse_taxonomy_xml(xmlParse(content(response, as="text")))
63 | }
64 | parse_taxonomy_xml = function(xml){
65 |   rbind.fill(xpathApply(xml, '//TaxaSet/Taxon', parse_taxon))
66 | }
67 | parse_taxon = function(taxon){
68 |   tax_id = xpathSApply(taxon, './TaxId', xmlValue)
69 |   ranks = xpathSApply(taxon, './/Rank', xmlValue)
70 |   names = xpathSApply(taxon, './/ScientificName', xmlValue)
71 |   names(names) = ranks
72 |   names = names[ranks != 'no rank']
73 |   names['taxId'] = tax_id
74 |   as.data.frame(t(names), stringsAsFactors=FALSE)
75 | }
76 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PrimerTree #
 2 | [![Travis-CI Build Status](https://travis-ci.org/jimhester/primerTree.svg?branch=master)](https://travis-ci.org/jimhester/primerTree)
 3 | PrimerTree: Visually Assessing the Specificity and Informativeness of Primer Pairs
 4 | 
 5 | * [Features](#features)
 6 | * [Examples](#examples)
 7 | * [Installation](#installation)
 8 | * [Usage](#usage)
 9 | 
10 | ## Features ##
11 | * Automatically query a primer pair and generate a tree of the products.
12 | * Analysis can be run and then plotted in two commands.
13 | * All options of Primer-Blast and clustal omega are supported.
14 | 
15 | ## Installation ##
16 | ### R installation ###
17 | #### CRAN ####
18 | ```s
19 | install.packages('primerTree')
20 | ```
21 | #### Github ####
22 | ```s
23 | # install.packages("devtools")
24 | devtools::install_github("jimhester/primerTree")
25 | ```
26 | ### [Clustal Omega](http://www.clustal.org/omega/#Download) Installation ###
27 | #### Windows ####
28 | Use the pre-compiled windows binary.  Either put the installed clustalo.exe in your path, or pass the path to the executable in the clustal_options option
29 | ```s
30 | library(primerTree)
31 | mammals_16S = search_primer_pair(name='Mammals 16S',
32 |   'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', clustal_options=c(exec='C:\Program Files\Clustal Omega\clustalo.exe'))
33 | ```
34 | #### Linux ####
35 | Simple installation from source
36 | ```bash
37 | ./configure && make && make install
38 | ```
39 | If the resulting clustalo program is in your path it should be automatically found,
40 | otherwise see the windows instructions on how to specify the path to the
41 | executable.
42 | 
43 | #### Mac OSX ####
44 | Clustal Omega can be installed from source but requires Xcode CLT and argtable, which are not pre-installed on OSX.
45 | 
46 | First install Xcode CLT.
47 | ```bash
48 | xcode-select --install
49 | ```
50 | then install argtable
51 | ```bash
52 | brew install argtable
53 | ```
54 | Now you can install Clustal Omega from source.
55 | From the clustalo directory:
56 | 
57 | ```bash
58 | ./configure && sudo make install
59 | ```
60 | 
61 | ## Usage ##
62 | Simple search for a Mammal 16S primer
63 | ```s
64 | library(primerTree)
65 | mammals_16S = search_primer_pair(name='Mammals 16S', 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
66 | plot(mammals_16S)
67 | ```
68 | 
69 | Using the parallel features with the multicore package using the doMC backend, with 8 threads.
70 | ```s
71 | library(doMC)
72 | registerDoMC(8)
73 | library(primerTree)
74 | mammals_16S = search_primer_pair(name='Mammals 16S',
75 |   'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', .parallel=T)
76 | plot(mammals_16S)
77 | ```
78 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Documentation #
 2 | [x] - figure out a good name.
 3 | [x] - document all exported functions with examples
 4 | [ ] - create vignette showing workflow by piece and automatic
 5 | [x] - write wrapping function for entire workflow
 6 | [x] - implement plot_rank function to plot a single rank by itself
 7 | 
 8 | # Implementation #
 9 | [x] - filter alignments for exact sequences (how to handle different taxa???)
10 | [x] - handle hits to the same gi at different locations, this will crash clustalw currently. # USE CLUSTALO
11 | [ ] - enumerate dropdown options
12 | [x] - very degenerate primers?
13 | [x] - specify clustalo parameters, location
14 | [x] - retry failed connections
15 | 
16 | # Paper #
17 | [x] - submit paper to bioinformatics
18 | [ ] - submit package to CRAN
19 | 


--------------------------------------------------------------------------------
/data/bryophytes_trnL.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MVesuviusC/primerTree/6b257ba8f6113bb5506b777c19b07bc06191be94/data/bryophytes_trnL.RData


--------------------------------------------------------------------------------
/data/mammals_16S.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MVesuviusC/primerTree/6b257ba8f6113bb5506b777c19b07bc06191be94/data/mammals_16S.RData


--------------------------------------------------------------------------------
/man/accession2taxid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/taxonomy.R
 3 | \name{accession2taxid}
 4 | \alias{accession2taxid}
 5 | \title{Maps a nucleotide database accession to a taxonomy database taxId}
 6 | \usage{
 7 | accession2taxid(accessions)
 8 | }
 9 | \arguments{
10 | \item{accessions}{accessions character vector to lookup.}
11 | }
12 | \value{
13 | named vector of taxIds.
14 | }
15 | \description{
16 | Maps a nucleotide database accession to a taxonomy database taxId
17 | }
18 | 


--------------------------------------------------------------------------------
/man/bryophytes_trnL.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{data}
 4 | \name{bryophytes_trnL}
 5 | \alias{bryophytes_trnL}
 6 | \title{PrimerTree results for the bryophyte trnL primers}
 7 | \description{
 8 | PrimerTree results for the bryophyte trnL primers
 9 | }
10 | 


--------------------------------------------------------------------------------
/man/calc_rank_dist_ave.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{calc_rank_dist_ave}
 4 | \alias{calc_rank_dist_ave}
 5 | \title{Summarize pairwise differences.}
 6 | \usage{
 7 | calc_rank_dist_ave(x, ranks = common_ranks)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object}
11 | 
12 | \item{ranks}{ranks to show unique counts for, defaults to the common ranks}
13 | }
14 | \value{
15 | returns a data frame of results
16 | }
17 | \description{
18 | Summarize pairwise differences.
19 | }
20 | \details{
21 | The purpose of this function is to calculate the average number
22 | of nucleotide differences between species within each taxa of given taxonomic
23 | level.
24 | 
25 | For example, at the genus level, the function calculates the average number
26 | of nucleotide differences between all species within each genus and reports
27 | the mean of those values.
28 | 
29 | There are several key assumptions and calculations made in this
30 | function.
31 | 
32 | First, the function randomly selects one sequence from each species
33 | in the primerTree results. This is to keep any one species (e.g.
34 | human, cow, etc.) with many hits from skewing the results.
35 | 
36 | Second, for each taxonomic level tested, the function divides the
37 | sequences by each taxon at that level and calculates the mean
38 | number of nucleotide differences within that taxa, then returns the
39 | mean of those values.
40 | 
41 | Third, when calculating the average distance, any taxa for which
42 | there is only one species is omitted, as the number of nucleotide
43 | differences will always be 0.
44 | }
45 | \examples{
46 | \dontrun{
47 | calc_rank_dist_ave(mammals_16S)
48 | 
49 | calc_rank_dist_ave(bryophytes_trnL)
50 | 
51 | # Note that the differences between the results from these two primers
52 | # the mean nucleotide differences is much higher for the mammal primers
53 | # than the byrophyte primers. This suggests that the mammal primers have
54 | # better resolution to distinguish individual species.
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/clustalo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{clustalo}
 4 | \alias{clustalo}
 5 | \title{Multiple sequence alignment with clustal omega}
 6 | \usage{
 7 | clustalo(x, exec = "clustalo", quiet = TRUE, original.ordering = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class 'DNAbin'}
11 | 
12 | \item{exec}{a character string with the name or path to the program}
13 | 
14 | \item{quiet}{whether to supress output to stderr or stdout}
15 | 
16 | \item{original.ordering}{use the original ordering of the sequences}
17 | 
18 | \item{...}{additional arguments passed to the command line clustalo}
19 | }
20 | \description{
21 | Calls clustal omega to align a set of sequences of class DNAbin.  Run
22 | without any arguments to see all the options you can pass to the command
23 | line clustal omega.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/filter_seqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{filter_seqs}
 4 | \alias{filter_seqs}
 5 | \alias{filter_seqs.primerTree}
 6 | \title{Filter out sequences retrieved by search_primer_pair() that are either too
 7 | short or too long. The alignment and tree will be recalculated after removing
 8 | unwanted reads.}
 9 | \usage{
10 | filter_seqs(x, ...)
11 | 
12 | \method{filter_seqs}{primerTree}(x, min_length = 0, max_length = Inf, ...)
13 | }
14 | \arguments{
15 | \item{x}{a primerTree object}
16 | 
17 | \item{...}{additional arguments passed to methods.}
18 | 
19 | \item{min_length}{the minimum sequence length to keep}
20 | 
21 | \item{max_length}{the maximum sequence length to keep}
22 | }
23 | \value{
24 | a primerTree object
25 | }
26 | \description{
27 | Filter out sequences retrieved by search_primer_pair() that are either too
28 | short or too long. The alignment and tree will be recalculated after removing
29 | unwanted reads.
30 | }
31 | \section{Methods (by class)}{
32 | \itemize{
33 | \item \code{filter_seqs(primerTree)}: Method for primerTree objects
34 | 
35 | }}
36 | \examples{
37 | \dontrun{
38 | # filter out sequences longer or shorter than desired:
39 | mammals_16S_filtered <- filter_seqs(mammals_16S, min_length=131, max_length=156)
40 | }
41 | }
42 | 


--------------------------------------------------------------------------------
/man/get_sequence.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{get_sequence}
 4 | \alias{get_sequence}
 5 | \title{Retrieves a fasta sequence from NCBI nucleotide database.}
 6 | \usage{
 7 | get_sequence(
 8 |   accession,
 9 |   start = NULL,
10 |   stop = NULL,
11 |   api_key = Sys.getenv("NCBI_API_KEY")
12 | )
13 | }
14 | \arguments{
15 | \item{accession}{nucleotide accession to retrieve.}
16 | 
17 | \item{start}{start base to retrieve, numbered beginning at 1.  If NULL the
18 | beginning of the sequence.}
19 | 
20 | \item{stop}{last base to retrieve, numbered beginning at 1. if NULL the end of
21 | the sequence.}
22 | 
23 | \item{api_key}{NCBI api-key to allow faster sequence retrieval.}
24 | }
25 | \value{
26 | an DNAbin object.
27 | }
28 | \description{
29 | Retrieves a fasta sequence from NCBI nucleotide database.
30 | }
31 | \seealso{
32 | \code{\link{DNAbin}}
33 | }
34 | 


--------------------------------------------------------------------------------
/man/get_sequences.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{get_sequences}
 4 | \alias{get_sequences}
 5 | \title{Retrieves fasta sequences from NCBI nucleotide database.}
 6 | \usage{
 7 | get_sequences(
 8 |   accession,
 9 |   start = NULL,
10 |   stop = NULL,
11 |   api_key = Sys.getenv("NCBI_API_KEY"),
12 |   simplify = TRUE,
13 |   .parallel = FALSE,
14 |   .progress = "none"
15 | )
16 | }
17 | \arguments{
18 | \item{accession}{the accession number of the sequence to retrieve}
19 | 
20 | \item{start}{start bases to retrieve, numbered beginning at 1.  If NULL the
21 | beginning of the sequence.}
22 | 
23 | \item{stop}{stop bases to retrieve, numbered beginning at 1. if NULL the stop of
24 | the sequence.}
25 | 
26 | \item{api_key}{NCBI api-key to allow faster sequence retrieval.}
27 | 
28 | \item{simplify}{simplify the FASTA headers to include only the genbank
29 | accession.}
30 | 
31 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
32 | provided by foreach}
33 | 
34 | \item{.progress}{name of the progress bar to use, see 'create_progress_bar'}
35 | }
36 | \value{
37 | an DNAbin object.
38 | }
39 | \description{
40 | Retrieves fasta sequences from NCBI nucleotide database.
41 | }
42 | \seealso{
43 | \code{\link{DNAbin}}
44 | }
45 | 


--------------------------------------------------------------------------------
/man/get_taxonomy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/taxonomy.R
 3 | \name{get_taxonomy}
 4 | \alias{get_taxonomy}
 5 | \title{Retrieve the taxonomy information from NCBI for a set of nucleotide gis.}
 6 | \usage{
 7 | get_taxonomy(accessions)
 8 | }
 9 | \arguments{
10 | \item{accessions}{a character vector of the accessions to retrieve}
11 | }
12 | \value{
13 | data.frame of the 'accessions, taxIds, and taxonomy
14 | }
15 | \description{
16 | Retrieve the taxonomy information from NCBI for a set of nucleotide gis.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/identify.primerTree_plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{identify.primerTree_plot}
 4 | \alias{identify.primerTree_plot}
 5 | \title{identify the point closest to the mouse click
 6 | only works on single ranks}
 7 | \usage{
 8 | \method{identify}{primerTree_plot}(x, ...)
 9 | }
10 | \arguments{
11 | \item{x}{the plot to identify}
12 | 
13 | \item{...}{additional arguments passed to annotate}
14 | }
15 | \description{
16 | identify the point closest to the mouse click
17 | only works on single ranks
18 | }
19 | 


--------------------------------------------------------------------------------
/man/layout_tree_ape.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{layout_tree_ape}
 4 | \alias{layout_tree_ape}
 5 | \title{layout a tree using ape, return an object to be plotted by
 6 | \code{\link{plot_tree}}}
 7 | \usage{
 8 | layout_tree_ape(tree, ...)
 9 | }
10 | \arguments{
11 | \item{tree}{The \code{\link{phylo}} tree to be plotted}
12 | 
13 | \item{...}{additional arguments to \code{\link{plot.phylo}}}
14 | }
15 | \value{
16 | \item{edge}{list of x, y and xend, yend coordinates
17 | as well as ids for the edges}
18 | \item{tips}{list of x, y, label and id for the tips}
19 | \item{nodes}{list of x, y and id for the nodes}
20 | }
21 | \description{
22 | layout a tree using ape, return an object to be plotted by
23 | \code{\link{plot_tree}}
24 | }
25 | 


--------------------------------------------------------------------------------
/man/mammals_16S.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{data}
 4 | \name{mammals_16S}
 5 | \alias{mammals_16S}
 6 | \title{PrimerTree results for the mammalian 16S primers}
 7 | \description{
 8 | PrimerTree results for the mammalian 16S primers
 9 | }
10 | 


--------------------------------------------------------------------------------
/man/parse_primer_hits.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/search.R
 3 | \name{parse_primer_hits}
 4 | \alias{parse_primer_hits}
 5 | \title{Parse the primer hits}
 6 | \usage{
 7 | parse_primer_hits(response)
 8 | }
 9 | \arguments{
10 | \item{response}{a httr response object obtained from \code{\link{primer_search}}}
11 | }
12 | \description{
13 | Parse the primer hits
14 | }
15 | 


--------------------------------------------------------------------------------
/man/plot.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{plot.primerTree}
 4 | \alias{plot.primerTree}
 5 | \title{plot function for a primerTree object, calls plot_tree_ranks}
 6 | \usage{
 7 | \method{plot}{primerTree}(x, ranks = NULL, main = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{primerTree object to plot}
11 | 
12 | \item{ranks}{The ranks to include, defaults to all common ranks, if NULL
13 | print all ranks.  If 'none' just print the layout.}
14 | 
15 | \item{main}{an optional title to display, if NULL displays the name as the title}
16 | 
17 | \item{...}{additional arguments passed to plot_tree_ranks}
18 | }
19 | \description{
20 | plot function for a primerTree object, calls plot_tree_ranks
21 | }
22 | \examples{
23 | library(gridExtra)
24 | library(directlabels)
25 | #plot with all common ranks
26 | plot(mammals_16S)
27 | 
28 | #plot only the class
29 | plot(mammals_16S, 'class')
30 | 
31 | #plot the layout only
32 | plot(mammals_16S, 'none')
33 | }
34 | \seealso{
35 | \code{\link{plot_tree_ranks}}, \code{\link{plot_tree}}
36 | }
37 | 


--------------------------------------------------------------------------------
/man/plot_tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot_tree}
 4 | \alias{plot_tree}
 5 | \title{plots a tree, optionally with colored and labeled points by taxonomic rank}
 6 | \usage{
 7 | plot_tree(
 8 |   tree,
 9 |   type = "unrooted",
10 |   main = NULL,
11 |   guide_size = NULL,
12 |   rank = NULL,
13 |   taxonomy = NULL,
14 |   size = 2,
15 |   legend_cutoff = 25,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{tree}{to be plotted, use layout_tree to layout tree.}
21 | 
22 | \item{type}{The type of tree to plot, default unrooted.}
23 | 
24 | \item{main}{An optional title for the plot}
25 | 
26 | \item{guide_size}{The size of the length guide.  If NULL auto detects a
27 | reasonable size.}
28 | 
29 | \item{rank}{The rank to include, if null only the tree is plotted}
30 | 
31 | \item{taxonomy}{A data.frame with an accession field corresponding to the
32 | tree tip labels.}
33 | 
34 | \item{size}{The size of the colored points}
35 | 
36 | \item{legend_cutoff}{The number of different taxa names after which the
37 | names are no longer printed.}
38 | 
39 | \item{...}{additional arguments passed to \code{\link{layout_tree_ape}}}
40 | }
41 | \value{
42 | plot to be printed.
43 | }
44 | \description{
45 | plots a tree, optionally with colored and labeled points by taxonomic rank
46 | }
47 | 


--------------------------------------------------------------------------------
/man/plot_tree_ranks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot_tree_ranks}
 4 | \alias{plot_tree_ranks}
 5 | \title{plots a tree along with a series of taxonomic ranks}
 6 | \usage{
 7 | plot_tree_ranks(
 8 |   tree,
 9 |   taxonomy,
10 |   main = NULL,
11 |   type = "unrooted",
12 |   ranks = common_ranks,
13 |   size = 2,
14 |   guide_size = NULL,
15 |   legend_cutoff = 25,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{tree}{to be plotted, use layout_tree to layout tree.}
21 | 
22 | \item{taxonomy}{A data.frame with an accession field corresponding to the
23 | tree tip labels.}
24 | 
25 | \item{main}{An optional title for the plot}
26 | 
27 | \item{type}{The type of tree to plot, default unrooted.}
28 | 
29 | \item{ranks}{The ranks to include, defaults to all common ranks, if null
30 | print all ranks.}
31 | 
32 | \item{size}{The size of the colored points}
33 | 
34 | \item{guide_size}{The size of the length guide.  If NULL auto detects a
35 | reasonable size.}
36 | 
37 | \item{legend_cutoff}{The number of different taxa names after which the
38 | names are no longer printed.}
39 | 
40 | \item{...}{additional arguments passed to \code{\link{layout_tree_ape}}}
41 | }
42 | \description{
43 | plots a tree along with a series of taxonomic ranks
44 | }
45 | \examples{
46 | library(gridExtra)
47 | library(directlabels)
48 | #plot all the common ranks
49 | plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy)
50 | #plot specific ranks, with a larger dot size
51 | plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy,
52 |   ranks=c('kingdom', 'class', 'family'), size=3)
53 | }
54 | \seealso{
55 | \code{\link{plot_tree}} to plot only a single rank or the just the
56 | tree layout.
57 | }
58 | 


--------------------------------------------------------------------------------
/man/primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{package}
 4 | \name{primerTree}
 5 | \alias{primerTree}
 6 | \alias{primerTree-package}
 7 | \title{\pkg{primerTree} Visually Assessing the Specificity and Informativeness of Primer Pairs}
 8 | \description{
 9 | \code{primerTree} has two main commands:
10 | \code{\link{search_primer_pair}} which takes a primer pair and returns an
11 | primerTree object of the search results
12 | \code{\link{plot.primerTree}} a S3 method for plotting the primerTree object
13 | obtained using \code{\link{search_primer_pair}}
14 | }
15 | 


--------------------------------------------------------------------------------
/man/primer_search.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/search.R
 3 | \name{primer_search}
 4 | \alias{primer_search}
 5 | \title{Query a pair of primers using ncbi's Primer-BLAST, if primers contain iupac}
 6 | \usage{
 7 | primer_search(
 8 |   forward,
 9 |   reverse,
10 |   num_aligns = 500,
11 |   num_permutations = 25,
12 |   ...,
13 |   .parallel = FALSE,
14 |   .progress = "none"
15 | )
16 | }
17 | \arguments{
18 | \item{forward}{forward primer to search by 5'-3' on plus strand}
19 | 
20 | \item{reverse}{reverse primer to search by 5'-3' on minus strand}
21 | 
22 | \item{num_aligns}{number of alignment results to keep}
23 | 
24 | \item{num_permutations}{the number of primer permutations to search, if the degenerate bases
25 | cause more than this number of permutations to exist, this number will be
26 | sampled from all possible permutations.}
27 | 
28 | \item{...}{additional arguments passed to Primer-Blast}
29 | 
30 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
31 | provided by foreach}
32 | 
33 | \item{.progress}{name of the progress bar to use, see 'create_progress_bar'}
34 | }
35 | \value{
36 | httr response object of the query, pass to \code{\link{parse_primer_hits}} to parse the results.
37 | }
38 | \description{
39 | ambiguity codes, enumerate all possible combinations and combine the
40 | results.
41 | }
42 | 


--------------------------------------------------------------------------------
/man/search_primer_pair.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{search_primer_pair}
 4 | \alias{search_primer_pair}
 5 | \title{Automatic primer searching Search a given primer pair, retrieving the alignment
 6 | results, their product sequences, the taxonomic information for the sequences,
 7 | a multiple alignment of the products}
 8 | \usage{
 9 | search_primer_pair(
10 |   forward,
11 |   reverse,
12 |   name = NULL,
13 |   num_aligns = 500,
14 |   num_permutations = 25,
15 |   simplify = TRUE,
16 |   clustal_options = list(),
17 |   distance_options = list(model = "N", pairwise.deletion = T),
18 |   api_key = Sys.getenv("NCBI_API_KEY"),
19 |   ...,
20 |   .parallel = FALSE,
21 |   .progress = "none"
22 | )
23 | }
24 | \arguments{
25 | \item{forward}{forward primer to search by 5'-3' on plus strand}
26 | 
27 | \item{reverse}{reverse primer to search by 5'-3' on minus strand}
28 | 
29 | \item{name}{name to give to the primer pair}
30 | 
31 | \item{num_aligns}{number of alignment results to keep}
32 | 
33 | \item{num_permutations}{the number of primer permutations to search, if the degenerate bases
34 | cause more than this number of permutations to exist, this number will be
35 | sampled from all possible permutations.}
36 | 
37 | \item{simplify}{use simple names for primer hit results or complex}
38 | 
39 | \item{clustal_options}{a list of options to pass to clustal omega, see
40 | \code{link{clustalo}} for a list of options}
41 | 
42 | \item{distance_options}{a list of options to pass to dist.dna, see
43 | \code{link{dist.dna}} for a list of options}
44 | 
45 | \item{api_key}{NCBI api-key to allow faster sequence retrieval}
46 | 
47 | \item{...}{additional arguments passed to Primer-Blast}
48 | 
49 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
50 | provided by foreach}
51 | 
52 | \item{.progress}{name of the progress bar to use, see
53 | \code{\link{create_progress_bar}}}
54 | }
55 | \value{
56 | A list with the following elements,
57 | \item{name}{name of the primer pair}
58 | \item{BLAST_result}{html blast results from Primer-BLAST as
59 |  'a \code{\link{response}}} object.
60 | \item{taxonomy}{taxonomy for the primer products from NCBI}
61 | \item{sequence}{sequence of the primer products}
62 | \item{alignment}{multiple alignment of the primer products}
63 | \item{tree}{phylogenetic tree of the reconstructed from the
64 | 'multiple alignment}
65 | }
66 | \description{
67 | Automatic primer searching Search a given primer pair, retrieving the alignment
68 | results, their product sequences, the taxonomic information for the sequences,
69 | a multiple alignment of the products
70 | }
71 | \examples{
72 | \dontrun{
73 | #simple search
74 | mammals_16S = search_primer_pair(name='Mammals 16S',
75 |  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
76 | #returning 1000 alignments, allow up to 3 mismatches in primer
77 | mammals_16S = search_primer_pair(name='Mammals 16S',
78 |  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT',
79 |  num_aligns=1000, total_primer_specificity_mismatch=3)
80 | }
81 | }
82 | \seealso{
83 | \code{\link{primer_search}}, \code{\link{clustalo}}
84 | }
85 | 


--------------------------------------------------------------------------------
/man/seq_lengths.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{seq_lengths}
 4 | \alias{seq_lengths}
 5 | \title{Get a summary of sequence lengths from a primerTree object}
 6 | \usage{
 7 | seq_lengths(x, summarize = TRUE)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object.}
11 | 
12 | \item{summarize}{a logical indicating if a summary should be displayed}
13 | }
14 | \value{
15 | a table of sequence length frequencies
16 | }
17 | \description{
18 | Get a summary of sequence lengths from a primerTree object
19 | }
20 | \examples{
21 | 
22 | # Show the counts for each length
23 | seq_lengths(mammals_16S)
24 | 
25 | # Plot the distribution of lengths
26 | seqLengths <- seq_lengths(mammals_16S)
27 | barplot(seqLengths, 
28 |  main = "Frequency of sequence lengths for 16S mammal primers", 
29 |  xlab="Amplicon length (in bp)", 
30 |  ylab=("Frequency"))
31 | }
32 | 


--------------------------------------------------------------------------------
/man/seq_lengths.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{seq_lengths.primerTree}
 4 | \alias{seq_lengths.primerTree}
 5 | \title{Method for primerTree objects}
 6 | \usage{
 7 | \method{seq_lengths}{primerTree}(x, summarize = TRUE)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object.}
11 | 
12 | \item{summarize}{a logical indicating if a summary should be displayed}
13 | }
14 | \description{
15 | Method for primerTree objects
16 | }
17 | 


--------------------------------------------------------------------------------
/man/summary.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{summary.primerTree}
 4 | \alias{summary.primerTree}
 5 | \title{Summarize a primerTree result, printing quantiles of sequence length and
 6 | pairwise differences.}
 7 | \usage{
 8 | \method{summary}{primerTree}(object, ..., probs = c(0, 0.05, 0.5, 0.95, 1), ranks = common_ranks)
 9 | }
10 | \arguments{
11 | \item{object}{the primerTree object to summarise}
12 | 
13 | \item{...}{Ignored options}
14 | 
15 | \item{probs}{quantile probabilities to compute, defaults to 0, 5, 50, 95,
16 | and 100 probabilities.}
17 | 
18 | \item{ranks}{ranks to show unique counts for, defaults to the common ranks}
19 | }
20 | \value{
21 | invisibly returns a list containing the printed results
22 | }
23 | \description{
24 | Summarize a primerTree result, printing quantiles of sequence length and
25 | pairwise differences.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/tree_from_alignment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{tree_from_alignment}
 4 | \alias{tree_from_alignment}
 5 | \title{Construct a neighbor joining tree from a dna alignment}
 6 | \usage{
 7 | tree_from_alignment(dna, pairwise.deletion = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{dna}{fasta dna object the tree is to be constructed from}
11 | 
12 | \item{pairwise.deletion}{a logical indicating if the distance matrix should 
13 | be constructed using pairwise deletion}
14 | 
15 | \item{...}{furthur arguments to dist.dna}
16 | }
17 | \description{
18 | Construct a neighbor joining tree from a dna alignment
19 | }
20 | \seealso{
21 | \code{\link{dist.dna}}, \code{\link{nj}}
22 | }
23 | 


--------------------------------------------------------------------------------
/paper.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r paper_setup, cache=FALSE, include=FALSE}
  2 | library(knitcitations)
  3 | # create bibtex entries
  4 | write.bibtex(c(knitr=citation('knitr'),
  5 |                knitcitations=citation('knitcitations'), plyr=citation('plyr'),
  6 |                ggplot2=citation('ggplot2'), ape=citation('ape'),
  7 |                directlabels=citation('directlabels'),
  8 |                plyr=citation('plyr'), foreach=citation('foreach')),
  9 |              file='citations.bib')
 10 | bib = read.bibtex('citations.bib')
 11 | ```
 12 | 
 13 | # PrimerTree: Visually Assessing the Specificity and Informativeness of Primer Pairs#
 14 | James Hester and David Serre
 15 | 
 16 | Genomic Medicine Institute, Cleveland Clinic Lerner Research Institute, Cleveland, OH 44195, USA
 17 | # Abstract #
 18 | ## Summary ##
 19 | Designing PCR primers is a critical step in most genetic studies. Primers need
 20 | to be sensitive and specific, as well as yield informative DNA sequences. For
 21 | example, in metagenomic studies, primers need to amplify only DNA from the
 22 | targeted community and amplify DNA sequences that enable differentiating all
 23 | members of this community. Several informatics tools exist for designing
 24 | primers based on a template DNA sequence and for identifying potential target
 25 | sequences in a database. However, there are no tools to systematically analyze
 26 | and visualize the specificity of PCR primers and the informativeness of the
 27 | amplified sequence. PrimerTree is an R package that enables identifying
 28 | potential target sequences for a set of primers and generates
 29 | taxonomically annotated phylogenetic trees with the predicted amplification
 30 | products.
 31 | 
 32 | ## Availability ##
 33 | PrimerTree is an R package released under the GPL-2 license and is available through
 34 | [CRAN](http://cran.rproject.org/web/packages/primerTree/index.html).  Source
 35 | code and developmental versions are available at .
 36 | 
 37 | ## Contact ##
 38 | [hesterj@ccf.org](mailto:hesterj@ccf.org)
 39 | 
 40 | ## Supplemental Information ##
 41 | Supplementary data are available at Bioinformatics online.
 42 | 
 43 | # Introduction #
 44 | Designing primers for PCR is the initial step in many biomedical, forensic and
 45 | metagenomic studies. There are a number of important factors to consider in
 46 | primer design. First one must consider the chemical properties of the primers,
 47 | including their length, melting temperature, GC content, secondary structure
 48 | and the likelihood of primer dimer formation. The primer pair must also
 49 | amplify specifically DNA from the target of interest and must not produce
 50 | offtarget products. In addition, for some studies, the DNA sequence amplified
 51 | must provide enough information to identify the source of the DNA. In
 52 | metagenomic studies, the primers need to amplify DNA from the taxon of interest
 53 | (e.g., bacteria, fungus, birds), not amplify unrelated taxon and provide enough
 54 | information after sequencing to characterize which members of the communities
 55 | are present in a given sample. Some clinical studies need to amplify only a
 56 | specific pathogen species and enable identifying which strains are present.
 57 | 
 58 | While there are efficient online tools for designing primers `r citep('10.1093/nar/gks596')`
 59 | and testing their specificity `r citep(c(primer_BLAST='10.1186/1471-2105-13-134'))`, there is no simple
 60 | tool to systematically assess the informativeness of the primer products.
 61 | PrimerTree is an R package that, given a set of primer pairs, provides visual
 62 | assessment of their specificity and informativeness by constructing
 63 | phylogenetic trees of the predicted PCR products along with their taxonomic
 64 | annotation. PrimerTree can run on a wide variety of hardware and only requires
 65 | two commands, making it accessible to a large audience.
 66 | 
 67 | # Methods #
 68 | PrimerTree successively performs primer search, retrieval of DNA sequences
 69 | predicted to be amplified, taxonomic identification of these sequences,
 70 | multiple sequence alignment, reconstruction of a phylogenetic tree, and
 71 | visualization of the tree with taxonomic annotation. Note that multiple primer
 72 | pairs can be queried simultaneously using PrimerTree.
 73 | 
 74 | PrimerTree utilizes the primer search implemented in Primer-BLAST
 75 | `r citep(c(primer_BLAST='10.1186/1471-2105-13-134'))`
 76 | by directly querying the NCBI Primer-BLAST search page. This allows using
 77 | all the options available on the NCBI site. By default PrimerTree searches the
 78 | NCBI (nt) nucleotide database but alternative NCBI databases can be chosen
 79 | through the function options. Note that when the primers are degenerated,
 80 | PrimerTree automatically tests all possible combinations of primer sequences
 81 | in Primer-BLAST and merges the results. The primer alignment results are then
 82 | processed using the NCBI Eutilities `r citep('http://www.ncbi.nlm.nih.gov/books/NBK25500/')`
 83 | to i) retrieve DNA sequences located between the primers (i.e., “amplified”)
 84 | and ii) obtain taxonomic information related to each DNA sequence using the
 85 | NCBI’s taxonomy database `r citep('http://www.ncbi.nlm.nih.gov/books/NBK21100/')`.
 86 | 
 87 | PrimerTree next aligns all amplified sequences using Clustal Omega
 88 | `r citep('10.1038/msb.2011.75')` and reconstructs a Neighbor-Joining tree using the ape package
 89 |  `r citep('10.1093/bioinformatics/btg412')`. Finally, PrimerTree displays the resulting phylogenetic
 90 | tree using the ggplot2 package, labeling each taxon in a different color and
 91 | adding the names of the main taxa using the directlabels package
 92 | `r citep(c(bib[['ggplot2']], bib[['directlabels']]))`.
 93 | 
 94 | # Results #
 95 | Figure 1 shows a subset of PrimerTree results for universal non-vascular plant (Bryophyte) primers
 96 | `r citep('10.1111/j.1365-294X.2012.05537.x')` targeting the chloroplast trnL gene (Figure 1A)
 97 | and mammal mitochondrial 16S ribosomal rna gene (Figure1B)
 98 | `r citep('10.1093/oxfordjournals.molbev.a025566')`. The tree display enables
 99 | rapid evaluation of the specificity of the primer pairs (e.g., offtarget
100 | amplification of amphibians and ray-finned fishes on Fig. 1B). In addition, the
101 | information encoded in the amplified DNA sequence can also be easily assessed
102 | by the length of the branches leading to different sequences (scaled in number
103 | of nucleotide differences). By default, PrimerTree displays the annotated
104 | phylogenetic tree for all taxonomic levels (e.g., kindom, phylum, class)
105 | enabling the user to determine the level of specificity of each primers (see
106 | e.g., Supplemental Figure 1).
107 | 
108 | For nondegenerated primers and using a single thread, PrimerTree usually runs
109 | in less than 240 seconds. However, the average runtime varies greatly depending
110 | on the primer specificity (i.e. how many DNA sequences are “amplified”), the
111 | search parameters chosen, and the current load on the NCBI servers and the
112 | internet connection. Highly degenerated primers result in large numbers of
113 | possible primer pairs, which can increase the runtime considerably. To limit
114 | maximum runtime in this situation, PrimerTree randomly samples only a portion
115 | of the total primer permutations. This provides a representative sample for
116 | most cases. Changing the number of sampled permutations or turning off sampling
117 | completely is possible. PrimerTree uses the plyr package extensively and has
118 | full support for any of the parallel backends compatible with the foreach
119 | package `r citep(c(bib[['plyr']], bib[['foreach']]))`. In particular, parallel retrieval of
120 | the primer sequences from NCBI speeds up the total runtime considerably. Note
121 | that parallel queries to Primer-BLAST are queued by NCBI’s servers and only
122 | processed once there is free compute time.
123 | 
124 | # Conclusions #
125 | Designing primers can be challenging, especially for metagenomic study, and
126 | dramatically impact the study results. We have developed the PrimerTree
127 | package to help assessing large numbers of designed primers and choosing the
128 | best primer pair for a given experiment. PrimerTree is an R package available
129 | on a wide variety of platforms and is very simple to use and install.
130 | PrimerTree is released under an opensource license and can be downloaded from
131 | , which also provides additional
132 | documentation.
133 | 
134 | ## Funding ##
135 | XXX
136 | 
137 | ## Conflict of Interest ##
138 | None declared
139 | 
140 | # References #
141 | ```{r paper_bibliography, results='asis', echo=F}
142 | bibliography(style='markdown')
143 | ```
144 | 
145 | ```{r paper_get_data, echo=F, eval=F}
146 | library(doMC)
147 | registerDoMC(8)
148 | mammals_16S = search_primer_pair(name='Mammals 16S', 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', num_aligns=1000, .parallel=T)
149 | bryophytes_trnL = search_primer_pair(name='Bryophyte trnL', 'GATTCAGGGAAACTTAGGTTG', 'CCATTGAGTCTCTGCACC', num_aligns=1000, .parallel=T)
150 | save(file='data/mammals_16S.RData', mammals_16S)
151 | save(file='data/bryophytes_trnL.RData', bryophytes_trnL)
152 | ```
153 | ```{r paper_figure1, dev='png', fig.height=4.5, fig.width=6, warning=FALSE}
154 | change_mapping = function(plot){
155 |   plot$layers[[1]]$geom_params$colour = 'lightgrey'
156 |   plot$layers[[4]]$mapping = aes(x=x, y=y, color=class, shape=class) #add shape to points mapping
157 |   plot$layers[[5]]$mapping = aes(x=x, y=y, label=class) #remove color from direct labels mapping
158 |   plot
159 | }
160 | p1 = change_mapping(plot(mammals_16S, ranks='class', main='Mammal 16S', rotate=45, size=4) + scale_colour_grey(start=0, end=.8)) #scale_color_brewer(palette="Set1"))
161 | p2 = change_mapping(plot(bryophytes_trnL, ranks='class', main='Bryophyte trnL', size=4) + scale_colour_grey(start=0, end=.8)) #scale_color_brewer(palette="Set1"))
162 | library(gridExtra)
163 | p3 = arrangeGrob(p2,p1, ncol=2, clip=F)
164 | grid.draw(p3)
165 | ```
166 | ```{r paper_figure1_big, dev='svg', fig.height=12, fig.width=12, depth=300, dev='postscript', warning=FALSE}
167 | setEPS()
168 | grid.draw(p3)
169 | ```
170 | ```{r paper_supplemental, fig.height=12, fig.width=12, depth=300, dev='postscript', warning=FALSE}
171 | setEPS()
172 | rm(scale_colour_discrete)
173 | plot(mammals_16S)
174 | plot(bryophytes_trnL)
175 | ```
176 | 


--------------------------------------------------------------------------------
/primerTree.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageInstallArgs: --no-multiarch --with-keep.source
17 | PackageRoxygenize: rd,namespace
18 | 


--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
 1 | #include 
 2 | #include 
 3 | #include  // for NULL
 4 | #include 
 5 | 
 6 | /* FIXME:
 7 |    Check these declarations against the C/Fortran source code.
 8 | */
 9 | 
10 | /* .Call calls */
11 | extern SEXP rawStreamToDNAbin(SEXP);
12 | 
13 | static const R_CallMethodDef CallEntries[] = {
14 |     {"rawStreamToDNAbin", (DL_FUNC) &rawStreamToDNAbin, 1},
15 |     {NULL, NULL, 0}
16 | };
17 | 
18 | void R_init_primerTree(DllInfo *dll)
19 | {
20 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
21 |     R_useDynamicSymbols(dll, FALSE);
22 | }
23 | 


--------------------------------------------------------------------------------
/src/read_dna.c:
--------------------------------------------------------------------------------
  1 | /* read_dna.c       2014-03-05 */
  2 | 
  3 | /* Copyright 2013-2014 Emmanuel Paradis */
  4 | 
  5 | /* This file is part of the R-package `ape'. */
  6 | /* See the file ../COPYING for licensing issues. */
  7 | 
  8 | #include 
  9 | #include 
 10 | 
 11 | // The initial code defining and initialising the translation table:
 12 | //
 13 | //	for (i = 0; i < 122; i++) tab_trans[i] = 0x00;
 14 | //
 15 | //	tab_trans[65] = 0x88; /* A */
 16 | //	tab_trans[71] = 0x48; /* G */
 17 | //	tab_trans[67] = 0x28; /* C */
 18 | // 	tab_trans[84] = 0x18; /* T */
 19 | // 	tab_trans[82] = 0xc0; /* R */
 20 | // 	tab_trans[77] = 0xa0; /* M */
 21 | // 	tab_trans[87] = 0x90; /* W */
 22 | // 	tab_trans[83] = 0x60; /* S */
 23 | // 	tab_trans[75] = 0x50; /* K */
 24 | // 	tab_trans[89] = 0x30; /* Y */
 25 | // 	tab_trans[86] = 0xe0; /* V */
 26 | // 	tab_trans[72] = 0xb0; /* H */
 27 | // 	tab_trans[68] = 0xd0; /* D */
 28 | //  	tab_trans[66] = 0x70; /* B */
 29 | // 	tab_trans[78] = 0xf0; /* N */
 30 | //
 31 | //	tab_trans[97] = 0x88; /* a */
 32 | //	tab_trans[103] = 0x48; /* g */
 33 | //	tab_trans[99] = 0x28; /* c */
 34 | // 	tab_trans[116] = 0x18; /* t */
 35 | // 	tab_trans[114] = 0xc0; /* r */
 36 | // 	tab_trans[109] = 0xa0; /* m */
 37 | // 	tab_trans[119] = 0x90; /* w */
 38 | // 	tab_trans[115] = 0x60; /* s */
 39 | // 	tab_trans[107] = 0x50; /* k */
 40 | // 	tab_trans[121] = 0x30; /* y */
 41 | // 	tab_trans[118] = 0xe0; /* v */
 42 | // 	tab_trans[104] = 0xb0; /* h */
 43 | // 	tab_trans[100] = 0xd0; /* d */
 44 | //  	tab_trans[98] = 0x70; /* b */
 45 | // 	tab_trans[110] = 0xf0; /* n */
 46 | //
 47 | //  	tab_trans[45] = 0x04; /* - */
 48 | //  	tab_trans[63] = 0x02; /* ? */
 49 | 
 50 | static const unsigned char tab_trans[] = {
 51 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0-9 */
 52 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10-19 */
 53 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 20-29 */
 54 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 30-39 */
 55 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, /* 40-49 */
 56 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 50-59 */
 57 | 	0x00, 0x00, 0x00, 0x02, 0x00, 0x88, 0x70, 0x28, 0xd0, 0x00, /* 60-69 */
 58 | 	0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */
 59 | 	0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, 0x00, 0x30, /* 80-89 */
 60 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x70, 0x28, /* 90-99 */
 61 | 	0xd0, 0x00, 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, /* 100-109 */
 62 | 	0xf0, 0x00, 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, /* 110-119 */
 63 | 	0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120-129 */
 64 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 130-139 */
 65 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 140-149 */
 66 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 150-159 */
 67 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-169 */
 68 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 170-179 */
 69 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 180-189 */
 70 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 190-199 */
 71 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-209 */
 72 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 210-219 */
 73 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 220-229 */
 74 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 230-239 */
 75 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-249 */
 76 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 250-255 */
 77 | 
 78 | static const unsigned char hook = 0x3e;
 79 | static const unsigned char lineFeed = 0x0a;
 80 | /* static const unsigned char space = 0x20; */
 81 | 
 82 | SEXP rawStreamToDNAbin(SEXP x)
 83 | {
 84 | 	int N, i, j, k, n, startOfSeq;
 85 | 	unsigned char *xr, *rseq, *buffer, tmp;
 86 | 	SEXP obj, nms, seq;
 87 | 
 88 | 	PROTECT(x = coerceVector(x, RAWSXP));
 89 | 	N = LENGTH(x);
 90 | 	xr = RAW(x);
 91 | 
 92 | /* do a 1st pass to find the number of sequences
 93 | 
 94 |    this code should be robust to '>' present inside
 95 |    a label or in the header text before the sequences */
 96 | 
 97 | 	n = j = 0; /* use j as a flag */
 98 | 	if (xr[0] == hook) {
 99 | 		j = 1;
100 | 		startOfSeq = 0;
101 | 	}
102 | 	i = 1;
103 | 	for (i = 1; i < N; i++) {
104 | 		if (j && xr[i] == lineFeed) {
105 | 			n++;
106 | 			j = 0;
107 | 		} else if (xr[i] == hook) {
108 | 			if (!n) startOfSeq = i;
109 | 			j = 1;
110 | 		}
111 | 	}
112 | 
113 | 	PROTECT(obj = allocVector(VECSXP, n));
114 | 	PROTECT(nms = allocVector(STRSXP, n));
115 | 
116 | /* Refine the way the size of the buffer is set? */
117 | 	buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *));
118 | 
119 | 	i = startOfSeq;
120 | 	j = 0; /* gives the index of the sequence */
121 | 	while (i < N) {
122 | 		/* 1st read the label... */
123 | 		i++;
124 | 		k = 0;
125 | 		while (xr[i] != lineFeed) buffer[k++] = xr[i++];
126 | 		buffer[k] = '\0';
127 | 		SET_STRING_ELT(nms, j, mkChar((char *)buffer));
128 | 		/* ... then read the sequence */
129 | 		n = 0;
130 | 		while (i < N && xr[i] != hook) {
131 | 			tmp = tab_trans[xr[i++]];
132 | /* If we are sure that the FASTA file is correct (ie, the sequence on
133 |    a single line and only the IUAPC code (plus '-' and '?') is used,
134 |    then the following check would not be needed; additionally the size
135 |    of tab_trans could be restriced to 0-121. This check has the
136 |    advantage that all invalid characters are simply ignored without
137 |    causing error -- except if '>' occurs in the middle of a sequence. */
138 | 			if (tmp) buffer[n++] = tmp;
139 | 		}
140 | 		PROTECT(seq = allocVector(RAWSXP, n));
141 | 		rseq = RAW(seq);
142 | 		for (k = 0; k < n; k++) rseq[k] = buffer[k];
143 | 		SET_VECTOR_ELT(obj, j, seq);
144 | 		UNPROTECT(1);
145 | 		j++;
146 | 	}
147 | 	setAttrib(obj, R_NamesSymbol, nms);
148 | 	UNPROTECT(3);
149 | 	return obj;
150 | }
151 | 


--------------------------------------------------------------------------------