├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE.md
├── Makefile
├── NAMESPACE
├── NEWS.md
├── R
    ├── filterResults.R
    ├── plot.R
    ├── primerTree.R
    ├── search.R
    ├── sequence.R
    └── taxonomy.R
├── README.md
├── TODO.md
├── data
    ├── bryophytes_trnL.RData
    └── mammals_16S.RData
├── man
    ├── accession2taxid.Rd
    ├── bryophytes_trnL.Rd
    ├── calc_rank_dist_ave.Rd
    ├── clustalo.Rd
    ├── filter_seqs.Rd
    ├── get_sequence.Rd
    ├── get_sequences.Rd
    ├── get_taxonomy.Rd
    ├── identify.primerTree_plot.Rd
    ├── layout_tree_ape.Rd
    ├── mammals_16S.Rd
    ├── parse_primer_hits.Rd
    ├── plot.primerTree.Rd
    ├── plot_tree.Rd
    ├── plot_tree_ranks.Rd
    ├── primerTree.Rd
    ├── primer_search.Rd
    ├── search_primer_pair.Rd
    ├── seq_lengths.Rd
    ├── seq_lengths.primerTree.Rd
    ├── summary.primerTree.Rd
    └── tree_from_alignment.Rd
├── paper.Rmd
├── primerTree.Rproj
└── src
    ├── init.c
    └── read_dna.c


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^CRAN-RELEASE$
 2 | ^.*\.Rproj$
 3 | ^\.Rproj\.user$
 4 | ^Makefile$
 5 | ^.*docx$
 6 | ^.*.md$
 7 | ^figure$
 8 | ^citations$
 9 | ^.*\.bib$
10 | ^test\.fa$
11 | ^.*\.html$
12 | ^\.travis\.yml$
13 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 | src/*.o
5 | src/*.so
6 | src/*.dll
7 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | # R for travis: see documentation at https://docs.travis-ci.com/user/languages/r
2 | 
3 | language: R
4 | sudo: false
5 | cache: packages
6 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: primerTree
 2 | Title: Visually Assessing the Specificity and Informativeness of
 3 |     Primer Pairs
 4 | Version: 1.0.7
 5 | Authors@R: 
 6 |     c(person(given = "Jim",
 7 |           family = "Hester",
 8 |           role = "aut",
 9 |           email = "james.hester@gmail.com"),
10 |       person(given = "Matt",
11 |         family = "Cannon",
12 |         role = c("aut", "cre"),
13 |         email = "matthewvc1@gmail.com"))
14 | Description: Identifies potential target sequences for a given
15 |     set of primers and generates phylogenetic trees annotated with the
16 |     taxonomies of the predicted amplification products.
17 | License: GPL-2
18 | Depends:
19 |     directlabels,
20 |     gridExtra,
21 |     R (>= 3.0.0)
22 | Imports:
23 |     ape,
24 |     foreach,
25 |     ggplot2,
26 |     grid,
27 |     httr,
28 |     lubridate,
29 |     plyr,
30 |     reshape2,
31 |     scales,
32 |     stringr,
33 |     XML
34 | Encoding: UTF-8
35 | LazyData: true
36 | RoxygenNote: 7.2.3
37 | 


--------------------------------------------------------------------------------
/LICENSE.md:
--------------------------------------------------------------------------------
  1 | PrimerTree
  2 | Copyright (C) 2013 Jim Hester
  3 | 
  4 |                     GNU GENERAL PUBLIC LICENSE
  5 |                        Version 2, June 1991
  6 | 
  7 |  Copyright (C) 1989, 1991 Free Software Foundation, Inc., [http://fsf.org/]
  8 |  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  9 |  Everyone is permitted to copy and distribute verbatim copies
 10 |  of this license document, but changing it is not allowed.
 11 | 
 12 |                             Preamble
 13 | 
 14 |   The licenses for most software are designed to take away your
 15 | freedom to share and change it.  By contrast, the GNU General Public
 16 | License is intended to guarantee your freedom to share and change free
 17 | software--to make sure the software is free for all its users.  This
 18 | General Public License applies to most of the Free Software
 19 | Foundation's software and to any other program whose authors commit to
 20 | using it.  (Some other Free Software Foundation software is covered by
 21 | the GNU Lesser General Public License instead.)  You can apply it to
 22 | your programs, too.
 23 | 
 24 |   When we speak of free software, we are referring to freedom, not
 25 | price.  Our General Public Licenses are designed to make sure that you
 26 | have the freedom to distribute copies of free software (and charge for
 27 | this service if you wish), that you receive source code or can get it
 28 | if you want it, that you can change the software or use pieces of it
 29 | in new free programs; and that you know you can do these things.
 30 | 
 31 |   To protect your rights, we need to make restrictions that forbid
 32 | anyone to deny you these rights or to ask you to surrender the rights.
 33 | These restrictions translate to certain responsibilities for you if you
 34 | distribute copies of the software, or if you modify it.
 35 | 
 36 |   For example, if you distribute copies of such a program, whether
 37 | gratis or for a fee, you must give the recipients all the rights that
 38 | you have.  You must make sure that they, too, receive or can get the
 39 | source code.  And you must show them these terms so they know their
 40 | rights.
 41 | 
 42 |   We protect your rights with two steps: (1) copyright the software, and
 43 | (2) offer you this license which gives you legal permission to copy,
 44 | distribute and/or modify the software.
 45 | 
 46 |   Also, for each author's protection and ours, we want to make certain
 47 | that everyone understands that there is no warranty for this free
 48 | software.  If the software is modified by someone else and passed on, we
 49 | want its recipients to know that what they have is not the original, so
 50 | that any problems introduced by others will not reflect on the original
 51 | authors' reputations.
 52 | 
 53 |   Finally, any free program is threatened constantly by software
 54 | patents.  We wish to avoid the danger that redistributors of a free
 55 | program will individually obtain patent licenses, in effect making the
 56 | program proprietary.  To prevent this, we have made it clear that any
 57 | patent must be licensed for everyone's free use or not licensed at all.
 58 | 
 59 |   The precise terms and conditions for copying, distribution and
 60 | modification follow.
 61 | 
 62 |                     GNU GENERAL PUBLIC LICENSE
 63 |    TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
 64 | 
 65 |   0. This License applies to any program or other work which contains
 66 | a notice placed by the copyright holder saying it may be distributed
 67 | under the terms of this General Public License.  The "Program", below,
 68 | refers to any such program or work, and a "work based on the Program"
 69 | means either the Program or any derivative work under copyright law:
 70 | that is to say, a work containing the Program or a portion of it,
 71 | either verbatim or with modifications and/or translated into another
 72 | language.  (Hereinafter, translation is included without limitation in
 73 | the term "modification".)  Each licensee is addressed as "you".
 74 | 
 75 | Activities other than copying, distribution and modification are not
 76 | covered by this License; they are outside its scope.  The act of
 77 | running the Program is not restricted, and the output from the Program
 78 | is covered only if its contents constitute a work based on the
 79 | Program (independent of having been made by running the Program).
 80 | Whether that is true depends on what the Program does.
 81 | 
 82 |   1. You may copy and distribute verbatim copies of the Program's
 83 | source code as you receive it, in any medium, provided that you
 84 | conspicuously and appropriately publish on each copy an appropriate
 85 | copyright notice and disclaimer of warranty; keep intact all the
 86 | notices that refer to this License and to the absence of any warranty;
 87 | and give any other recipients of the Program a copy of this License
 88 | along with the Program.
 89 | 
 90 | You may charge a fee for the physical act of transferring a copy, and
 91 | you may at your option offer warranty protection in exchange for a fee.
 92 | 
 93 |   2. You may modify your copy or copies of the Program or any portion
 94 | of it, thus forming a work based on the Program, and copy and
 95 | distribute such modifications or work under the terms of Section 1
 96 | above, provided that you also meet all of these conditions:
 97 | 
 98 |     a) You must cause the modified files to carry prominent notices
 99 |     stating that you changed the files and the date of any change.
100 | 
101 |     b) You must cause any work that you distribute or publish, that in
102 |     whole or in part contains or is derived from the Program or any
103 |     part thereof, to be licensed as a whole at no charge to all third
104 |     parties under the terms of this License.
105 | 
106 |     c) If the modified program normally reads commands interactively
107 |     when run, you must cause it, when started running for such
108 |     interactive use in the most ordinary way, to print or display an
109 |     announcement including an appropriate copyright notice and a
110 |     notice that there is no warranty (or else, saying that you provide
111 |     a warranty) and that users may redistribute the program under
112 |     these conditions, and telling the user how to view a copy of this
113 |     License.  (Exception: if the Program itself is interactive but
114 |     does not normally print such an announcement, your work based on
115 |     the Program is not required to print an announcement.)
116 | 
117 | These requirements apply to the modified work as a whole.  If
118 | identifiable sections of that work are not derived from the Program,
119 | and can be reasonably considered independent and separate works in
120 | themselves, then this License, and its terms, do not apply to those
121 | sections when you distribute them as separate works.  But when you
122 | distribute the same sections as part of a whole which is a work based
123 | on the Program, the distribution of the whole must be on the terms of
124 | this License, whose permissions for other licensees extend to the
125 | entire whole, and thus to each and every part regardless of who wrote it.
126 | 
127 | Thus, it is not the intent of this section to claim rights or contest
128 | your rights to work written entirely by you; rather, the intent is to
129 | exercise the right to control the distribution of derivative or
130 | collective works based on the Program.
131 | 
132 | In addition, mere aggregation of another work not based on the Program
133 | with the Program (or with a work based on the Program) on a volume of
134 | a storage or distribution medium does not bring the other work under
135 | the scope of this License.
136 | 
137 |   3. You may copy and distribute the Program (or a work based on it,
138 | under Section 2) in object code or executable form under the terms of
139 | Sections 1 and 2 above provided that you also do one of the following:
140 | 
141 |     a) Accompany it with the complete corresponding machine-readable
142 |     source code, which must be distributed under the terms of Sections
143 |     1 and 2 above on a medium customarily used for software interchange; or,
144 | 
145 |     b) Accompany it with a written offer, valid for at least three
146 |     years, to give any third party, for a charge no more than your
147 |     cost of physically performing source distribution, a complete
148 |     machine-readable copy of the corresponding source code, to be
149 |     distributed under the terms of Sections 1 and 2 above on a medium
150 |     customarily used for software interchange; or,
151 | 
152 |     c) Accompany it with the information you received as to the offer
153 |     to distribute corresponding source code.  (This alternative is
154 |     allowed only for noncommercial distribution and only if you
155 |     received the program in object code or executable form with such
156 |     an offer, in accord with Subsection b above.)
157 | 
158 | The source code for a work means the preferred form of the work for
159 | making modifications to it.  For an executable work, complete source
160 | code means all the source code for all modules it contains, plus any
161 | associated interface definition files, plus the scripts used to
162 | control compilation and installation of the executable.  However, as a
163 | special exception, the source code distributed need not include
164 | anything that is normally distributed (in either source or binary
165 | form) with the major components (compiler, kernel, and so on) of the
166 | operating system on which the executable runs, unless that component
167 | itself accompanies the executable.
168 | 
169 | If distribution of executable or object code is made by offering
170 | access to copy from a designated place, then offering equivalent
171 | access to copy the source code from the same place counts as
172 | distribution of the source code, even though third parties are not
173 | compelled to copy the source along with the object code.
174 | 
175 |   4. You may not copy, modify, sublicense, or distribute the Program
176 | except as expressly provided under this License.  Any attempt
177 | otherwise to copy, modify, sublicense or distribute the Program is
178 | void, and will automatically terminate your rights under this License.
179 | However, parties who have received copies, or rights, from you under
180 | this License will not have their licenses terminated so long as such
181 | parties remain in full compliance.
182 | 
183 |   5. You are not required to accept this License, since you have not
184 | signed it.  However, nothing else grants you permission to modify or
185 | distribute the Program or its derivative works.  These actions are
186 | prohibited by law if you do not accept this License.  Therefore, by
187 | modifying or distributing the Program (or any work based on the
188 | Program), you indicate your acceptance of this License to do so, and
189 | all its terms and conditions for copying, distributing or modifying
190 | the Program or works based on it.
191 | 
192 |   6. Each time you redistribute the Program (or any work based on the
193 | Program), the recipient automatically receives a license from the
194 | original licensor to copy, distribute or modify the Program subject to
195 | these terms and conditions.  You may not impose any further
196 | restrictions on the recipients' exercise of the rights granted herein.
197 | You are not responsible for enforcing compliance by third parties to
198 | this License.
199 | 
200 |   7. If, as a consequence of a court judgment or allegation of patent
201 | infringement or for any other reason (not limited to patent issues),
202 | conditions are imposed on you (whether by court order, agreement or
203 | otherwise) that contradict the conditions of this License, they do not
204 | excuse you from the conditions of this License.  If you cannot
205 | distribute so as to satisfy simultaneously your obligations under this
206 | License and any other pertinent obligations, then as a consequence you
207 | may not distribute the Program at all.  For example, if a patent
208 | license would not permit royalty-free redistribution of the Program by
209 | all those who receive copies directly or indirectly through you, then
210 | the only way you could satisfy both it and this License would be to
211 | refrain entirely from distribution of the Program.
212 | 
213 | If any portion of this section is held invalid or unenforceable under
214 | any particular circumstance, the balance of the section is intended to
215 | apply and the section as a whole is intended to apply in other
216 | circumstances.
217 | 
218 | It is not the purpose of this section to induce you to infringe any
219 | patents or other property right claims or to contest validity of any
220 | such claims; this section has the sole purpose of protecting the
221 | integrity of the free software distribution system, which is
222 | implemented by public license practices.  Many people have made
223 | generous contributions to the wide range of software distributed
224 | through that system in reliance on consistent application of that
225 | system; it is up to the author/donor to decide if he or she is willing
226 | to distribute software through any other system and a licensee cannot
227 | impose that choice.
228 | 
229 | This section is intended to make thoroughly clear what is believed to
230 | be a consequence of the rest of this License.
231 | 
232 |   8. If the distribution and/or use of the Program is restricted in
233 | certain countries either by patents or by copyrighted interfaces, the
234 | original copyright holder who places the Program under this License
235 | may add an explicit geographical distribution limitation excluding
236 | those countries, so that distribution is permitted only in or among
237 | countries not thus excluded.  In such case, this License incorporates
238 | the limitation as if written in the body of this License.
239 | 
240 |   9. The Free Software Foundation may publish revised and/or new versions
241 | of the General Public License from time to time.  Such new versions will
242 | be similar in spirit to the present version, but may differ in detail to
243 | address new problems or concerns.
244 | 
245 | Each version is given a distinguishing version number.  If the Program
246 | specifies a version number of this License which applies to it and "any
247 | later version", you have the option of following the terms and conditions
248 | either of that version or of any later version published by the Free
249 | Software Foundation.  If the Program does not specify a version number of
250 | this License, you may choose any version ever published by the Free Software
251 | Foundation.
252 | 
253 |   10. If you wish to incorporate parts of the Program into other free
254 | programs whose distribution conditions are different, write to the author
255 | to ask for permission.  For software which is copyrighted by the Free
256 | Software Foundation, write to the Free Software Foundation; we sometimes
257 | make exceptions for this.  Our decision will be guided by the two goals
258 | of preserving the free status of all derivatives of our free software and
259 | of promoting the sharing and reuse of software generally.
260 | 
261 |                             NO WARRANTY
262 | 
263 |   11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
264 | FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
265 | OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
266 | PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
267 | OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
268 | MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
269 | TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
270 | PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
271 | REPAIR OR CORRECTION.
272 | 
273 |   12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
274 | WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
275 | REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
276 | INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
277 | OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
278 | TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
279 | YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
280 | PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
281 | POSSIBILITY OF SUCH DAMAGES.
282 | 
283 |                      END OF TERMS AND CONDITIONS
284 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | BASE=$(wildcard R/*R)
 2 | all: install
 3 | 
 4 | install: R_package
 5 | 
 6 | R_package: $(BASE)
 7 | 	Rscript -e 'library(devtools);install(".")'
 8 | 	touch R_package
 9 | 
10 | make clean:
11 | 	rm -f inst/doc/*.html inst/doc/*.md
12 | 
13 | #from yihui's knitr Makefile
14 | PKGNAME := $(shell sed -n "s/Package: *\([^ ]*\)/\1/p" DESCRIPTION)
15 | PKGVERS := $(shell sed -n "s/Version: *\([^ ]*\)/\1/p" DESCRIPTION)
16 | PKGSRC  := $(shell basename `pwd`)
17 | 
18 | docs:
19 | 	Rscript -e 'library(devtools);library(methods);library(utils);document(".")'
20 | 
21 | build:
22 | 	cd ..;\
23 | 	R CMD build $(PKGSRC)
24 | 
25 | check: build
26 | 	cd ..;\
27 | 	R CMD check $(PKGNAME)_$(PKGVERS).tar.gz --as-cran
28 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(filter_seqs,primerTree)
 4 | S3method(identify,primerTree_plot)
 5 | S3method(plot,primerTree)
 6 | S3method(print,primerTree)
 7 | S3method(print,primerTree_plot_multi)
 8 | S3method(seq_lengths,primerTree)
 9 | S3method(summary,primerTree)
10 | export(accession2taxid)
11 | export(calc_rank_dist_ave)
12 | export(clustalo)
13 | export(filter_seqs)
14 | export(get_sequence)
15 | export(get_sequences)
16 | export(get_taxonomy)
17 | export(parse_primer_hits)
18 | export(plot_tree)
19 | export(plot_tree_ranks)
20 | export(primer_search)
21 | export(search_primer_pair)
22 | export(seq_lengths)
23 | export(tree_from_alignment)
24 | import(XML)
25 | import(ape)
26 | import(directlabels)
27 | import(foreach)
28 | import(ggplot2)
29 | import(gridExtra)
30 | import(httr)
31 | import(plyr)
32 | import(stringr)
33 | importFrom(grDevices,dev.cur)
34 | importFrom(grDevices,dev.off)
35 | importFrom(grDevices,dev.set)
36 | importFrom(grDevices,pdf)
37 | importFrom(grid,grid.locator)
38 | importFrom(lubridate,"%--%")
39 | importFrom(lubridate,now)
40 | importFrom(lubridate,seconds)
41 | importFrom(reshape2,melt)
42 | importFrom(scales,expand_range)
43 | importFrom(stats,na.omit)
44 | importFrom(stats,quantile)
45 | importFrom(utils,capture.output)
46 | useDynLib(primerTree,rawStreamToDNAbin)
47 | 


--------------------------------------------------------------------------------
/NEWS.md:
--------------------------------------------------------------------------------
 1 | # primerTree 1.0.7
 2 | 
 3 | * Couple of internal changes to fix CRAN issue
 4 | * Also updated documentation for filter_seqs
 5 | 
 6 | # primerTree 1.0.6
 7 | 
 8 | * Changed how matches are handled internally to use accession numbers instead of
 9 |   gi numbers.
10 | 
11 | # primerTree 1.0.5
12 | 
13 | * `search_primer_pair()` gains a `api_key` parameter, or you can set `NCBI_API_KEY` as an environment variable (@MVesuviusC, #36)
14 | * Add RCurl to Imports, it is an implicit dependency due to a S4 Class being
15 |   included in the data for the package and CRAN is now checking that packages
16 |   load without Suggested packages.
17 | 
18 | # primerTree 1.0.4
19 | 
20 | * Add RCurl to Suggests, it is an implicit dependency due to a S4 Class being
21 |   included in the data for the package.
22 | 
23 | * Added a `NEWS.md` file to track changes to the package.
24 | 


--------------------------------------------------------------------------------
/R/filterResults.R:
--------------------------------------------------------------------------------
 1 | #PrimerTree
 2 | #Copyright (C) 2013 Jim Hester
 3 | 
 4 | 
 5 | #' Filter out sequences retrieved by search_primer_pair() that are either too
 6 | #' short or too long. The alignment and tree will be recalculated after removing
 7 | #' unwanted reads.
 8 | #' @param x a primerTree object
 9 | #' @param min_length the minimum sequence length to keep
10 | #' @param max_length the maximum sequence length to keep
11 | #' @param ... additional arguments passed to methods.
12 | #' @return a primerTree object
13 | #' @examples
14 | #' \dontrun{
15 | #' # filter out sequences longer or shorter than desired:
16 | #' mammals_16S_filtered <- filter_seqs(mammals_16S, min_length=131, max_length=156)
17 | #' }
18 | #' @export
19 | filter_seqs = function(x, ...) UseMethod("filter_seqs")
20 | 
21 | #' @describeIn filter_seqs Method for primerTree objects
22 | #' @export
23 | filter_seqs.primerTree = function(x, min_length = 0, max_length = Inf, ...) {
24 |   #if(is.null(x$sequence)) {
25 |   #  print "No sequences in primerTree object"
26 |   #}
27 |   
28 |   # calculate how many removed and print out to screen
29 |   lengths <- vapply(x$sequence, length, integer(1))
30 |   above_max <- lengths >= max_length
31 |   below_min <- lengths <= min_length
32 |   message(sum(below_min), " sequences below ", min_length) # same for min
33 |   message(sum(above_max), " sequences above ", max_length) # same for min
34 |   
35 |   #filter the sequences
36 |   x$sequence <- x$sequence[lengths >= min_length & lengths <= max_length]
37 |       
38 |   # realign filtered sequences
39 |   x$alignment <- clustalo(x$sequence)
40 |   
41 |   # remake tree
42 |   x$tree <- tree_from_alignment(x$alignment)
43 |   x
44 | }
45 | 
46 | #' Get a summary of sequence lengths from a primerTree object
47 | #' @param x a primerTree object. 
48 | #' @param summarize a logical indicating if a summary should be displayed
49 | #' @return a table of sequence length frequencies
50 | #' @examples 
51 | #' 
52 | #' # Show the counts for each length
53 | #' seq_lengths(mammals_16S)
54 | #' 
55 | #' # Plot the distribution of lengths
56 | #' seqLengths <- seq_lengths(mammals_16S)
57 | #' barplot(seqLengths, 
58 | #'  main = "Frequency of sequence lengths for 16S mammal primers", 
59 | #'  xlab="Amplicon length (in bp)", 
60 | #'  ylab=("Frequency"))
61 | #' @export
62 | seq_lengths <- function(x,summarize = TRUE) UseMethod("seq_lengths")
63 | 
64 | #' Method for primerTree objects
65 | #' @inheritParams seq_lengths
66 | #' @export
67 | seq_lengths.primerTree <- function(x, summarize = TRUE) {
68 |   lengths <- vapply(x$sequence, length, integer(1))
69 |   message("sequence length distribution: ")
70 |   table(as.factor(lengths))
71 | }
72 | 


--------------------------------------------------------------------------------
/R/plot.R:
--------------------------------------------------------------------------------
  1 | #PrimerTree
  2 | #Copyright (C) 2013 Jim Hester
  3 | 
  4 | #' plots a tree along with a series of taxonomic ranks
  5 | #' @param ranks The ranks to include, defaults to all common ranks, if null
  6 | #' print all ranks.
  7 | #' @inheritParams plot_tree
  8 | #' @seealso \code{\link{plot_tree}} to plot only a single rank or the just the
  9 | #' tree layout.
 10 | #' @export
 11 | #' @examples
 12 | #' library(gridExtra)
 13 | #' library(directlabels)
 14 | #' #plot all the common ranks
 15 | #' plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy)
 16 | #' #plot specific ranks, with a larger dot size
 17 | #' plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy,
 18 | #'   ranks=c('kingdom', 'class', 'family'), size=3)
 19 | 
 20 | plot_tree_ranks = function(tree, taxonomy, main=NULL, type='unrooted',
 21 |                       ranks=common_ranks,  size=2,
 22 |                                     guide_size=NULL, legend_cutoff=25, ...){
 23 |   if(is.null(ranks))
 24 |     ranks = setdiff(names(taxonomy), c('accession', 'taxId'))
 25 | 
 26 |   plots = list()
 27 |   plots$structure = plot_tree(tree, main=main, guide_size=guide_size, type=type, ...)
 28 | 
 29 |   for(rank in intersect(ranks, names(taxonomy))){
 30 |     if(length(na.omit(taxonomy[rank])) > 0){
 31 |       plots[[rank]]= plot_tree(tree, guide_size=guide_size, type=type, rank=rank, taxonomy=taxonomy, size=size, legend_cutoff=legend_cutoff, ...)
 32 |     }
 33 |   }
 34 |   p = do.call(arrangeGrob, plots)
 35 |   class(p) <- c("primerTree_plot_multi", class(p))
 36 | 
 37 |   p
 38 | }
 39 | common_ranks = c("kingdom", "phylum", "class", "order", "family", "genus", "species")
 40 | 
 41 | #' plots a tree, optionally with colored and labeled points by taxonomic rank
 42 | #'
 43 | #' @param tree to be plotted, use layout_tree to layout tree.
 44 | #' @param taxonomy A data.frame with an accession field corresponding to the
 45 | #'        tree tip labels.
 46 | #' @param main An optional title for the plot
 47 | #' @param type The type of tree to plot, default unrooted.
 48 | #' @param rank The rank to include, if null only the tree is plotted
 49 | #' @param size The size of the colored points
 50 | #' @param guide_size The size of the length guide.  If NULL auto detects a
 51 | #'        reasonable size.
 52 | #' @param legend_cutoff The number of different taxa names after which the
 53 | #'        names are no longer printed.
 54 | #' @param ... additional arguments passed to \code{\link{layout_tree_ape}}
 55 | #' @return plot to be printed.
 56 | #' @export
 57 | 
 58 | plot_tree = function(tree, type='unrooted', main=NULL, guide_size=NULL,
 59 |                      rank=NULL, taxonomy=NULL,  size=2, legend_cutoff=25, ...){
 60 | 
 61 |   x = layout_tree_ape(tree, type=type, ...)
 62 | 
 63 |   range_x = range(x$edge$x, x$tip$x)
 64 |   range_y = expand_range(range(x$edqe$y, x$tip$y), mul=.1)
 65 | 
 66 |   if(is.null(guide_size)){
 67 |     guide_size = 10**(round_any(log10(range_x[2]-range_x[1]), 1)-1)
 68 |   }
 69 |   p = ggplot() +
 70 |    geom_segment(data=x$edge, aes_string(x='x', y='y', xend='xend', yend='yend')) +
 71 |     theme_noaxis() +
 72 |     annotate('segment', x=range_x[1], xend=range_x[1]+guide_size,
 73 |              y=range_y[1], yend=range_y[1],
 74 |              arrow=arrow(ends="both",angle=90, length=unit(.2,"cm"))) +
 75 |     annotate('text', x=range_x[1]+(guide_size/2), y=range_y[1], label=guide_size, vjust=-.5)
 76 | 
 77 |   if(!is.null(rank)){
 78 |     if(is.null(taxonomy))
 79 |       stop('Must provide a taxonomy if plotting a rank')
 80 | 
 81 |     if(is.null(main))
 82 |       main = rank
 83 | 
 84 |       x$tip = merge(x$tip, taxonomy, by.x='label', by.y='accession', all.x=T)
 85 | 
 86 |       rows = na.omit(x$tip[, c('x','y',rank)])
 87 |       p = p+geom_point(data=rows, aes_string(x='x', y='y', color=rank),
 88 |                      size=size, na.rm=T) + theme(legend.position='none')
 89 | 
 90 |         if(length(unique(rows[[rank]])) < legend_cutoff){
 91 |           smart.grid2 = list('get.means', 'calc.boxes', 'empty.grid')
 92 |           p = p + geom_dl(data=rows, method=smart.grid2,
 93 |                     aes_string(x='x', y='y', color=rank, label=rank))
 94 |         }
 95 |   }
 96 |   p = p + ggtitle(main)
 97 |   class(p) = c(class(p), 'primerTree_plot')
 98 |   p
 99 | }
100 | 
101 | #' layout a tree using ape, return an object to be plotted by
102 | #' \code{\link{plot_tree}}
103 | #' @param tree The \code{\link{phylo}} tree to be plotted
104 | #' @param ... additional arguments to \code{\link{plot.phylo}}
105 | #' @return \item{edge}{list of x, y and xend, yend coordinates
106 | #' as well as ids for the edges}
107 | #' \item{tips}{list of x, y, label and id for the tips}
108 | #' \item{nodes}{list of x, y and id for the nodes}
109 | layout_tree_ape = function(tree, ...){
110 |   #hack to write no output
111 |   cur_dev = dev.cur() #store previous dev
112 |   temp_file = tempfile()
113 |   pdf(file=temp_file)
114 |   plot.phylo(tree, plot=F, ...)
115 |   dev.off()
116 |   unlink(temp_file)
117 |   dev.set(cur_dev) #restore previous dev
118 | 
119 |   last = .PlotPhyloEnv$last_plot.phylo
120 |   new = list()
121 |   new$edge$x = last$xx[last$edge[,1]]
122 |   new$edge$xend = last$xx[last$edge[,2]]
123 | 
124 |   new$edge$y = last$yy[last$edge[,1]]
125 |   new$edge$yend = last$yy[last$edge[,2]]
126 |   new$edge$id = tree$edge
127 | 
128 |   new$edge = data.frame(new$edge, stringsAsFactors=F)
129 | 
130 |   new$tip$x = last$xx[1:last$Ntip]
131 |   new$tip$y = last$yy[1:last$Ntip]
132 |   new$tip$label = tree$tip.label
133 |   new$tip$id = 1:last$Ntip
134 | 
135 |   new$tip = data.frame(new$tip, stringsAsFactors=F)
136 | 
137 |   new$node$x = last$xx[(last$Ntip + 1):length(last$xx)]
138 |   new$node$y = last$yy[(last$Ntip + 1):length(last$yy)]
139 |   new$node$label = tree$node.label
140 |   new$node = data.frame(new$node, stringsAsFactors=F)
141 | 
142 |   new
143 | }
144 | theme_noaxis = function(){
145 |   theme(panel.border=element_blank(), panel.grid=element_blank(),
146 |         axis.line=element_blank(), axis.text=element_blank(),
147 |         axis.title=element_blank(), axis.ticks=element_blank(),
148 |         plot.margin=unit(c(0, 0, -1, -1), 'lines'))
149 | }
150 | 
151 | #' @export
152 | print.primerTree_plot_multi <- function(x, ...) {
153 |    grid.arrange(x, ...)
154 | }
155 | 


--------------------------------------------------------------------------------
/R/primerTree.R:
--------------------------------------------------------------------------------
  1 | #PrimerTree
  2 | #Copyright (C) 2013 Jim Hester
  3 | 
  4 | #' \pkg{primerTree} Visually Assessing the Specificity and Informativeness of Primer Pairs
  5 | #'
  6 | #' \code{primerTree} has two main commands:
  7 | #' \code{\link{search_primer_pair}} which takes a primer pair and returns an
  8 | #' primerTree object of the search results
  9 | #' \code{\link{plot.primerTree}} a S3 method for plotting the primerTree object
 10 | #' obtained using \code{\link{search_primer_pair}}
 11 | #' @name primerTree
 12 | #' @docType package
 13 | #' @aliases primerTree-package
 14 | #' @import ggplot2 XML ape httr plyr directlabels gridExtra
 15 | #'   stringr foreach
 16 | #' @importFrom lubridate %--% seconds now
 17 | #' @importFrom grid grid.locator
 18 | #' @importFrom scales expand_range
 19 | #' @importFrom grDevices dev.cur dev.off dev.set pdf
 20 | #' @importFrom stats na.omit quantile
 21 | #' @importFrom utils capture.output
 22 | #' @useDynLib primerTree rawStreamToDNAbin
 23 | NULL
 24 | 
 25 | #' PrimerTree results for the mammalian 16S primers
 26 | #' @name mammals_16S
 27 | #' @docType data
 28 | NULL
 29 | 
 30 | #' PrimerTree results for the bryophyte trnL primers
 31 | #' @name bryophytes_trnL
 32 | #' @docType data
 33 | NULL
 34 | 
 35 | #' @export
 36 | print.primerTree = function(x, ...){
 37 |   cat("Name: ", x$name, "\n",
 38 |       "  Arguments: ", paste(names(x$arguments), x$arguments, sep=":", collapse=' '), "\n")
 39 |   cat("\nHTTP Response\n")
 40 |       print(x$response[[1]])
 41 |   cat("\nPrimer Products\n")
 42 |       print(x$sequence)
 43 |   cat("\nAligned Products\n")
 44 |       print(x$alignment)
 45 |   cat("\nPhylogenetic Tree\n")
 46 |       print(x$tree)
 47 | }
 48 | #' plot function for a primerTree object, calls plot_tree_ranks
 49 | #' @param x primerTree object to plot
 50 | #' @param ranks The ranks to include, defaults to all common ranks, if NULL
 51 | #'   print all ranks.  If 'none' just print the layout.
 52 | #' @param main an optional title to display, if NULL displays the name as the title
 53 | #' @param ... additional arguments passed to plot_tree_ranks
 54 | #' @export
 55 | #' @seealso \code{\link{plot_tree_ranks}}, \code{\link{plot_tree}}
 56 | #' @examples
 57 | #' library(gridExtra)
 58 | #' library(directlabels)
 59 | #' #plot with all common ranks
 60 | #' plot(mammals_16S)
 61 | #'
 62 | #' #plot only the class
 63 | #' plot(mammals_16S, 'class')
 64 | #'
 65 | #' #plot the layout only
 66 | #' plot(mammals_16S, 'none')
 67 | plot.primerTree = function(x, ranks=NULL, main=NULL, ...){
 68 |   if(is.null(ranks)){
 69 |     if(is.null(main))
 70 |       main = x$name
 71 |     plot_tree_ranks(x$tree, x$taxonomy, main=main, ...)
 72 |   }
 73 |   else if(length(ranks) > 1){
 74 |     if(is.null(main))
 75 |       main = x$name
 76 |     plot_tree_ranks(x$tree, x$taxonomy, ranks=ranks, main=main, ...)
 77 |   }
 78 |   else {
 79 |     if(ranks == 'none') {
 80 |       plot_tree(x$tree, main=main, ...)
 81 |     }
 82 |     else{
 83 |       plot_tree(x$tree, taxonomy=x$taxonomy, rank=ranks, main=main, ...)
 84 |     }
 85 |   }
 86 | }
 87 | #' Automatic primer searching Search a given primer pair, retrieving the alignment
 88 | #' results, their product sequences, the taxonomic information for the sequences,
 89 | #' a multiple alignment of the products
 90 | #' @param name name to give to the primer pair
 91 | #' @param simplify use simple names for primer hit results or complex
 92 | #' @param .progress name of the progress bar to use, see
 93 | #' \code{\link{create_progress_bar}}
 94 | #' @param clustal_options a list of options to pass to clustal omega, see
 95 | #'    \code{link{clustalo}} for a list of options
 96 | #' @param distance_options a list of options to pass to dist.dna, see
 97 | #'    \code{link{dist.dna}} for a list of options
 98 | #' @param api_key NCBI api-key to allow faster sequence retrieval
 99 | #' @inheritParams primer_search
100 | #' @return A list with the following elements,
101 | #' \item{name}{name of the primer pair}
102 | #' \item{BLAST_result}{html blast results from Primer-BLAST as
103 | #'  'a \code{\link{response}}} object.
104 | #' \item{taxonomy}{taxonomy for the primer products from NCBI}
105 | #' \item{sequence}{sequence of the primer products}
106 | #' \item{alignment}{multiple alignment of the primer products}
107 | #' \item{tree}{phylogenetic tree of the reconstructed from the
108 | #' 'multiple alignment}
109 | #' @seealso \code{\link{primer_search}}, \code{\link{clustalo}}
110 | #' @export
111 | #' @examples
112 | #' \dontrun{
113 | #' #simple search
114 | #' mammals_16S = search_primer_pair(name='Mammals 16S',
115 | #'  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
116 | #' #returning 1000 alignments, allow up to 3 mismatches in primer
117 | #' mammals_16S = search_primer_pair(name='Mammals 16S',
118 | #'  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT',
119 | #'  num_aligns=1000, total_primer_specificity_mismatch=3)
120 | #' }
121 | search_primer_pair = function(forward, reverse, name=NULL, num_aligns=500,
122 |     num_permutations=25, simplify=TRUE, clustal_options=list(), 
123 |     distance_options=list(model="N", pairwise.deletion=T), api_key=Sys.getenv("NCBI_API_KEY"),
124 |     ..., .parallel=FALSE, .progress='none'){
125 | 
126 |   #HACK, primerTree is an environment rather than a list so we can treat it as
127 |   #a pointer, I could make it a reference class, but that seems to be overkill
128 |   #as I am converting to a list at the end of the function anyway...
129 | 
130 |   if(missing(forward) || missing(reverse)){
131 |     BLAST_primer()
132 |     return()
133 |   }
134 | 
135 |   primer_search = new.env(parent=globalenv())
136 |   #list all primers used to search
137 |   primer_search = env2list(
138 |     try_default({
139 |       primer_search$name = if(!is.null(name)) name
140 |                     else name=paste(forward, reverse, sep='_')
141 | 
142 |       primer_search$arguments =
143 |         c(forward=forward, reverse=reverse, name=name,
144 |           num_aligns=num_aligns, num_permutations = num_permutations,
145 |           simplify=simplify, clustal_options=clustal_options, list(...))
146 | 
147 |       primer_search$response = primer_search(forward, reverse,
148 |                                              num_permutations=num_permutations,
149 |                                              .progress=.progress,
150 |                                              .parallel=.parallel,
151 |                                              num_aligns=num_aligns,
152 |                                              ...)
153 |       start_time = now()
154 |       primer_search$BLAST_result =
155 |         filter_duplicates(ldply(primer_search$response, parse_primer_hits, .parallel=.parallel))
156 | 
157 |       message(nrow(primer_search$BLAST_result), ' BLAST alignments parsed in ', seconds_elapsed_text(start_time))
158 | 
159 |       start_time = now()
160 |       primer_search$taxonomy = get_taxonomy(primer_search$BLAST_result$accession)
161 |       message('taxonomy retrieved in ', seconds_elapsed_text(start_time))
162 | 
163 |       start_time = now()
164 |       primer_search$sequence = get_sequences(primer_search$BLAST_result$accession,
165 |                                              primer_search$BLAST_result$product_start,
166 |                                              primer_search$BLAST_result$product_stop,
167 |                                              api_key=api_key,
168 |                                              simplify=simplify,
169 |                                              .parallel=.parallel)
170 | 
171 |       lengths = laply(primer_search$sequence, length)
172 |       message(length(primer_search$sequence), ' sequences retrieved from NCBI',
173 |               ' in ', seconds_elapsed_text(start_time), ', product length',
174 |               ' min:', min(lengths),
175 |               ' mean:', round(mean(lengths),2),
176 |               ' max:', max(lengths))
177 | 
178 |       start_time = now()
179 |       primer_search$alignment = do.call(clustalo, c(list(primer_search$sequence, threads=getDoParWorkers()), clustal_options))
180 |       message(nrow(primer_search$alignment), ' sequences aligned in ',
181 |               seconds_elapsed_text(start_time),
182 |               ' length:', ncol(primer_search$alignment))
183 | 
184 |       start_time = now()
185 |       primer_search$distances = do.call(dist.dna, c(list(primer_search$alignment), distance_options))
186 |       message('pairwise DNA distances calculated in ',
187 |               seconds_elapsed_text(start_time))
188 | 
189 |       start_time = now()
190 |       primer_search$tree = tree_from_alignment(primer_search$alignment)
191 |       message('constructed neighbor joining tree in ', seconds_elapsed_text(start_time))
192 | 
193 |       primer_search
194 |     }, default=primer_search)
195 |   )
196 |   class(primer_search) = 'primerTree'
197 |   primer_search
198 | }
199 | 
200 | #given a start time print out the number of seconds which have elapsed
201 | seconds_elapsed_text = function(start_time){
202 |   paste((start_time %--% now()) %/% seconds(1), 'seconds')
203 | }
204 | 
205 | #fast way to convert a environment to a list
206 | env2list = function(env){
207 |   names = ls(env)
208 |   mget(names, env)
209 | }
210 | #' identify the point closest to the mouse click
211 | #' only works on single ranks
212 | #' @param x the plot to identify
213 | #' @param ... additional arguments passed to annotate
214 | #' @export
215 | identify.primerTree_plot = function(x, ...) {
216 |   point <- gglocator(x$layers[[4]])
217 |   distances <- distance(point, x$layers[[4]]$data[,c('x','y')])
218 |   closest <- which(distances == min(distances))[1]
219 |   point$label <- x$layers[[4]]$data[closest,deparse(x$layers[[4]]$mapping$colour)]
220 |   x + annotate("text", label=point$label, x=point$x, y=point$y, ...)
221 | }
222 | gglocator = function(object) {
223 |   loc <-  as.numeric(grid.locator("npc"))
224 | 
225 |   xrng <- with(object, range(data[,deparse(mapping$x)]))
226 |   yrng <- with(object, range(data[,deparse(mapping$y)]))
227 | 
228 |   point <- data.frame(xrng[1] + loc[1]*diff(xrng), yrng[1] + loc[2]*diff(yrng))
229 |   names(point) <- with(object, c(deparse(mapping$x), deparse(mapping$y)))
230 |   point
231 | }
232 | 
233 | #returns the distance from a point in point to the points in points
234 | distance <- function(point,points){
235 |   sqrt((point$x-points$x)^2 + (point$y-points$y)^2)
236 | }
237 | 
238 | #' Summarize a primerTree result, printing quantiles of sequence length and
239 | #' pairwise differences.
240 | 
241 | #' @param object the primerTree object to summarise
242 | #' @param ... Ignored options
243 | #' @param probs quantile probabilities to compute, defaults to 0, 5, 50, 95,
244 | #' and 100 probabilities.
245 | #' @param ranks ranks to show unique counts for, defaults to the common ranks
246 | #' @return invisibly returns a list containing the printed results
247 | #' @export
248 | summary.primerTree <- function(object, ..., probs=c(0, .05, .5, .95, 1), ranks = common_ranks) {
249 | 
250 |   res = list()
251 |   res[['lengths']] = t(data.frame('Sequence lengths'=labeled_quantile(laply(object$sequence, length), sprintf('%.0f%%', probs*100), probs=probs), check.names=F))
252 |   print(res[['lengths']])
253 | 
254 |   res[['distances']] = t(data.frame('Pairwise differences'=labeled_quantile(object$distances, sprintf('%.0f%%', probs*100), probs=probs), check.names=F))
255 |   cat('\n')
256 |   print(res[['distances']])
257 | 
258 |   res[['rankDistances']] = calc_rank_dist_ave(object, common_ranks)
259 |   print(res[['rankDistances']])
260 | 
261 |   res[['ranks']] = laply(object$taxonomy[common_ranks], function(x) length(unique(x)))
262 |   cat('\n', 'Unique taxa out of ', length(object$sequence), ' sequences\n', sep='')
263 |   names(res[['ranks']]) = ranks
264 |   print(res[['ranks']])
265 | 
266 |   invisible(res)
267 | }
268 | 
269 | labeled_quantile = function(x, labels, ...){
270 |   res = quantile(x, ...)
271 |   names(res) = labels
272 |   res
273 | }
274 | 
275 | #' Summarize pairwise differences.
276 | 
277 | #' @param x a primerTree object
278 | #' @param ranks ranks to show unique counts for, defaults to the common ranks
279 | #' @return returns a data frame of results
280 | #' @details
281 | #' The purpose of this function is to calculate the average number
282 | #' of nucleotide differences between species within each taxa of given taxonomic
283 | #' level.
284 | #'
285 | #' For example, at the genus level, the function calculates the average number
286 | #' of nucleotide differences between all species within each genus and reports
287 | #' the mean of those values.
288 | #'
289 | #' There are several key assumptions and calculations made in this
290 | #' function.
291 | #'
292 | #' First, the function randomly selects one sequence from each species
293 | #' in the primerTree results. This is to keep any one species (e.g.
294 | #' human, cow, etc.) with many hits from skewing the results.
295 | #'
296 | #' Second, for each taxonomic level tested, the function divides the
297 | #' sequences by each taxon at that level and calculates the mean
298 | #' number of nucleotide differences within that taxa, then returns the
299 | #' mean of those values.
300 | #'
301 | #' Third, when calculating the average distance, any taxa for which
302 | #' there is only one species is omitted, as the number of nucleotide
303 | #' differences will always be 0.
304 | #'
305 | #' @examples
306 | #' \dontrun{
307 | #' calc_rank_dist_ave(mammals_16S)
308 | #'
309 | #' calc_rank_dist_ave(bryophytes_trnL)
310 | #'
311 | #' # Note that the differences between the results from these two primers
312 | #' # the mean nucleotide differences is much higher for the mammal primers
313 | #' # than the byrophyte primers. This suggests that the mammal primers have
314 | #' # better resolution to distinguish individual species.
315 | #' }
316 | #' @importFrom reshape2 melt
317 | #' @export
318 | 
319 | # using tree data format info from http://www.phytools.org/eqg/Exercise_3.2/
320 | calc_rank_dist_ave <- function(x, ranks = common_ranks) {
321 |   used_ranks <- grep("species", ranks, invert = T, value = T)
322 |   rank_dist_mean <- data.frame(matrix(nrow = 1, ncol = 0))
323 | 
324 |   # Raw taxonomy data
325 |   taxa <- as.data.frame(x$taxonomy)
326 | 
327 |   # Randomize the order of the taxa data frame
328 |   taxa <- taxa[sample(nrow(taxa)), ]
329 |   rownames(taxa) <- taxa$accession
330 | 
331 |   # Pick random example per species and add back in the taxa info
332 |   unique_factors <- as.data.frame(unique(taxa$species))
333 |   colnames(unique_factors) <- "species"
334 |   unique_factors <- join(unique_factors, taxa, type = "left", match = "first", by = "species")
335 | 
336 |   # Get sequences for randomly selected species
337 |   seqs <- x$sequence
338 |   seqs <- seqs[names(seqs) %in% unique_factors$accession]
339 | 
340 |   # Align and calculate pairwise distances and convert dists to dataframe
341 |   align <- clustalo(seqs)
342 |   dists <- as.data.frame(as.matrix(dist.dna(align, model = "N", pairwise.deletion = T)))
343 |   dists$accession <- row.names(dists)
344 | 
345 |   # Melt the dists dataframe so I can drop any distance that isn't within the (rank)
346 |   melted <- melt(dists, id = "accession", variable.name = "accession2", value.name = "dist")
347 | 
348 |   for(rank in used_ranks) {
349 | 
350 |     # Gather only the needed taxa data
351 |     unique_factors_sub <- unique_factors[ , colnames(unique_factors) %in% c("accession", "species", rank)]
352 | 
353 |     # Drop any row in (rank) where there is only one species represented
354 |     # Any instance of this leads to a distance within that rank of 0, skewing the results downward
355 |     counts <- as.data.frame(table(unique_factors_sub[[rank]]))
356 |     colnames(counts) <- c(rank, "count")
357 |     unique_factors_sub <- join(unique_factors_sub, counts, by = rank)
358 |     unique_factors_sub <- unique_factors_sub[unique_factors_sub$count > 1, ]
359 | 
360 |     # Pull the nucleotide distance data in
361 |     # Replace the rank1 accession with the rank1 taxa
362 |     melted_sub <- join(melted, unique_factors_sub,by = "accession")
363 |     melted_sub$rank1 <- as.factor(melted_sub[[rank]])
364 | 
365 |     # Drop all columns except the three needed so the next join doesn't get messed up
366 |     melted_sub <- melted_sub[, colnames(melted_sub) %in% c("accession2", "dist", "rank1", "species")]
367 |     colnames(melted_sub)[1] <- "accession"
368 | 
369 |     # Replace the rank2 accession with the rank2 taxa
370 |     melted_sub <- join(melted_sub, unique_factors_sub, by = "accession")
371 |     melted_sub$rank2 <- as.factor(melted_sub[[rank]])
372 | 
373 |     # Drop all columns except the three needed
374 |     melted_sub <- melted_sub[ , colnames(melted_sub) %in% c("rank2", "dist", "rank1", "species")]
375 | 
376 |     # Drop all rows with missing information
377 |     melted_sub <- na.omit(melted_sub)
378 | 
379 |     # We only want distances within a taxa, so drop all comparisons between taxa
380 |     # We also want to drop any comparisons of a species to itself, which will have dist == 0
381 |     melted_sub <- melted_sub[melted_sub$rank1 == melted_sub$rank2 & melted_sub$species != melted_sub$species.1, ]
382 | 
383 |     # Calculate the mean distance for each taxa compared
384 |     #   We calculate each separately to avoid any one taxa with lots of hits (like human seqs)
385 |     #   from skewing the mean
386 |     melted_sub$group <- paste(melted_sub$rank1, melted_sub$rank2)
387 | 
388 |     # Plug the means into the storage dataframe
389 |     rank_dist_mean[[rank]] <- mean(melted_sub$dist)
390 |   }
391 |   message("\nAverage number of nucleotide differences between sequences within a given taxonomic group")
392 |   message("See function description for further details")
393 |   rank_dist_mean
394 | }
395 | 


--------------------------------------------------------------------------------
/R/search.R:
--------------------------------------------------------------------------------
  1 | #PrimerTree
  2 | #Copyright (C) 2013 Jim Hester
  3 | 
  4 | #' Query a pair of primers using ncbi's Primer-BLAST, if primers contain iupac
  5 | #'
  6 | #' ambiguity codes, enumerate all possible combinations and combine the
  7 | #' results.
  8 | #' @param forward forward primer to search by 5'-3' on plus strand
  9 | #' @param reverse reverse primer to search by 5'-3' on minus strand
 10 | #' @param num_aligns number of alignment results to keep
 11 | #' @param num_permutations the number of primer permutations to search, if the degenerate bases
 12 | #'        cause more than this number of permutations to exist, this number will be
 13 | #'        sampled from all possible permutations.
 14 | #' @param ... additional arguments passed to Primer-Blast
 15 | #' @param .parallel if 'TRUE', perform in parallel, using parallel backend
 16 | #'        provided by foreach
 17 | #' @param .progress name of the progress bar to use, see 'create_progress_bar'
 18 | #' @return httr response object of the query, pass to \code{\link{parse_primer_hits}} to parse the results.
 19 | #' @export
 20 | primer_search = function(forward, reverse, num_aligns=500, num_permutations=25, ..., .parallel=FALSE, .progress='none'){
 21 |   if(missing(forward) || missing(reverse))
 22 |     BLAST_primer()
 23 |   primers = enumerate_primers(forward, reverse)
 24 |   num_primers = nrow(primers)
 25 |   if(num_primers > num_permutations){
 26 |     warning(immediate.=T, 'primers have ', num_primers, ' possible combinations due to degenerate bases, sampling ', num_permutations, " primers, use 'num_permutations' to change")
 27 |     primers = primers[ sample.int(num_primers, num_permutations, replace=F), ]
 28 |   }
 29 |   message('BLASTing ', nrow(primers), ' primer combinations')
 30 |   #enumerate all combinations to handle ambiguity codes
 31 |   alply(primers, .margins=1, .expand=F, .parallel=.parallel, .progress=.progress,
 32 |         function(row) BLAST_primer(row$forward, row$reverse, num_targets_with_primers=max(num_aligns %/% nrow(primers), 1), ...))
 33 | }
 34 | iupac = list( "M" = list("A", "C"),
 35 |               "R" = list("A", "G"),
 36 |               "W" = list("A", "T"),
 37 |               "S" = list("C", "G"),
 38 |               "Y" = list("C", "T"),
 39 |               "K" = list("G", "T"),
 40 |               "V" = list("A", "C", "G"),
 41 |               "H" = list("A", "C", "T"),
 42 |               "D" = list("A", "G", "T"),
 43 |               "B" = list("C", "G", "T"),
 44 |               "N" = list("A", "C", "G", "T"),
 45 |               "I" = list("A", "T", "C"))
 46 | 
 47 | enumerate_primers = function(forward, reverse){
 48 |   forward_primers = enumerate_ambiguity(forward)
 49 |   data.frame(forward=forward_primers,
 50 |              reverse=rep(enumerate_ambiguity(reverse),
 51 |                          each=length(forward_primers)),
 52 |              stringsAsFactors = FALSE)
 53 | }
 54 | enumerate_ambiguity = function(sequence){
 55 |   search_regex = paste(names(iupac), collapse='|')
 56 |   locs = str_locate_all(sequence, search_regex)
 57 |   sequences = list()
 58 |   count = 1
 59 |   for (i in seq_len(nrow(locs[[1]]))){
 60 |     loc = locs[[1]][i,]
 61 |     ambiguity = str_sub(sequence, loc[1], loc[2])
 62 |     for(type in iupac[[ambiguity]]){
 63 |       new_seq = sequence
 64 |       str_sub(new_seq, loc[1], loc[2]) <- type
 65 |       sequences[[count]] = enumerate_ambiguity(new_seq)
 66 |       count = count + 1
 67 |     }
 68 |     return(unlist(sequences))
 69 |   }
 70 |   return(sequence)
 71 | }
 72 | 
 73 | print_options = function(options){
 74 |   output = capture.output( print(
 75 |                                  options[ is.na(options$type) | options$type != 'hidden',
 76 |                                         c('name', 'type', 'defval') ])
 77 |                           )
 78 | 
 79 |   message(paste(output, "\n", sep=""))
 80 | }
 81 | 
 82 | BLAST_primer = function(forward, reverse, ..., organism='',
 83 |   primer_specificity_database='nt', exclude_env='on'){
 84 | 
 85 |   url = 'https://www.ncbi.nlm.nih.gov/tools/primer-blast/'
 86 |   form = GET_retry(url)
 87 | 
 88 |   content = parsable_html(form)
 89 | 
 90 |   all_options = get_options(content)
 91 | 
 92 |   if(missing(forward) || missing(reverse)){
 93 |     print_options(all_options)
 94 |     stop('No primers specified')
 95 |   }
 96 | 
 97 |   options = list(..., primer_left_input=forward, primer_right_input=reverse,
 98 |                  organism=organism,
 99 |                  primer_specificity_database=primer_specificity_database,
100 |                  exclude_env=exclude_env,
101 |                  search_specific_primer='on')
102 | 
103 |   names(options) = toupper(names(options))
104 | 
105 |   match_args = pmatch(names(options), all_options$name)
106 |   bad_args = is.na(match_args)
107 | 
108 |   if(any(bad_args)){
109 |     print_options(all_options)
110 |     stop(paste(names(options)[bad_args], collapse=','), ' not valid option\n')
111 |   }
112 | 
113 |   options = get_defaults(options, all_options)
114 | 
115 |   start_time = now()
116 | 
117 |   message('Submitting Primer-BLAST query')
118 |   response = POST_retry(paste(url, 'primertool.cgi', sep=''), body=options)
119 | 
120 |   values = get_refresh_from_meta(response)
121 | 
122 |   while(length(values) > 0){
123 |     message('BLAST alignment processing, refreshing in ', values[1], ' seconds...')
124 |     Sys.sleep(values[1])
125 |     response = GET_retry(values[2])
126 | 
127 |     values = get_refresh_from_meta(response)
128 |   }
129 |   total_time = start_time %--% now()
130 |   message('BLAST alignment completed in ', total_time %/% seconds(1), ' seconds')
131 |   response
132 | }
133 | 
134 | #modify a function to check the status and retry until success
135 | retry = function(fun, num_retry=5, ...){
136 |   function(...){
137 |     res = fun(...)
138 |     itr = 0
139 |     status = http_status(res)
140 |     while(itr < num_retry && inherits(res, 'response') && tolower(status$category) != 'success'){
141 |       warning('request failed, retry attempt ', itr+1)
142 |       res = fun(...)
143 |       status = http_status(res)
144 |       itr = itr + 1
145 |       #sleep to avoid hitting NCBI query rate limit :'-(
146 |       Sys.sleep(0.4)
147 |     }
148 |     res
149 |   }
150 | }
151 | 
152 | GET_retry = retry(GET)
153 | POST_retry = retry(POST)
154 | 
155 | #' Parse the primer hits
156 | #'
157 | #' @param response a httr response object obtained from \code{\link{primer_search}}
158 | #' @export
159 | parse_primer_hits = function(response){
160 |   content = parsable_html(response)
161 |   rbind.fill(xpathApply(content, '//pre', parse_pre))
162 | }
163 | parse_a = function(a){
164 |   #links like entrez/viewer.fcgi?db=nucleotide&id=452085006
165 |   m = regexpr('id=\\d+', xmlAttrs(a)['href'])
166 |   gi = gsub('id=', '', unlist(regmatches(xmlAttrs(a)['href'], m)))
167 |   if(length(gi) == 0L){
168 |     #links like nucleotide/449036831?from=1107741&to=1107929&report=gbwithparts
169 |     m = regexpr('nucleotide/\\d+', xmlAttrs(a)['href'])
170 |     gi = gsub('nucleotide/', '', unlist(regmatches(xmlAttrs(a)['href'], m)))
171 |   }
172 |   if(length(gi) == 0L) gi = NA
173 |   data.frame(gi=as.character(gi), accession=as.character(xmlValue(a)))
174 | }
175 | 
176 | parse_pre = function(pre){
177 |   pre_text = xmlValue(pre)
178 | 
179 |   a = getNodeSet(pre, './preceding-sibling::a[1]')
180 |   if(length(a) <= 0)
181 |     stop('Parsing failed for ', pre_text)
182 | 
183 |   ids = parse_a(a[[1]])
184 | 
185 |   product_length_regex = 'product length = (\\d+)'
186 |   template_regex = 'Template[^\\d]+(\\d+)[^.ACGT]+([.ACGT]+)[^\\d]+(\\d+)'
187 |   full_regex = paste('[\\S\\W]*', product_length_regex, '[\\S\\W]*?',
188 |                      template_regex, '[\\S\\W]*', template_regex, '[\\S\\W]*', sep='')
189 |   values = str_split(gsub(full_regex, paste('\\', 1:8, sep='', collapse='|'),
190 |                            pre_text, perl=T), '[|]')[[1]]
191 |   data.frame(ids,
192 |              product_length=as.numeric(values[1]),
193 |              mismatch_forward=str_count(values[3], '[ACGT]'),
194 |              mismatch_reverse=str_count(values[6], '[ACGT]'),
195 |              forward_start = as.numeric(values[2]),
196 |              forward_stop = as.numeric(values[4]),
197 |              reverse_start = as.numeric(values[5]),
198 |              reverse_stop = as.numeric(values[7]),
199 |              product_start=min(as.numeric(values[c(2,4,5,7)])),
200 |              product_stop=max(as.numeric(values[c(2,4,5,7)]))
201 |              )
202 | }
203 | get_refresh_from_meta = function(response){
204 |   content = parsable_html(response)
205 |   meta = content['//meta[@http-equiv="Refresh"]']
206 |   if(length(meta) > 0){
207 |     values = str_split(xmlAttrs(meta[[1]])['content'], '; URL=')[[1]]
208 |     return(values)
209 |   }
210 |   return()
211 | }
212 | 
213 | get_defaults = function(set_options, options){
214 |   #only look at values with set defaults
215 |   options = options[ !is.na(options$defval), ]
216 |   unchanged_options = setdiff(options$name, names(set_options))
217 |   default_values = as.character(options[ match(unchanged_options, options$name), 'defval' ])
218 |   names(default_values) = unchanged_options
219 |   c(set_options, default_values)
220 | }
221 | 
222 | get_options = function(content){
223 |   options = rbind.fill(xpathApply(content, '//form//input | //form//select', parse_attributes))
224 |   options$type = as.character(options$type)
225 | 
226 |   #add dropdown type if they are NA
227 |   options$type[is.na(options$type)] <- 'dropdown'
228 | 
229 |   #make default values for checkboxes on or off rather than checked or unchecked
230 |   options$defval = as.character(options$defval)
231 |   check_map = c('checked' = 'on', 'unchecked' = '')
232 |   checkboxes = which(options$type == 'checkbox')
233 |   options$defval[ checkboxes ] = check_map[ options$defval[ checkboxes ] ]
234 | 
235 |   options[ options$type != 'hidden',  c('name', 'type', 'defval') ]
236 | }
237 | 
238 | parse_attributes = function(x){
239 |   as.data.frame(t(xmlAttrs(x)))
240 | }
241 | parsable_html = function(response){
242 |   txt <- content(response, as='text', encoding='UTF-8')
243 | 
244 |   # remove any unicode characters
245 |   Encoding(txt) <- "UTF-8"
246 |   txt <- iconv(txt, "UTF-8", "ASCII", sub = "")
247 | 
248 |   #this gsub regex is to remove the definition lines, some of which have
249 |   #  bracketed <junk> in them, which messes up the parsing
250 |   txt <- gsub('("new_entrez".*?</a>).*?<pre>\n\n', '\\1\n<pre>', txt)
251 |   htmlParse(txt)
252 | }
253 | filter_duplicates = function(hits){
254 |   location_columns = c('accession', 'forward_start', 'forward_stop', 'reverse_start', 'reverse_stop')
255 |   hits[!duplicated(t(apply(hits[location_columns], 1, range))),]
256 | }
257 | 


--------------------------------------------------------------------------------
/R/sequence.R:
--------------------------------------------------------------------------------
  1 | #PrimerTree
  2 | #Copyright (C) 2013 Jim Hester
  3 | 
  4 | #' Retrieves a fasta sequence from NCBI nucleotide database.
  5 | #'
  6 | #' @param accession nucleotide accession to retrieve.
  7 | #' @param start start base to retrieve, numbered beginning at 1.  If NULL the
  8 | #'        beginning of the sequence.
  9 | 
 10 | #' @param stop last base to retrieve, numbered beginning at 1. if NULL the end of
 11 | #'        the sequence.
 12 | #' @param api_key NCBI api-key to allow faster sequence retrieval.
 13 | #' @return an DNAbin object.
 14 | #' @seealso \code{\link{DNAbin}}
 15 | #' @export
 16 | 
 17 | get_sequence = function(accession, start=NULL, stop=NULL, api_key=Sys.getenv("NCBI_API_KEY")){
 18 | 
 19 |   fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
 20 | 
 21 |   query=list(db='nuccore', rettype='fasta', retmode='text', id=accession)
 22 | 
 23 |   if(!is.null(start))
 24 |     query$seq_start = start
 25 | 
 26 |   if(!is.null(stop))
 27 |     query$seq_stop = stop
 28 | 
 29 |   if(nzchar(api_key))
 30 |     query$api_key = api_key
 31 | 
 32 |   response = POST_retry(fetch_url, body=query)
 33 | 
 34 |   #stop if response failed
 35 |   stop_for_status(response)
 36 | 
 37 |   content = content(response, as='raw')
 38 | 
 39 |   #from ape package read.FASTA
 40 |   res <- .Call("rawStreamToDNAbin", content)
 41 |   names(res) <- sub("^ +", "", names(res))
 42 |   class(res) <- "DNAbin"
 43 |   res
 44 | }
 45 | 
 46 | #' Retrieves fasta sequences from NCBI nucleotide database.
 47 | #'
 48 | #' @param accession the accession number of the sequence to retrieve
 49 | #' @param start start bases to retrieve, numbered beginning at 1.  If NULL the
 50 | #'        beginning of the sequence.
 51 | 
 52 | #' @param stop stop bases to retrieve, numbered beginning at 1. if NULL the stop of
 53 | #'        the sequence.
 54 | #' @param api_key NCBI api-key to allow faster sequence retrieval.
 55 | #' @param simplify simplify the FASTA headers to include only the genbank
 56 | #'        accession.
 57 | #' @param .parallel if 'TRUE', perform in parallel, using parallel backend
 58 | #'        provided by foreach
 59 | #' @param .progress name of the progress bar to use, see 'create_progress_bar'
 60 | #' @return an DNAbin object.
 61 | #' @seealso \code{\link{DNAbin}}
 62 | #' @export
 63 | 
 64 | get_sequences = function(accession, start=NULL, stop=NULL, api_key=Sys.getenv("NCBI_API_KEY"), simplify=TRUE, .parallel=FALSE, .progress='none'){
 65 |   #expand arguments by recycling
 66 |   args = expand_arguments(accession=accession, start=start, stop=stop)
 67 |   #assign expanded arguments to actual arguments
 68 |   lapply(seq_along(args), function(i) names(args)[i] <<- args[i])
 69 | 
 70 |   #define rate to query NCBI servers with get_sequences
 71 |   query_rate <- 3; #queries per second
 72 | 
 73 |   if(nzchar(api_key)) {
 74 |     query_rate <- 10
 75 |   } else {
 76 |     warning("Sequence retrieval limited to 3 per second. Provide an api_key to increase this to 10. See:
 77 |   https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/", immediate. = TRUE)
 78 |   }
 79 | 
 80 |   size = length(accession)
 81 |   get_sequence_itr = function(i){
 82 |     start_time <- Sys.time()
 83 |     sequence = get_sequence(accession[i], start[i], stop[i], api_key)
 84 |     stop_time <- Sys.time()
 85 |     if((stop_time - start_time) < (1 / query_rate)) {
 86 |       #sleep to limit query rate :-(
 87 |       Sys.sleep((1/query_rate) - (stop_time - start_time))
 88 |     }
 89 |     sequence
 90 |   }
 91 |   sequences = alply(seq_along(accession), .margins=1, .parallel=.parallel, .progress=.progress, failwith(NA, f=get_sequence_itr))
 92 |   names = if(simplify) accession else laply(sequences, names)
 93 |   sequences = llply(sequences, `[[`, 1)
 94 |   names(sequences) = names
 95 |   class(sequences) = 'DNAbin'
 96 |   sequences
 97 | }
 98 | # from http://stackoverflow.com/questions/9335099/implementation-of-standard-recycling-rules
 99 | expand_arguments <- function(...){
100 |   dotList <- list(...)
101 |   max.length <- max(sapply(dotList, length))
102 |   suppressWarnings(lapply(dotList, rep, length=max.length))
103 | }
104 | #' Construct a neighbor joining tree from a dna alignment
105 | #'
106 | #' @param dna fasta dna object the tree is to be constructed from
107 | #' @param pairwise.deletion a logical indicating if the distance matrix should 
108 | #' be constructed using pairwise deletion
109 | #' @param ... furthur arguments to dist.dna
110 | #' @seealso \code{\link{dist.dna}}, \code{\link{nj}}
111 | #' @export
112 | tree_from_alignment = function(dna, pairwise.deletion=TRUE, ...){
113 |   nj(dist.dna(dna, model="N", pairwise.deletion=pairwise.deletion, ...))
114 | }
115 | #' Multiple sequence alignment with clustal omega
116 | #'
117 | #' Calls clustal omega to align a set of sequences of class DNAbin.  Run
118 | #' without any arguments to see all the options you can pass to the command
119 | #' line clustal omega.
120 | #' @param x an object of class 'DNAbin'
121 | #' @param exec a character string with the name or path to the program
122 | #' @param quiet whether to supress output to stderr or stdout
123 | #' @param original.ordering use the original ordering of the sequences
124 | #' @param ... additional arguments passed to the command line clustalo
125 | #' @export
126 | clustalo = function (x, exec = 'clustalo', quiet = TRUE, original.ordering = TRUE, ...)
127 | {
128 |     help_text = system(paste(exec, '--help'), intern=TRUE)
129 |     all_options = get_command_options(help_text)
130 | 
131 |     inf <- tempfile(fileext='.fas')
132 |     outf <- tempfile(fileext='.aln')
133 | 
134 |     options = c(infile=inf, outfile=outf, list(...))
135 |     match_args = pmatch(names(options), names(all_options), duplicates.ok=TRUE)
136 |     bad_args = is.na(match_args)
137 | 
138 |     if (missing(x)){
139 |         message(paste(help_text, collapse="\n"))
140 |         stop('No input')
141 |     }
142 |     if(any(bad_args)){
143 |       stop(paste(names(options)[bad_args], collapse=','), ' not valid option\n')
144 |     }
145 | 
146 |     write.dna(x, inf, "fasta")
147 |     args = paste(paste(all_options[match_args], options, collapse=' '))
148 |     system2(exec, args=args, stdout = ifelse(quiet, FALSE, ''), stderr = ifelse(quiet, FALSE, ''))
149 |     res <- read.dna(outf, "fasta")
150 |     if (original.ordering)
151 |         res <- res[labels(x), ]
152 |     res
153 | }
154 | 
155 | #parses the usage and enumerates the commands
156 | get_command_options = function(usage){
157 |   m = gregexpr('-+\\w+', usage)
158 |   arguments = unlist(regmatches(usage, m))
159 |   names(arguments) = gsub('-+', '', arguments)
160 |   arguments
161 | }
162 | 


--------------------------------------------------------------------------------
/R/taxonomy.R:
--------------------------------------------------------------------------------
 1 | #PrimerTree
 2 | #Copyright (C) 2013 Jim Hester
 3 | 
 4 | #' Retrieve the taxonomy information from NCBI for a set of nucleotide gis.
 5 | #'
 6 | #' @param accessions a character vector of the accessions to retrieve
 7 | #' @return data.frame of the 'accessions, taxIds, and taxonomy
 8 | #' @export
 9 | 
10 | get_taxonomy = function(accessions){
11 | 
12 |   accessions = unique(as.character(accessions))
13 |   taxids = accession2taxid(accessions)
14 | 
15 |   taxonomy=fetch_taxonomy(unique(taxids))
16 |   merge(
17 |     data.frame(accession=names(taxids), taxId=taxids, stringsAsFactors=FALSE),
18 |     taxonomy
19 |   )
20 | }
21 | #' Maps a nucleotide database accession to a taxonomy database taxId
22 | #'
23 | #' @param accessions accessions character vector to lookup.
24 | #' @return named vector of taxIds.
25 | #' @export
26 | 
27 | accession2taxid = function(accessions){
28 |   url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
29 | 
30 |   names(accessions) = rep('id', times=length(accessions))
31 |   query=c(list(db='taxonomy', dbfrom='nuccore', idtype='acc'), accessions)
32 | 
33 |   response=POST_retry(url, body=query)
34 | 
35 |   #stop if response failed
36 |   stop_for_status(response)
37 | 
38 |   parsed = xmlParse(content(response, as="text"))
39 | 
40 |   xpathSApply(parsed, '//LinkSet', parse_LinkSet)
41 | }
42 | parse_LinkSet = function(LinkSet){
43 |   gid = xpathSApply(LinkSet, './/IdList/Id', xmlValue)
44 |   taxid = xpathSApply(LinkSet, './/LinkSetDb/Link/Id', xmlValue)
45 |   if(length(taxid) != 1)
46 |     taxid = NA
47 |   names(taxid) = gid
48 |   taxid
49 | }
50 | 
51 | fetch_taxonomy = function(taxid) {
52 | 
53 |   fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
54 | 
55 |   query=list(db='taxonomy', rettype='null', retmode='xml', id=paste(taxid, collapse=','))
56 | 
57 |   response = POST_retry(fetch_url, body=query)
58 | 
59 |   #stop if response failed
60 |   stop_for_status(response)
61 | 
62 |   parse_taxonomy_xml(xmlParse(content(response, as="text")))
63 | }
64 | parse_taxonomy_xml = function(xml){
65 |   rbind.fill(xpathApply(xml, '//TaxaSet/Taxon', parse_taxon))
66 | }
67 | parse_taxon = function(taxon){
68 |   tax_id = xpathSApply(taxon, './TaxId', xmlValue)
69 |   ranks = xpathSApply(taxon, './/Rank', xmlValue)
70 |   names = xpathSApply(taxon, './/ScientificName', xmlValue)
71 |   names(names) = ranks
72 |   names = names[ranks != 'no rank']
73 |   names['taxId'] = tax_id
74 |   as.data.frame(t(names), stringsAsFactors=FALSE)
75 | }
76 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # PrimerTree #
 2 | [![Travis-CI Build Status](https://travis-ci.org/jimhester/primerTree.svg?branch=master)](https://travis-ci.org/jimhester/primerTree)
 3 | PrimerTree: Visually Assessing the Specificity and Informativeness of Primer Pairs
 4 | 
 5 | * [Features](#features)
 6 | * [Examples](#examples)
 7 | * [Installation](#installation)
 8 | * [Usage](#usage)
 9 | 
10 | ## Features ##
11 | * Automatically query a primer pair and generate a tree of the products.
12 | * Analysis can be run and then plotted in two commands.
13 | * All options of Primer-Blast and clustal omega are supported.
14 | 
15 | ## Installation ##
16 | ### R installation ###
17 | #### CRAN ####
18 | ```s
19 | install.packages('primerTree')
20 | ```
21 | #### Github ####
22 | ```s
23 | # install.packages("devtools")
24 | devtools::install_github("jimhester/primerTree")
25 | ```
26 | ### [Clustal Omega](http://www.clustal.org/omega/#Download) Installation ###
27 | #### Windows ####
28 | Use the pre-compiled windows binary.  Either put the installed clustalo.exe in your path, or pass the path to the executable in the clustal_options option
29 | ```s
30 | library(primerTree)
31 | mammals_16S = search_primer_pair(name='Mammals 16S',
32 |   'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', clustal_options=c(exec='C:\Program Files\Clustal Omega\clustalo.exe'))
33 | ```
34 | #### Linux ####
35 | Simple installation from source
36 | ```bash
37 | ./configure && make && make install
38 | ```
39 | If the resulting clustalo program is in your path it should be automatically found,
40 | otherwise see the windows instructions on how to specify the path to the
41 | executable.
42 | 
43 | #### Mac OSX ####
44 | Clustal Omega can be installed from source but requires Xcode CLT and argtable, which are not pre-installed on OSX.
45 | 
46 | First install Xcode CLT.
47 | ```bash
48 | xcode-select --install
49 | ```
50 | then install argtable
51 | ```bash
52 | brew install argtable
53 | ```
54 | Now you can install Clustal Omega from source.
55 | From the clustalo directory:
56 | 
57 | ```bash
58 | ./configure && sudo make install
59 | ```
60 | 
61 | ## Usage ##
62 | Simple search for a Mammal 16S primer
63 | ```s
64 | library(primerTree)
65 | mammals_16S = search_primer_pair(name='Mammals 16S', 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
66 | plot(mammals_16S)
67 | ```
68 | 
69 | Using the parallel features with the multicore package using the doMC backend, with 8 threads.
70 | ```s
71 | library(doMC)
72 | registerDoMC(8)
73 | library(primerTree)
74 | mammals_16S = search_primer_pair(name='Mammals 16S',
75 |   'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', .parallel=T)
76 | plot(mammals_16S)
77 | ```
78 | 


--------------------------------------------------------------------------------
/TODO.md:
--------------------------------------------------------------------------------
 1 | # Documentation #
 2 | [x] - figure out a good name.
 3 | [x] - document all exported functions with examples
 4 | [ ] - create vignette showing workflow by piece and automatic
 5 | [x] - write wrapping function for entire workflow
 6 | [x] - implement plot_rank function to plot a single rank by itself
 7 | 
 8 | # Implementation #
 9 | [x] - filter alignments for exact sequences (how to handle different taxa???)
10 | [x] - handle hits to the same gi at different locations, this will crash clustalw currently. # USE CLUSTALO
11 | [ ] - enumerate dropdown options
12 | [x] - very degenerate primers?
13 | [x] - specify clustalo parameters, location
14 | [x] - retry failed connections
15 | 
16 | # Paper #
17 | [x] - submit paper to bioinformatics
18 | [ ] - submit package to CRAN
19 | 


--------------------------------------------------------------------------------
/data/bryophytes_trnL.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MVesuviusC/primerTree/6b257ba8f6113bb5506b777c19b07bc06191be94/data/bryophytes_trnL.RData


--------------------------------------------------------------------------------
/data/mammals_16S.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MVesuviusC/primerTree/6b257ba8f6113bb5506b777c19b07bc06191be94/data/mammals_16S.RData


--------------------------------------------------------------------------------
/man/accession2taxid.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/taxonomy.R
 3 | \name{accession2taxid}
 4 | \alias{accession2taxid}
 5 | \title{Maps a nucleotide database accession to a taxonomy database taxId}
 6 | \usage{
 7 | accession2taxid(accessions)
 8 | }
 9 | \arguments{
10 | \item{accessions}{accessions character vector to lookup.}
11 | }
12 | \value{
13 | named vector of taxIds.
14 | }
15 | \description{
16 | Maps a nucleotide database accession to a taxonomy database taxId
17 | }
18 | 


--------------------------------------------------------------------------------
/man/bryophytes_trnL.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{data}
 4 | \name{bryophytes_trnL}
 5 | \alias{bryophytes_trnL}
 6 | \title{PrimerTree results for the bryophyte trnL primers}
 7 | \description{
 8 | PrimerTree results for the bryophyte trnL primers
 9 | }
10 | 


--------------------------------------------------------------------------------
/man/calc_rank_dist_ave.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{calc_rank_dist_ave}
 4 | \alias{calc_rank_dist_ave}
 5 | \title{Summarize pairwise differences.}
 6 | \usage{
 7 | calc_rank_dist_ave(x, ranks = common_ranks)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object}
11 | 
12 | \item{ranks}{ranks to show unique counts for, defaults to the common ranks}
13 | }
14 | \value{
15 | returns a data frame of results
16 | }
17 | \description{
18 | Summarize pairwise differences.
19 | }
20 | \details{
21 | The purpose of this function is to calculate the average number
22 | of nucleotide differences between species within each taxa of given taxonomic
23 | level.
24 | 
25 | For example, at the genus level, the function calculates the average number
26 | of nucleotide differences between all species within each genus and reports
27 | the mean of those values.
28 | 
29 | There are several key assumptions and calculations made in this
30 | function.
31 | 
32 | First, the function randomly selects one sequence from each species
33 | in the primerTree results. This is to keep any one species (e.g.
34 | human, cow, etc.) with many hits from skewing the results.
35 | 
36 | Second, for each taxonomic level tested, the function divides the
37 | sequences by each taxon at that level and calculates the mean
38 | number of nucleotide differences within that taxa, then returns the
39 | mean of those values.
40 | 
41 | Third, when calculating the average distance, any taxa for which
42 | there is only one species is omitted, as the number of nucleotide
43 | differences will always be 0.
44 | }
45 | \examples{
46 | \dontrun{
47 | calc_rank_dist_ave(mammals_16S)
48 | 
49 | calc_rank_dist_ave(bryophytes_trnL)
50 | 
51 | # Note that the differences between the results from these two primers
52 | # the mean nucleotide differences is much higher for the mammal primers
53 | # than the byrophyte primers. This suggests that the mammal primers have
54 | # better resolution to distinguish individual species.
55 | }
56 | }
57 | 


--------------------------------------------------------------------------------
/man/clustalo.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{clustalo}
 4 | \alias{clustalo}
 5 | \title{Multiple sequence alignment with clustal omega}
 6 | \usage{
 7 | clustalo(x, exec = "clustalo", quiet = TRUE, original.ordering = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{an object of class 'DNAbin'}
11 | 
12 | \item{exec}{a character string with the name or path to the program}
13 | 
14 | \item{quiet}{whether to supress output to stderr or stdout}
15 | 
16 | \item{original.ordering}{use the original ordering of the sequences}
17 | 
18 | \item{...}{additional arguments passed to the command line clustalo}
19 | }
20 | \description{
21 | Calls clustal omega to align a set of sequences of class DNAbin.  Run
22 | without any arguments to see all the options you can pass to the command
23 | line clustal omega.
24 | }
25 | 


--------------------------------------------------------------------------------
/man/filter_seqs.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{filter_seqs}
 4 | \alias{filter_seqs}
 5 | \alias{filter_seqs.primerTree}
 6 | \title{Filter out sequences retrieved by search_primer_pair() that are either too
 7 | short or too long. The alignment and tree will be recalculated after removing
 8 | unwanted reads.}
 9 | \usage{
10 | filter_seqs(x, ...)
11 | 
12 | \method{filter_seqs}{primerTree}(x, min_length = 0, max_length = Inf, ...)
13 | }
14 | \arguments{
15 | \item{x}{a primerTree object}
16 | 
17 | \item{...}{additional arguments passed to methods.}
18 | 
19 | \item{min_length}{the minimum sequence length to keep}
20 | 
21 | \item{max_length}{the maximum sequence length to keep}
22 | }
23 | \value{
24 | a primerTree object
25 | }
26 | \description{
27 | Filter out sequences retrieved by search_primer_pair() that are either too
28 | short or too long. The alignment and tree will be recalculated after removing
29 | unwanted reads.
30 | }
31 | \section{Methods (by class)}{
32 | \itemize{
33 | \item \code{filter_seqs(primerTree)}: Method for primerTree objects
34 | 
35 | }}
36 | \examples{
37 | \dontrun{
38 | # filter out sequences longer or shorter than desired:
39 | mammals_16S_filtered <- filter_seqs(mammals_16S, min_length=131, max_length=156)
40 | }
41 | }
42 | 


--------------------------------------------------------------------------------
/man/get_sequence.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{get_sequence}
 4 | \alias{get_sequence}
 5 | \title{Retrieves a fasta sequence from NCBI nucleotide database.}
 6 | \usage{
 7 | get_sequence(
 8 |   accession,
 9 |   start = NULL,
10 |   stop = NULL,
11 |   api_key = Sys.getenv("NCBI_API_KEY")
12 | )
13 | }
14 | \arguments{
15 | \item{accession}{nucleotide accession to retrieve.}
16 | 
17 | \item{start}{start base to retrieve, numbered beginning at 1.  If NULL the
18 | beginning of the sequence.}
19 | 
20 | \item{stop}{last base to retrieve, numbered beginning at 1. if NULL the end of
21 | the sequence.}
22 | 
23 | \item{api_key}{NCBI api-key to allow faster sequence retrieval.}
24 | }
25 | \value{
26 | an DNAbin object.
27 | }
28 | \description{
29 | Retrieves a fasta sequence from NCBI nucleotide database.
30 | }
31 | \seealso{
32 | \code{\link{DNAbin}}
33 | }
34 | 


--------------------------------------------------------------------------------
/man/get_sequences.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{get_sequences}
 4 | \alias{get_sequences}
 5 | \title{Retrieves fasta sequences from NCBI nucleotide database.}
 6 | \usage{
 7 | get_sequences(
 8 |   accession,
 9 |   start = NULL,
10 |   stop = NULL,
11 |   api_key = Sys.getenv("NCBI_API_KEY"),
12 |   simplify = TRUE,
13 |   .parallel = FALSE,
14 |   .progress = "none"
15 | )
16 | }
17 | \arguments{
18 | \item{accession}{the accession number of the sequence to retrieve}
19 | 
20 | \item{start}{start bases to retrieve, numbered beginning at 1.  If NULL the
21 | beginning of the sequence.}
22 | 
23 | \item{stop}{stop bases to retrieve, numbered beginning at 1. if NULL the stop of
24 | the sequence.}
25 | 
26 | \item{api_key}{NCBI api-key to allow faster sequence retrieval.}
27 | 
28 | \item{simplify}{simplify the FASTA headers to include only the genbank
29 | accession.}
30 | 
31 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
32 | provided by foreach}
33 | 
34 | \item{.progress}{name of the progress bar to use, see 'create_progress_bar'}
35 | }
36 | \value{
37 | an DNAbin object.
38 | }
39 | \description{
40 | Retrieves fasta sequences from NCBI nucleotide database.
41 | }
42 | \seealso{
43 | \code{\link{DNAbin}}
44 | }
45 | 


--------------------------------------------------------------------------------
/man/get_taxonomy.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/taxonomy.R
 3 | \name{get_taxonomy}
 4 | \alias{get_taxonomy}
 5 | \title{Retrieve the taxonomy information from NCBI for a set of nucleotide gis.}
 6 | \usage{
 7 | get_taxonomy(accessions)
 8 | }
 9 | \arguments{
10 | \item{accessions}{a character vector of the accessions to retrieve}
11 | }
12 | \value{
13 | data.frame of the 'accessions, taxIds, and taxonomy
14 | }
15 | \description{
16 | Retrieve the taxonomy information from NCBI for a set of nucleotide gis.
17 | }
18 | 


--------------------------------------------------------------------------------
/man/identify.primerTree_plot.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{identify.primerTree_plot}
 4 | \alias{identify.primerTree_plot}
 5 | \title{identify the point closest to the mouse click
 6 | only works on single ranks}
 7 | \usage{
 8 | \method{identify}{primerTree_plot}(x, ...)
 9 | }
10 | \arguments{
11 | \item{x}{the plot to identify}
12 | 
13 | \item{...}{additional arguments passed to annotate}
14 | }
15 | \description{
16 | identify the point closest to the mouse click
17 | only works on single ranks
18 | }
19 | 


--------------------------------------------------------------------------------
/man/layout_tree_ape.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{layout_tree_ape}
 4 | \alias{layout_tree_ape}
 5 | \title{layout a tree using ape, return an object to be plotted by
 6 | \code{\link{plot_tree}}}
 7 | \usage{
 8 | layout_tree_ape(tree, ...)
 9 | }
10 | \arguments{
11 | \item{tree}{The \code{\link{phylo}} tree to be plotted}
12 | 
13 | \item{...}{additional arguments to \code{\link{plot.phylo}}}
14 | }
15 | \value{
16 | \item{edge}{list of x, y and xend, yend coordinates
17 | as well as ids for the edges}
18 | \item{tips}{list of x, y, label and id for the tips}
19 | \item{nodes}{list of x, y and id for the nodes}
20 | }
21 | \description{
22 | layout a tree using ape, return an object to be plotted by
23 | \code{\link{plot_tree}}
24 | }
25 | 


--------------------------------------------------------------------------------
/man/mammals_16S.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{data}
 4 | \name{mammals_16S}
 5 | \alias{mammals_16S}
 6 | \title{PrimerTree results for the mammalian 16S primers}
 7 | \description{
 8 | PrimerTree results for the mammalian 16S primers
 9 | }
10 | 


--------------------------------------------------------------------------------
/man/parse_primer_hits.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/search.R
 3 | \name{parse_primer_hits}
 4 | \alias{parse_primer_hits}
 5 | \title{Parse the primer hits}
 6 | \usage{
 7 | parse_primer_hits(response)
 8 | }
 9 | \arguments{
10 | \item{response}{a httr response object obtained from \code{\link{primer_search}}}
11 | }
12 | \description{
13 | Parse the primer hits
14 | }
15 | 


--------------------------------------------------------------------------------
/man/plot.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{plot.primerTree}
 4 | \alias{plot.primerTree}
 5 | \title{plot function for a primerTree object, calls plot_tree_ranks}
 6 | \usage{
 7 | \method{plot}{primerTree}(x, ranks = NULL, main = NULL, ...)
 8 | }
 9 | \arguments{
10 | \item{x}{primerTree object to plot}
11 | 
12 | \item{ranks}{The ranks to include, defaults to all common ranks, if NULL
13 | print all ranks.  If 'none' just print the layout.}
14 | 
15 | \item{main}{an optional title to display, if NULL displays the name as the title}
16 | 
17 | \item{...}{additional arguments passed to plot_tree_ranks}
18 | }
19 | \description{
20 | plot function for a primerTree object, calls plot_tree_ranks
21 | }
22 | \examples{
23 | library(gridExtra)
24 | library(directlabels)
25 | #plot with all common ranks
26 | plot(mammals_16S)
27 | 
28 | #plot only the class
29 | plot(mammals_16S, 'class')
30 | 
31 | #plot the layout only
32 | plot(mammals_16S, 'none')
33 | }
34 | \seealso{
35 | \code{\link{plot_tree_ranks}}, \code{\link{plot_tree}}
36 | }
37 | 


--------------------------------------------------------------------------------
/man/plot_tree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot_tree}
 4 | \alias{plot_tree}
 5 | \title{plots a tree, optionally with colored and labeled points by taxonomic rank}
 6 | \usage{
 7 | plot_tree(
 8 |   tree,
 9 |   type = "unrooted",
10 |   main = NULL,
11 |   guide_size = NULL,
12 |   rank = NULL,
13 |   taxonomy = NULL,
14 |   size = 2,
15 |   legend_cutoff = 25,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{tree}{to be plotted, use layout_tree to layout tree.}
21 | 
22 | \item{type}{The type of tree to plot, default unrooted.}
23 | 
24 | \item{main}{An optional title for the plot}
25 | 
26 | \item{guide_size}{The size of the length guide.  If NULL auto detects a
27 | reasonable size.}
28 | 
29 | \item{rank}{The rank to include, if null only the tree is plotted}
30 | 
31 | \item{taxonomy}{A data.frame with an accession field corresponding to the
32 | tree tip labels.}
33 | 
34 | \item{size}{The size of the colored points}
35 | 
36 | \item{legend_cutoff}{The number of different taxa names after which the
37 | names are no longer printed.}
38 | 
39 | \item{...}{additional arguments passed to \code{\link{layout_tree_ape}}}
40 | }
41 | \value{
42 | plot to be printed.
43 | }
44 | \description{
45 | plots a tree, optionally with colored and labeled points by taxonomic rank
46 | }
47 | 


--------------------------------------------------------------------------------
/man/plot_tree_ranks.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/plot.R
 3 | \name{plot_tree_ranks}
 4 | \alias{plot_tree_ranks}
 5 | \title{plots a tree along with a series of taxonomic ranks}
 6 | \usage{
 7 | plot_tree_ranks(
 8 |   tree,
 9 |   taxonomy,
10 |   main = NULL,
11 |   type = "unrooted",
12 |   ranks = common_ranks,
13 |   size = 2,
14 |   guide_size = NULL,
15 |   legend_cutoff = 25,
16 |   ...
17 | )
18 | }
19 | \arguments{
20 | \item{tree}{to be plotted, use layout_tree to layout tree.}
21 | 
22 | \item{taxonomy}{A data.frame with an accession field corresponding to the
23 | tree tip labels.}
24 | 
25 | \item{main}{An optional title for the plot}
26 | 
27 | \item{type}{The type of tree to plot, default unrooted.}
28 | 
29 | \item{ranks}{The ranks to include, defaults to all common ranks, if null
30 | print all ranks.}
31 | 
32 | \item{size}{The size of the colored points}
33 | 
34 | \item{guide_size}{The size of the length guide.  If NULL auto detects a
35 | reasonable size.}
36 | 
37 | \item{legend_cutoff}{The number of different taxa names after which the
38 | names are no longer printed.}
39 | 
40 | \item{...}{additional arguments passed to \code{\link{layout_tree_ape}}}
41 | }
42 | \description{
43 | plots a tree along with a series of taxonomic ranks
44 | }
45 | \examples{
46 | library(gridExtra)
47 | library(directlabels)
48 | #plot all the common ranks
49 | plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy)
50 | #plot specific ranks, with a larger dot size
51 | plot_tree_ranks(mammals_16S$tree, mammals_16S$taxonomy,
52 |   ranks=c('kingdom', 'class', 'family'), size=3)
53 | }
54 | \seealso{
55 | \code{\link{plot_tree}} to plot only a single rank or the just the
56 | tree layout.
57 | }
58 | 


--------------------------------------------------------------------------------
/man/primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \docType{package}
 4 | \name{primerTree}
 5 | \alias{primerTree}
 6 | \alias{primerTree-package}
 7 | \title{\pkg{primerTree} Visually Assessing the Specificity and Informativeness of Primer Pairs}
 8 | \description{
 9 | \code{primerTree} has two main commands:
10 | \code{\link{search_primer_pair}} which takes a primer pair and returns an
11 | primerTree object of the search results
12 | \code{\link{plot.primerTree}} a S3 method for plotting the primerTree object
13 | obtained using \code{\link{search_primer_pair}}
14 | }
15 | 


--------------------------------------------------------------------------------
/man/primer_search.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/search.R
 3 | \name{primer_search}
 4 | \alias{primer_search}
 5 | \title{Query a pair of primers using ncbi's Primer-BLAST, if primers contain iupac}
 6 | \usage{
 7 | primer_search(
 8 |   forward,
 9 |   reverse,
10 |   num_aligns = 500,
11 |   num_permutations = 25,
12 |   ...,
13 |   .parallel = FALSE,
14 |   .progress = "none"
15 | )
16 | }
17 | \arguments{
18 | \item{forward}{forward primer to search by 5'-3' on plus strand}
19 | 
20 | \item{reverse}{reverse primer to search by 5'-3' on minus strand}
21 | 
22 | \item{num_aligns}{number of alignment results to keep}
23 | 
24 | \item{num_permutations}{the number of primer permutations to search, if the degenerate bases
25 | cause more than this number of permutations to exist, this number will be
26 | sampled from all possible permutations.}
27 | 
28 | \item{...}{additional arguments passed to Primer-Blast}
29 | 
30 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
31 | provided by foreach}
32 | 
33 | \item{.progress}{name of the progress bar to use, see 'create_progress_bar'}
34 | }
35 | \value{
36 | httr response object of the query, pass to \code{\link{parse_primer_hits}} to parse the results.
37 | }
38 | \description{
39 | ambiguity codes, enumerate all possible combinations and combine the
40 | results.
41 | }
42 | 


--------------------------------------------------------------------------------
/man/search_primer_pair.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{search_primer_pair}
 4 | \alias{search_primer_pair}
 5 | \title{Automatic primer searching Search a given primer pair, retrieving the alignment
 6 | results, their product sequences, the taxonomic information for the sequences,
 7 | a multiple alignment of the products}
 8 | \usage{
 9 | search_primer_pair(
10 |   forward,
11 |   reverse,
12 |   name = NULL,
13 |   num_aligns = 500,
14 |   num_permutations = 25,
15 |   simplify = TRUE,
16 |   clustal_options = list(),
17 |   distance_options = list(model = "N", pairwise.deletion = T),
18 |   api_key = Sys.getenv("NCBI_API_KEY"),
19 |   ...,
20 |   .parallel = FALSE,
21 |   .progress = "none"
22 | )
23 | }
24 | \arguments{
25 | \item{forward}{forward primer to search by 5'-3' on plus strand}
26 | 
27 | \item{reverse}{reverse primer to search by 5'-3' on minus strand}
28 | 
29 | \item{name}{name to give to the primer pair}
30 | 
31 | \item{num_aligns}{number of alignment results to keep}
32 | 
33 | \item{num_permutations}{the number of primer permutations to search, if the degenerate bases
34 | cause more than this number of permutations to exist, this number will be
35 | sampled from all possible permutations.}
36 | 
37 | \item{simplify}{use simple names for primer hit results or complex}
38 | 
39 | \item{clustal_options}{a list of options to pass to clustal omega, see
40 | \code{link{clustalo}} for a list of options}
41 | 
42 | \item{distance_options}{a list of options to pass to dist.dna, see
43 | \code{link{dist.dna}} for a list of options}
44 | 
45 | \item{api_key}{NCBI api-key to allow faster sequence retrieval}
46 | 
47 | \item{...}{additional arguments passed to Primer-Blast}
48 | 
49 | \item{.parallel}{if 'TRUE', perform in parallel, using parallel backend
50 | provided by foreach}
51 | 
52 | \item{.progress}{name of the progress bar to use, see
53 | \code{\link{create_progress_bar}}}
54 | }
55 | \value{
56 | A list with the following elements,
57 | \item{name}{name of the primer pair}
58 | \item{BLAST_result}{html blast results from Primer-BLAST as
59 |  'a \code{\link{response}}} object.
60 | \item{taxonomy}{taxonomy for the primer products from NCBI}
61 | \item{sequence}{sequence of the primer products}
62 | \item{alignment}{multiple alignment of the primer products}
63 | \item{tree}{phylogenetic tree of the reconstructed from the
64 | 'multiple alignment}
65 | }
66 | \description{
67 | Automatic primer searching Search a given primer pair, retrieving the alignment
68 | results, their product sequences, the taxonomic information for the sequences,
69 | a multiple alignment of the products
70 | }
71 | \examples{
72 | \dontrun{
73 | #simple search
74 | mammals_16S = search_primer_pair(name='Mammals 16S',
75 |  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT')
76 | #returning 1000 alignments, allow up to 3 mismatches in primer
77 | mammals_16S = search_primer_pair(name='Mammals 16S',
78 |  'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT',
79 |  num_aligns=1000, total_primer_specificity_mismatch=3)
80 | }
81 | }
82 | \seealso{
83 | \code{\link{primer_search}}, \code{\link{clustalo}}
84 | }
85 | 


--------------------------------------------------------------------------------
/man/seq_lengths.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{seq_lengths}
 4 | \alias{seq_lengths}
 5 | \title{Get a summary of sequence lengths from a primerTree object}
 6 | \usage{
 7 | seq_lengths(x, summarize = TRUE)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object.}
11 | 
12 | \item{summarize}{a logical indicating if a summary should be displayed}
13 | }
14 | \value{
15 | a table of sequence length frequencies
16 | }
17 | \description{
18 | Get a summary of sequence lengths from a primerTree object
19 | }
20 | \examples{
21 | 
22 | # Show the counts for each length
23 | seq_lengths(mammals_16S)
24 | 
25 | # Plot the distribution of lengths
26 | seqLengths <- seq_lengths(mammals_16S)
27 | barplot(seqLengths, 
28 |  main = "Frequency of sequence lengths for 16S mammal primers", 
29 |  xlab="Amplicon length (in bp)", 
30 |  ylab=("Frequency"))
31 | }
32 | 


--------------------------------------------------------------------------------
/man/seq_lengths.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/filterResults.R
 3 | \name{seq_lengths.primerTree}
 4 | \alias{seq_lengths.primerTree}
 5 | \title{Method for primerTree objects}
 6 | \usage{
 7 | \method{seq_lengths}{primerTree}(x, summarize = TRUE)
 8 | }
 9 | \arguments{
10 | \item{x}{a primerTree object.}
11 | 
12 | \item{summarize}{a logical indicating if a summary should be displayed}
13 | }
14 | \description{
15 | Method for primerTree objects
16 | }
17 | 


--------------------------------------------------------------------------------
/man/summary.primerTree.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/primerTree.R
 3 | \name{summary.primerTree}
 4 | \alias{summary.primerTree}
 5 | \title{Summarize a primerTree result, printing quantiles of sequence length and
 6 | pairwise differences.}
 7 | \usage{
 8 | \method{summary}{primerTree}(object, ..., probs = c(0, 0.05, 0.5, 0.95, 1), ranks = common_ranks)
 9 | }
10 | \arguments{
11 | \item{object}{the primerTree object to summarise}
12 | 
13 | \item{...}{Ignored options}
14 | 
15 | \item{probs}{quantile probabilities to compute, defaults to 0, 5, 50, 95,
16 | and 100 probabilities.}
17 | 
18 | \item{ranks}{ranks to show unique counts for, defaults to the common ranks}
19 | }
20 | \value{
21 | invisibly returns a list containing the printed results
22 | }
23 | \description{
24 | Summarize a primerTree result, printing quantiles of sequence length and
25 | pairwise differences.
26 | }
27 | 


--------------------------------------------------------------------------------
/man/tree_from_alignment.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/sequence.R
 3 | \name{tree_from_alignment}
 4 | \alias{tree_from_alignment}
 5 | \title{Construct a neighbor joining tree from a dna alignment}
 6 | \usage{
 7 | tree_from_alignment(dna, pairwise.deletion = TRUE, ...)
 8 | }
 9 | \arguments{
10 | \item{dna}{fasta dna object the tree is to be constructed from}
11 | 
12 | \item{pairwise.deletion}{a logical indicating if the distance matrix should 
13 | be constructed using pairwise deletion}
14 | 
15 | \item{...}{furthur arguments to dist.dna}
16 | }
17 | \description{
18 | Construct a neighbor joining tree from a dna alignment
19 | }
20 | \seealso{
21 | \code{\link{dist.dna}}, \code{\link{nj}}
22 | }
23 | 


--------------------------------------------------------------------------------
/paper.Rmd:
--------------------------------------------------------------------------------
  1 | ```{r paper_setup, cache=FALSE, include=FALSE}
  2 | library(knitcitations)
  3 | # create bibtex entries
  4 | write.bibtex(c(knitr=citation('knitr'),
  5 |                knitcitations=citation('knitcitations'), plyr=citation('plyr'),
  6 |                ggplot2=citation('ggplot2'), ape=citation('ape'),
  7 |                directlabels=citation('directlabels'),
  8 |                plyr=citation('plyr'), foreach=citation('foreach')),
  9 |              file='citations.bib')
 10 | bib = read.bibtex('citations.bib')
 11 | ```
 12 | 
 13 | # PrimerTree: Visually Assessing the Specificity and Informativeness of Primer Pairs#
 14 | James Hester and David Serre
 15 | 
 16 | Genomic Medicine Institute, Cleveland Clinic Lerner Research Institute, Cleveland, OH 44195, USA
 17 | # Abstract #
 18 | ## Summary ##
 19 | Designing PCR primers is a critical step in most genetic studies. Primers need
 20 | to be sensitive and specific, as well as yield informative DNA sequences. For
 21 | example, in metagenomic studies, primers need to amplify only DNA from the
 22 | targeted community and amplify DNA sequences that enable differentiating all
 23 | members of this community. Several informatics tools exist for designing
 24 | primers based on a template DNA sequence and for identifying potential target
 25 | sequences in a database. However, there are no tools to systematically analyze
 26 | and visualize the specificity of PCR primers and the informativeness of the
 27 | amplified sequence. PrimerTree is an R package that enables identifying
 28 | potential target sequences for a set of primers and generates
 29 | taxonomically annotated phylogenetic trees with the predicted amplification
 30 | products.
 31 | 
 32 | ## Availability ##
 33 | PrimerTree is an R package released under the GPL-2 license and is available through
 34 | [CRAN](http://cran.rproject.org/web/packages/primerTree/index.html).  Source
 35 | code and developmental versions are available at <http://www.github.com/jimhester/primerTree>.
 36 | 
 37 | ## Contact ##
 38 | [hesterj@ccf.org](mailto:hesterj@ccf.org)
 39 | 
 40 | ## Supplemental Information ##
 41 | Supplementary data are available at Bioinformatics online.
 42 | 
 43 | # Introduction #
 44 | Designing primers for PCR is the initial step in many biomedical, forensic and
 45 | metagenomic studies. There are a number of important factors to consider in
 46 | primer design. First one must consider the chemical properties of the primers,
 47 | including their length, melting temperature, GC content, secondary structure
 48 | and the likelihood of primer dimer formation. The primer pair must also
 49 | amplify specifically DNA from the target of interest and must not produce
 50 | offtarget products. In addition, for some studies, the DNA sequence amplified
 51 | must provide enough information to identify the source of the DNA. In
 52 | metagenomic studies, the primers need to amplify DNA from the taxon of interest
 53 | (e.g., bacteria, fungus, birds), not amplify unrelated taxon and provide enough
 54 | information after sequencing to characterize which members of the communities
 55 | are present in a given sample. Some clinical studies need to amplify only a
 56 | specific pathogen species and enable identifying which strains are present.
 57 | 
 58 | While there are efficient online tools for designing primers `r citep('10.1093/nar/gks596')`
 59 | and testing their specificity `r citep(c(primer_BLAST='10.1186/1471-2105-13-134'))`, there is no simple
 60 | tool to systematically assess the informativeness of the primer products.
 61 | PrimerTree is an R package that, given a set of primer pairs, provides visual
 62 | assessment of their specificity and informativeness by constructing
 63 | phylogenetic trees of the predicted PCR products along with their taxonomic
 64 | annotation. PrimerTree can run on a wide variety of hardware and only requires
 65 | two commands, making it accessible to a large audience.
 66 | 
 67 | # Methods #
 68 | PrimerTree successively performs primer search, retrieval of DNA sequences
 69 | predicted to be amplified, taxonomic identification of these sequences,
 70 | multiple sequence alignment, reconstruction of a phylogenetic tree, and
 71 | visualization of the tree with taxonomic annotation. Note that multiple primer
 72 | pairs can be queried simultaneously using PrimerTree.
 73 | 
 74 | PrimerTree utilizes the primer search implemented in Primer-BLAST
 75 | `r citep(c(primer_BLAST='10.1186/1471-2105-13-134'))`
 76 | by directly querying the NCBI Primer-BLAST search page. This allows using
 77 | all the options available on the NCBI site. By default PrimerTree searches the
 78 | NCBI (nt) nucleotide database but alternative NCBI databases can be chosen
 79 | through the function options. Note that when the primers are degenerated,
 80 | PrimerTree automatically tests all possible combinations of primer sequences
 81 | in Primer-BLAST and merges the results. The primer alignment results are then
 82 | processed using the NCBI Eutilities `r citep('http://www.ncbi.nlm.nih.gov/books/NBK25500/')`
 83 | to i) retrieve DNA sequences located between the primers (i.e., “amplified”)
 84 | and ii) obtain taxonomic information related to each DNA sequence using the
 85 | NCBI’s taxonomy database `r citep('http://www.ncbi.nlm.nih.gov/books/NBK21100/')`.
 86 | 
 87 | PrimerTree next aligns all amplified sequences using Clustal Omega
 88 | `r citep('10.1038/msb.2011.75')` and reconstructs a Neighbor-Joining tree using the ape package
 89 |  `r citep('10.1093/bioinformatics/btg412')`. Finally, PrimerTree displays the resulting phylogenetic
 90 | tree using the ggplot2 package, labeling each taxon in a different color and
 91 | adding the names of the main taxa using the directlabels package
 92 | `r citep(c(bib[['ggplot2']], bib[['directlabels']]))`.
 93 | 
 94 | # Results #
 95 | Figure 1 shows a subset of PrimerTree results for universal non-vascular plant (Bryophyte) primers
 96 | `r citep('10.1111/j.1365-294X.2012.05537.x')` targeting the chloroplast trnL gene (Figure 1A)
 97 | and mammal mitochondrial 16S ribosomal rna gene (Figure1B)
 98 | `r citep('10.1093/oxfordjournals.molbev.a025566')`. The tree display enables
 99 | rapid evaluation of the specificity of the primer pairs (e.g., offtarget
100 | amplification of amphibians and ray-finned fishes on Fig. 1B). In addition, the
101 | information encoded in the amplified DNA sequence can also be easily assessed
102 | by the length of the branches leading to different sequences (scaled in number
103 | of nucleotide differences). By default, PrimerTree displays the annotated
104 | phylogenetic tree for all taxonomic levels (e.g., kindom, phylum, class)
105 | enabling the user to determine the level of specificity of each primers (see
106 | e.g., Supplemental Figure 1).
107 | 
108 | For nondegenerated primers and using a single thread, PrimerTree usually runs
109 | in less than 240 seconds. However, the average runtime varies greatly depending
110 | on the primer specificity (i.e. how many DNA sequences are “amplified”), the
111 | search parameters chosen, and the current load on the NCBI servers and the
112 | internet connection. Highly degenerated primers result in large numbers of
113 | possible primer pairs, which can increase the runtime considerably. To limit
114 | maximum runtime in this situation, PrimerTree randomly samples only a portion
115 | of the total primer permutations. This provides a representative sample for
116 | most cases. Changing the number of sampled permutations or turning off sampling
117 | completely is possible. PrimerTree uses the plyr package extensively and has
118 | full support for any of the parallel backends compatible with the foreach
119 | package `r citep(c(bib[['plyr']], bib[['foreach']]))`. In particular, parallel retrieval of
120 | the primer sequences from NCBI speeds up the total runtime considerably. Note
121 | that parallel queries to Primer-BLAST are queued by NCBI’s servers and only
122 | processed once there is free compute time.
123 | 
124 | # Conclusions #
125 | Designing primers can be challenging, especially for metagenomic study, and
126 | dramatically impact the study results. We have developed the PrimerTree
127 | package to help assessing large numbers of designed primers and choosing the
128 | best primer pair for a given experiment. PrimerTree is an R package available
129 | on a wide variety of platforms and is very simple to use and install.
130 | PrimerTree is released under an opensource license and can be downloaded from
131 | <http://github.com/jimhester/primerTree>, which also provides additional
132 | documentation.
133 | 
134 | ## Funding ##
135 | XXX
136 | 
137 | ## Conflict of Interest ##
138 | None declared
139 | 
140 | # References #
141 | ```{r paper_bibliography, results='asis', echo=F}
142 | bibliography(style='markdown')
143 | ```
144 | 
145 | ```{r paper_get_data, echo=F, eval=F}
146 | library(doMC)
147 | registerDoMC(8)
148 | mammals_16S = search_primer_pair(name='Mammals 16S', 'CGGTTGGGGTGACCTCGGA', 'GCTGTTATCCCTAGGGTAACT', num_aligns=1000, .parallel=T)
149 | bryophytes_trnL = search_primer_pair(name='Bryophyte trnL', 'GATTCAGGGAAACTTAGGTTG', 'CCATTGAGTCTCTGCACC', num_aligns=1000, .parallel=T)
150 | save(file='data/mammals_16S.RData', mammals_16S)
151 | save(file='data/bryophytes_trnL.RData', bryophytes_trnL)
152 | ```
153 | ```{r paper_figure1, dev='png', fig.height=4.5, fig.width=6, warning=FALSE}
154 | change_mapping = function(plot){
155 |   plot$layers[[1]]$geom_params$colour = 'lightgrey'
156 |   plot$layers[[4]]$mapping = aes(x=x, y=y, color=class, shape=class) #add shape to points mapping
157 |   plot$layers[[5]]$mapping = aes(x=x, y=y, label=class) #remove color from direct labels mapping
158 |   plot
159 | }
160 | p1 = change_mapping(plot(mammals_16S, ranks='class', main='Mammal 16S', rotate=45, size=4) + scale_colour_grey(start=0, end=.8)) #scale_color_brewer(palette="Set1"))
161 | p2 = change_mapping(plot(bryophytes_trnL, ranks='class', main='Bryophyte trnL', size=4) + scale_colour_grey(start=0, end=.8)) #scale_color_brewer(palette="Set1"))
162 | library(gridExtra)
163 | p3 = arrangeGrob(p2,p1, ncol=2, clip=F)
164 | grid.draw(p3)
165 | ```
166 | ```{r paper_figure1_big, dev='svg', fig.height=12, fig.width=12, depth=300, dev='postscript', warning=FALSE}
167 | setEPS()
168 | grid.draw(p3)
169 | ```
170 | ```{r paper_supplemental, fig.height=12, fig.width=12, depth=300, dev='postscript', warning=FALSE}
171 | setEPS()
172 | rm(scale_colour_discrete)
173 | plot(mammals_16S)
174 | plot(bryophytes_trnL)
175 | ```
176 | 


--------------------------------------------------------------------------------
/primerTree.Rproj:
--------------------------------------------------------------------------------
 1 | Version: 1.0
 2 | 
 3 | RestoreWorkspace: Default
 4 | SaveWorkspace: Default
 5 | AlwaysSaveHistory: Default
 6 | 
 7 | EnableCodeIndexing: Yes
 8 | UseSpacesForTab: Yes
 9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 | 
12 | RnwWeave: Sweave
13 | LaTeX: pdfLaTeX
14 | 
15 | BuildType: Package
16 | PackageInstallArgs: --no-multiarch --with-keep.source
17 | PackageRoxygenize: rd,namespace
18 | 


--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
 1 | #include <R.h>
 2 | #include <Rinternals.h>
 3 | #include <stdlib.h> // for NULL
 4 | #include <R_ext/Rdynload.h>
 5 | 
 6 | /* FIXME:
 7 |    Check these declarations against the C/Fortran source code.
 8 | */
 9 | 
10 | /* .Call calls */
11 | extern SEXP rawStreamToDNAbin(SEXP);
12 | 
13 | static const R_CallMethodDef CallEntries[] = {
14 |     {"rawStreamToDNAbin", (DL_FUNC) &rawStreamToDNAbin, 1},
15 |     {NULL, NULL, 0}
16 | };
17 | 
18 | void R_init_primerTree(DllInfo *dll)
19 | {
20 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
21 |     R_useDynamicSymbols(dll, FALSE);
22 | }
23 | 


--------------------------------------------------------------------------------
/src/read_dna.c:
--------------------------------------------------------------------------------
  1 | /* read_dna.c       2014-03-05 */
  2 | 
  3 | /* Copyright 2013-2014 Emmanuel Paradis */
  4 | 
  5 | /* This file is part of the R-package `ape'. */
  6 | /* See the file ../COPYING for licensing issues. */
  7 | 
  8 | #include <R.h>
  9 | #include <Rinternals.h>
 10 | 
 11 | // The initial code defining and initialising the translation table:
 12 | //
 13 | //	for (i = 0; i < 122; i++) tab_trans[i] = 0x00;
 14 | //
 15 | //	tab_trans[65] = 0x88; /* A */
 16 | //	tab_trans[71] = 0x48; /* G */
 17 | //	tab_trans[67] = 0x28; /* C */
 18 | // 	tab_trans[84] = 0x18; /* T */
 19 | // 	tab_trans[82] = 0xc0; /* R */
 20 | // 	tab_trans[77] = 0xa0; /* M */
 21 | // 	tab_trans[87] = 0x90; /* W */
 22 | // 	tab_trans[83] = 0x60; /* S */
 23 | // 	tab_trans[75] = 0x50; /* K */
 24 | // 	tab_trans[89] = 0x30; /* Y */
 25 | // 	tab_trans[86] = 0xe0; /* V */
 26 | // 	tab_trans[72] = 0xb0; /* H */
 27 | // 	tab_trans[68] = 0xd0; /* D */
 28 | //  	tab_trans[66] = 0x70; /* B */
 29 | // 	tab_trans[78] = 0xf0; /* N */
 30 | //
 31 | //	tab_trans[97] = 0x88; /* a */
 32 | //	tab_trans[103] = 0x48; /* g */
 33 | //	tab_trans[99] = 0x28; /* c */
 34 | // 	tab_trans[116] = 0x18; /* t */
 35 | // 	tab_trans[114] = 0xc0; /* r */
 36 | // 	tab_trans[109] = 0xa0; /* m */
 37 | // 	tab_trans[119] = 0x90; /* w */
 38 | // 	tab_trans[115] = 0x60; /* s */
 39 | // 	tab_trans[107] = 0x50; /* k */
 40 | // 	tab_trans[121] = 0x30; /* y */
 41 | // 	tab_trans[118] = 0xe0; /* v */
 42 | // 	tab_trans[104] = 0xb0; /* h */
 43 | // 	tab_trans[100] = 0xd0; /* d */
 44 | //  	tab_trans[98] = 0x70; /* b */
 45 | // 	tab_trans[110] = 0xf0; /* n */
 46 | //
 47 | //  	tab_trans[45] = 0x04; /* - */
 48 | //  	tab_trans[63] = 0x02; /* ? */
 49 | 
 50 | static const unsigned char tab_trans[] = {
 51 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0-9 */
 52 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10-19 */
 53 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 20-29 */
 54 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 30-39 */
 55 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, /* 40-49 */
 56 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 50-59 */
 57 | 	0x00, 0x00, 0x00, 0x02, 0x00, 0x88, 0x70, 0x28, 0xd0, 0x00, /* 60-69 */
 58 | 	0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, 0xf0, 0x00, /* 70-79 */
 59 | 	0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, 0x00, 0x30, /* 80-89 */
 60 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x88, 0x70, 0x28, /* 90-99 */
 61 | 	0xd0, 0x00, 0x00, 0x48, 0xb0, 0x00, 0x00, 0x50, 0x00, 0xa0, /* 100-109 */
 62 | 	0xf0, 0x00, 0x00, 0x00, 0xc0, 0x60, 0x18, 0x00, 0xe0, 0x90, /* 110-119 */
 63 | 	0x00, 0x30, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 120-129 */
 64 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 130-139 */
 65 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 140-149 */
 66 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 150-159 */
 67 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 160-169 */
 68 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 170-179 */
 69 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 180-189 */
 70 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 190-199 */
 71 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 200-209 */
 72 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 210-219 */
 73 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 220-229 */
 74 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 230-239 */
 75 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 240-249 */
 76 | 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; /* 250-255 */
 77 | 
 78 | static const unsigned char hook = 0x3e;
 79 | static const unsigned char lineFeed = 0x0a;
 80 | /* static const unsigned char space = 0x20; */
 81 | 
 82 | SEXP rawStreamToDNAbin(SEXP x)
 83 | {
 84 | 	int N, i, j, k, n, startOfSeq;
 85 | 	unsigned char *xr, *rseq, *buffer, tmp;
 86 | 	SEXP obj, nms, seq;
 87 | 
 88 | 	PROTECT(x = coerceVector(x, RAWSXP));
 89 | 	N = LENGTH(x);
 90 | 	xr = RAW(x);
 91 | 
 92 | /* do a 1st pass to find the number of sequences
 93 | 
 94 |    this code should be robust to '>' present inside
 95 |    a label or in the header text before the sequences */
 96 | 
 97 | 	n = j = 0; /* use j as a flag */
 98 | 	if (xr[0] == hook) {
 99 | 		j = 1;
100 | 		startOfSeq = 0;
101 | 	}
102 | 	i = 1;
103 | 	for (i = 1; i < N; i++) {
104 | 		if (j && xr[i] == lineFeed) {
105 | 			n++;
106 | 			j = 0;
107 | 		} else if (xr[i] == hook) {
108 | 			if (!n) startOfSeq = i;
109 | 			j = 1;
110 | 		}
111 | 	}
112 | 
113 | 	PROTECT(obj = allocVector(VECSXP, n));
114 | 	PROTECT(nms = allocVector(STRSXP, n));
115 | 
116 | /* Refine the way the size of the buffer is set? */
117 | 	buffer = (unsigned char *)R_alloc(N, sizeof(unsigned char *));
118 | 
119 | 	i = startOfSeq;
120 | 	j = 0; /* gives the index of the sequence */
121 | 	while (i < N) {
122 | 		/* 1st read the label... */
123 | 		i++;
124 | 		k = 0;
125 | 		while (xr[i] != lineFeed) buffer[k++] = xr[i++];
126 | 		buffer[k] = '\0';
127 | 		SET_STRING_ELT(nms, j, mkChar((char *)buffer));
128 | 		/* ... then read the sequence */
129 | 		n = 0;
130 | 		while (i < N && xr[i] != hook) {
131 | 			tmp = tab_trans[xr[i++]];
132 | /* If we are sure that the FASTA file is correct (ie, the sequence on
133 |    a single line and only the IUAPC code (plus '-' and '?') is used,
134 |    then the following check would not be needed; additionally the size
135 |    of tab_trans could be restriced to 0-121. This check has the
136 |    advantage that all invalid characters are simply ignored without
137 |    causing error -- except if '>' occurs in the middle of a sequence. */
138 | 			if (tmp) buffer[n++] = tmp;
139 | 		}
140 | 		PROTECT(seq = allocVector(RAWSXP, n));
141 | 		rseq = RAW(seq);
142 | 		for (k = 0; k < n; k++) rseq[k] = buffer[k];
143 | 		SET_VECTOR_ELT(obj, j, seq);
144 | 		UNPROTECT(1);
145 | 		j++;
146 | 	}
147 | 	setAttrib(obj, R_NamesSymbol, nms);
148 | 	UNPROTECT(3);
149 | 	return obj;
150 | }
151 | 


--------------------------------------------------------------------------------