├── .Rbuildignore ├── .clang-format ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── catch.R ├── diagnostics.R ├── nse.R ├── register.R ├── sourcetools.R └── util.R ├── README.Rmd ├── README.html ├── README.md ├── TODO.md ├── benchmark ├── benchmark-parser.R ├── benchmark-read.R └── benchmark-tokenizer.R ├── configure.R ├── inst └── include │ ├── sourcetools.h │ └── sourcetools │ ├── collection │ ├── Position.h │ ├── Range.h │ └── collection.h │ ├── completion │ ├── CodeCompletion.h │ └── completion.h │ ├── core │ ├── config.h │ ├── core.h │ ├── macros.h │ └── util.h │ ├── cursor │ ├── TextCursor.h │ ├── TokenCursor.h │ └── cursor.h │ ├── diagnostics │ ├── Checkers.h │ ├── Diagnostic.h │ ├── DiagnosticsSet.h │ └── diagnostics.h │ ├── multibyte │ └── multibyte.h │ ├── parse │ ├── ParseError.h │ ├── ParseNode.h │ ├── ParseStatus.h │ ├── Parser.h │ ├── Precedence.h │ └── parse.h │ ├── platform │ └── platform.h │ ├── r │ ├── RCallRecurser.h │ ├── RConverter.h │ ├── RFunctions.h │ ├── RHeaders.h │ ├── RNonStandardEvaluation.h │ ├── RProtect.h │ ├── RUtils.h │ └── r.h │ ├── read │ ├── MemoryMappedReader.h │ ├── posix │ │ ├── FileConnection.h │ │ └── MemoryMappedConnection.h │ ├── read.h │ └── windows │ │ ├── FileConnection.h │ │ └── MemoryMappedConnection.h │ ├── tokenization │ ├── Registration.h │ ├── Token.h │ ├── Tokenizer.h │ └── tokenization.h │ ├── utf8 │ └── utf8.h │ └── validation │ ├── SyntaxValidator.h │ └── validation.h ├── man ├── read.Rd ├── register_routines.Rd ├── tokenize-methods.Rd └── validate_syntax.Rd ├── notes └── notes-tdop.R ├── sourcetools.Rproj ├── src ├── Makevars ├── Makevars.win ├── NSE.cpp ├── Parser.cpp ├── Reader.cpp ├── Tokenizer.cpp ├── ValidateSyntax.cpp ├── sourcetools-init.c ├── test-Parser.cpp ├── test-Tokenizer.cpp ├── test-multibyte.cpp ├── test-r.cpp └── test-runner.cpp ├── tests ├── testthat.R └── testthat │ ├── helper-aaa.R │ ├── 
helper-utf8.R │ ├── test-catch.R │ ├── test-diagnostics.R │ ├── test-parser.R │ ├── test-read.R │ └── test-tokenize.R └── tools └── header-guards.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rhistory$ 3 | ^\.Rproj\.user$ 4 | ^\.clang-format$ 5 | ^\.gitignore$ 6 | ^\.travis\.yml$ 7 | ^appveyor\.yml$ 8 | ^configure\.R$ 9 | ^README\.Rmd$ 10 | ^TODO\.md$ 11 | ^benchmark/ 12 | ^notes/ 13 | ^tools/ 14 | ^travis/ 15 | ^src/*\.s?o 16 | ^\.github$ 17 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: true 6 | AlignConsecutiveAssignments: false 7 | AlignEscapedNewlinesLeft: false 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: false 12 | AllowShortCaseLabelsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: false 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakBeforeMultilineStrings: false 18 | AlwaysBreakTemplateDeclarations: false 19 | BinPackArguments: true 20 | BinPackParameters: true 21 | BreakBeforeBinaryOperators: None 22 | BreakBeforeBraces: Mozilla 23 | BreakBeforeTernaryOperators: true 24 | BreakConstructorInitializersBeforeComma: false 25 | ColumnLimit: 80 26 | CommentPragmas: '^ IWYU pragma:' 27 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 28 | ConstructorInitializerIndentWidth: 4 29 | ContinuationIndentWidth: 4 30 | Cpp11BracedListStyle: true 31 | DerivePointerAlignment: false 32 | DisableFormat: false 33 | ExperimentalAutoDetectBinPacking: false 34 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 35 | IndentCaseLabels: false 36 | 
IndentWidth: 2 37 | IndentWrappedFunctionNames: false 38 | KeepEmptyLinesAtTheStartOfBlocks: true 39 | MacroBlockBegin: '' 40 | MacroBlockEnd: '' 41 | MaxEmptyLinesToKeep: 1 42 | NamespaceIndentation: None 43 | ObjCBlockIndentWidth: 2 44 | ObjCSpaceAfterProperty: false 45 | ObjCSpaceBeforeProtocolList: true 46 | PenaltyBreakBeforeFirstCallParameter: 19 47 | PenaltyBreakComment: 300 48 | PenaltyBreakFirstLessLess: 120 49 | PenaltyBreakString: 1000 50 | PenaltyExcessCharacter: 1000000 51 | PenaltyReturnTypeOnItsOwnLine: 60 52 | PointerAlignment: Left 53 | SpaceAfterCStyleCast: false 54 | SpaceBeforeAssignmentOperators: true 55 | SpaceBeforeParens: ControlStatements 56 | SpaceInEmptyParentheses: false 57 | SpacesBeforeTrailingComments: 2 58 | SpacesInAngles: false 59 | SpacesInContainerLiterals: true 60 | SpacesInCStyleCastParentheses: false 61 | SpacesInParentheses: false 62 | SpacesInSquareBrackets: false 63 | Standard: Cpp11 64 | TabWidth: 8 65 | UseTab: Never 66 | ... 67 | 68 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | R_KEEP_PKG_SOURCE: yes 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::rcmdcheck 28 | needs: check 29 | 30 | - uses: r-lib/actions/check-r-package@v2 31 | with: 32 | upload-snapshots: true 33 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | src/*.o 4 | src/*.o-* 5 | src/*.so 6 | src/*.dll 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sourcetools 2 | Type: Package 3 | Title: Tools for Reading, Tokenizing and Parsing R Code 4 | Version: 0.1.7-9000 5 | Author: Kevin Ushey 6 | Maintainer: Kevin Ushey 7 | Description: Tools for Reading, Tokenizing and Parsing R Code. 
8 | License: MIT + file LICENSE 9 | LazyData: TRUE 10 | Depends: 11 | R (>= 3.0.2) 12 | Suggests: 13 | testthat 14 | LinkingTo: 15 | testthat (>= 1.0.2) 16 | RoxygenNote: 7.1.1 17 | BugReports: https://github.com/kevinushey/sourcetools/issues 18 | Encoding: UTF-8 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2017 Kevin Ushey 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,RTokens) 4 | export(read) 5 | export(read_bytes) 6 | export(read_lines) 7 | export(read_lines_bytes) 8 | export(tokenize) 9 | export(tokenize_file) 10 | export(tokenize_string) 11 | export(validate_syntax) 12 | useDynLib(sourcetools, .registration = TRUE) 13 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## sourcetools 0.2.0 (UNRELEASED) 3 | 4 | - Remove calls to `std::sprintf()`. 5 | 6 | - Support `=>` pipe-bind operator, to be introduced in R 4.1.0. 7 | 8 | - Support `|>` pipe operator, to be introduced in R 4.1.0. 9 | 10 | - Support raw string syntax, introduced in R 4.0.0. 11 | 12 | ## sourcetools 0.1.7 13 | 14 | - Ensure tests pass on platforms where `char` is unsigned. (#21) 15 | 16 | ## sourcetools 0.1.6 17 | 18 | - Register native routines. 19 | 20 | ## sourcetools 0.1.5 21 | 22 | - More work to ensure `sourcetools` can build on Solaris. 23 | 24 | ## sourcetools 0.1.4 25 | 26 | - More work to ensure `sourcetools` can build on Solaris. 27 | 28 | ## sourcetools 0.1.3 29 | 30 | - Relax C++11 requirement, to ensure that `sourcetools` can 31 | build on machines with older compilers (e.g. gcc 4.4). 32 | 33 | ## sourcetools 0.1.2 34 | 35 | - Disable failing tests on Solaris. 36 | 37 | ## sourcetools 0.1.1 38 | 39 | - Rename token type `ERR` to `INVALID` to fix build errors 40 | on Solaris. 41 | 42 | ## sourcetools 0.1.0 43 | 44 | ### Features 45 | 46 | The first release of `sourcetools` comes with a small set 47 | of features exposed to R: 48 | 49 | - `read(file)`: Read a file (as a string). 
Similar to 50 | `readChar()`, but faster (and maybe be optimized to 51 | use a memory mapped file reader in the future). 52 | 53 | - `tokenize_file(file)`: Tokenize an R script. 54 | 55 | - `tokenize_string(string)`: Tokenize a string of R code. 56 | -------------------------------------------------------------------------------- /R/catch.R: -------------------------------------------------------------------------------- 1 | (function() .Call(run_testthat_tests)) 2 | -------------------------------------------------------------------------------- /R/diagnostics.R: -------------------------------------------------------------------------------- 1 | diagnose_string <- function(string) { 2 | .Call(sourcetools_diagnose_string, as.character(string)) 3 | } 4 | 5 | diagnose_file <- function(file) { 6 | diagnose_string(read(file)) 7 | } 8 | -------------------------------------------------------------------------------- /R/nse.R: -------------------------------------------------------------------------------- 1 | 2 | performs_nse <- function(...) { 3 | .Call(sourcetools_performs_nse, list(...)) 4 | } 5 | -------------------------------------------------------------------------------- /R/register.R: -------------------------------------------------------------------------------- 1 | #' Register Native Routines 2 | #' 3 | #' Discover and register native routines in a package. 4 | #' Functions to be registered should be prefixed with the 5 | #' `// [[export()]]` attribute. 6 | #' 7 | #' @param package The path to an \R package. 8 | #' @param prefix The prefix to assign to the \R objects 9 | #' generated that map to each routine. 10 | #' @param dynamic.symbols Boolean; should dynamic symbol lookup 11 | #' be enabled? 
12 | #' 13 | register_routines <- function(package = ".", 14 | prefix = "C_", 15 | dynamic.symbols = FALSE) 16 | { 17 | # read DESCRIPTION file 18 | desc_path <- file.path(package, "DESCRIPTION") 19 | if (!file.exists(desc_path)) { 20 | fmt <- "no DESCRIPTION at path '%s'" 21 | stop(sprintf(fmt, desc_path)) 22 | } 23 | desc <- read.dcf(desc_path, all = TRUE) 24 | pkg_name <- desc$Package 25 | 26 | # find C, C++ files in package 27 | srcfiles <- list.files( 28 | package, 29 | pattern = "\\.(?:h|c|cc|cpp)$", 30 | full.names = TRUE, 31 | recursive = TRUE 32 | ) 33 | 34 | # discover routines in these files 35 | routines <- unlist( 36 | lapply(srcfiles, discover_routines), 37 | recursive = FALSE 38 | ) 39 | 40 | # generate prototypes based on routines 41 | prototypes <- generate_prototypes(routines) 42 | 43 | # separate routines based on declared export type 44 | call_routines <- external_routines <- list() 45 | lapply(routines, function(routine) { 46 | 47 | # extract registration text and discover the interface 48 | pieces <- strsplit(routine$registration, "\\[\\[|\\]\\]")[[1]] 49 | code <- utils::tail(pieces, 1) 50 | parsed <- tryCatch( 51 | parse(text = code)[[1]], 52 | error = function(e) { 53 | warning("failed to parse registration comment '", routine$registration, "'") 54 | } 55 | ) 56 | 57 | interface <- as.character(parsed[[2]]) 58 | if (interface == ".Call") { 59 | call_routines[[length(call_routines) + 1]] <<- routine 60 | } else if (interface == ".External") { 61 | external_routines[[length(external_routines) + 1]] <<- routine 62 | } else { 63 | warning("unrecognized / unsupported interface '", interface, "'") 64 | } 65 | 66 | }) 67 | 68 | # generate method definitions for each 69 | call_methods <- generate_call_methods(call_routines, prefix = prefix) 70 | external_methods <- generate_external_methods(external_routines, prefix = prefix) 71 | 72 | # generate initialization routine 73 | r_init <- generate_r_init(pkg_name = pkg_name, 74 | call_methods = 
call_methods, 75 | external_methods = external_methods, 76 | dynamic_symbols = dynamic.symbols) 77 | 78 | # generate script 79 | script <- c( 80 | "// This file was automatically generated.", 81 | "", 82 | "#include ", 83 | "#include ", 84 | "#include ", 85 | "", 86 | prototypes, 87 | "", 88 | call_methods, 89 | "", 90 | external_methods, 91 | "", 92 | r_init 93 | ) 94 | 95 | # write to init file 96 | init_path <- sub("^\\./", "", file.path(package, sprintf("src/%s-init.c", pkg_name))) 97 | writeLines(script, init_path, sep = "\n") 98 | message("* Wrote registration metadata to '", init_path, "'") 99 | 100 | # remind about .registration = TRUE 101 | check_namespace_symbol_registration(package) 102 | invisible(init_path) 103 | } 104 | 105 | discover_routines <- function(file) { 106 | contents <- readBin(file, what = raw(), n = file.info(file)$size) 107 | 108 | # find routines for registration 109 | re_registration <- "//[[:space:]*]\\[\\[export" 110 | if (length(contents) < re_registration) 111 | return(list()) 112 | 113 | matches <- grepRaw(re_registration, contents, all = TRUE) 114 | lapply(matches, function(match) { 115 | 116 | # find bounds for function prototype 117 | start <- grepRaw("\n", contents, offset = match) + 1 118 | end <- grepRaw("\\{|;", contents, offset = start) - 1 119 | 120 | # extract the routine type 121 | registration <- rawToChar(contents[match:(start - 2)]) 122 | 123 | # extract all 'SEXP .*' pieces of function 124 | prototype <- rawToChar(contents[start:end]) 125 | m <- gregexpr("SEXP[[:space:]+]([[:alnum:]_])+", prototype) 126 | names <- regmatches(prototype, m)[[1]] 127 | 128 | list( 129 | registration = registration, 130 | prototype = prototype, 131 | name = names[[1]], 132 | arguments = names[-1L] 133 | ) 134 | 135 | }) 136 | 137 | } 138 | 139 | check_namespace_symbol_registration <- function(package = ".") { 140 | 141 | # check for namespace file 142 | ns_path <- file.path(package, "NAMESPACE") 143 | if (!file.exists(ns_path)) 144 | 
return(invisible(FALSE)) 145 | 146 | # try parsing the namespace 147 | ns <- parse(ns_path) 148 | 149 | # try finding a call to 'useDynLib(pkg, .registration = TRUE)' 150 | for (entry in ns) { 151 | if (identical(entry[[1]], as.name("useDynLib"))) { 152 | nm <- names(entry) 153 | idx <- which(nm == ".registration") 154 | if (length(idx) != 1) 155 | break 156 | 157 | if (isTRUE(entry[[idx]])) 158 | return(invisible(TRUE)) 159 | } 160 | } 161 | 162 | invisible(FALSE) 163 | } 164 | 165 | generate_prototypes <- function(routines) { 166 | # TODO: we assume only SEXP interfaces here 167 | vapply(routines, function(routine) { 168 | arglist <- paste(rep("SEXP", length(routine$arguments)), collapse = ", ") 169 | sprintf("%s(%s);", routine$name, arglist) 170 | }, character(1)) 171 | } 172 | 173 | generate_call_methods <- function(routines, prefix = "C_") { 174 | 175 | # for each routine, generate a registration line 176 | fmt <- '{"%s", (DL_FUNC) &%s, %i},' 177 | lines <- vapply(routines, function(routine) { 178 | name <- utils::tail(strsplit(routine$name, "[[:space:]+]")[[1]], 1) 179 | prefixed_name <- paste0(prefix, name) 180 | n <- length(routine$arguments) 181 | sprintf(fmt, prefixed_name, name, n) 182 | }, character(1)) 183 | 184 | # indent, add commas, add null entry at end 185 | lines <- c(lines, "{NULL, NULL, 0}") 186 | 187 | c( 188 | "static R_CallMethodDef callMethods[] = {", 189 | paste0("\t", lines), 190 | "};" 191 | ) 192 | 193 | } 194 | 195 | generate_external_methods <- function(routines, prefix = "C_") { 196 | # TODO 197 | character() 198 | } 199 | 200 | generate_r_init <- function(pkg_name, 201 | call_methods, 202 | external_methods, 203 | dynamic_symbols) 204 | { 205 | r_register_routines <- sprintf( 206 | "\tR_registerRoutines(info, %s, %s, %s, %s);", 207 | "NULL", 208 | if (length(call_methods)) "callMethods" else "NULL", 209 | "NULL", 210 | if (length(external_methods)) "externalMethods" else "NULL" 211 | ) 212 | 213 | fmt <- paste( 214 | "void 
R_init_%s(DllInfo* info) {", 215 | r_register_routines, 216 | "\tR_useDynamicSymbols(info, %s);", 217 | "}", 218 | sep = "\n", collapse = "\n" 219 | ) 220 | 221 | sprintf(fmt, pkg_name, if (dynamic_symbols) "TRUE" else "FALSE") 222 | 223 | } 224 | -------------------------------------------------------------------------------- /R/sourcetools.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib sourcetools, .registration = TRUE 2 | NULL 3 | 4 | #' Read the Contents of a File 5 | #' 6 | #' Read the contents of a file into a string (or, in the case of 7 | #' \code{read_lines}, a vector of strings). 8 | #' 9 | #' @param path A file path. 10 | #' 11 | #' @name read 12 | #' @rdname read 13 | #' @export 14 | read <- function(path) { 15 | path <- normalizePath(path, mustWork = TRUE) 16 | .Call(sourcetools_read, path) 17 | } 18 | 19 | #' @name read 20 | #' @rdname read 21 | #' @export 22 | read_lines <- function(path) { 23 | path <- normalizePath(path, mustWork = TRUE) 24 | .Call(sourcetools_read_lines, path) 25 | } 26 | 27 | #' @name read 28 | #' @rdname read 29 | #' @export 30 | read_bytes <- function(path) { 31 | path <- normalizePath(path, mustWork = TRUE) 32 | .Call(sourcetools_read_bytes, path) 33 | } 34 | 35 | #' @name read 36 | #' @rdname read 37 | #' @export 38 | read_lines_bytes <- function(path) { 39 | path <- normalizePath(path, mustWork = TRUE) 40 | .Call(sourcetools_read_lines_bytes, path) 41 | } 42 | 43 | #' Tokenize R Code 44 | #' 45 | #' Tools for tokenizing \R code. 46 | #' 47 | #' @param file,path A file path. 48 | #' @param text,string \R code as a character vector of length one. 49 | #' 50 | #' @note Line numbers are determined by existence of the \code{\\n} 51 | #' line feed character, under the assumption that code being tokenized 52 | #' will use either \code{\\n} to indicate newlines (as on modern 53 | #' Unix systems), or \code{\\r\\n} as on Windows. 
54 | #' 55 | #' @return A \code{data.frame} with the following columns: 56 | #' 57 | #' \tabular{ll}{ 58 | #' \code{value} \tab The token's contents, as a string. \cr 59 | #' \code{row} \tab The row where the token is located. \cr 60 | #' \code{column} \tab The column where the token is located. \cr 61 | #' \code{type} \tab The token type, as a string. \cr 62 | #' } 63 | #' 64 | #' @rdname tokenize-methods 65 | #' @export 66 | #' @examples 67 | #' tokenize_string("x <- 1 + 2") 68 | tokenize_file <- function(path) { 69 | path <- normalizePath(path, mustWork = TRUE) 70 | .Call(sourcetools_tokenize_file, path) 71 | } 72 | 73 | #' @rdname tokenize-methods 74 | #' @export 75 | tokenize_string <- function(string) { 76 | .Call(sourcetools_tokenize_string, as.character(string)) 77 | } 78 | 79 | #' @rdname tokenize-methods 80 | #' @export 81 | tokenize <- function(file = "", text = NULL) { 82 | if (is.null(text)) 83 | text <- read(file) 84 | tokenize_string(text) 85 | } 86 | 87 | #' Find Syntax Errors 88 | #' 89 | #' Find syntax errors in a string of \R code. 90 | #' 91 | #' @param string A character vector (of length one). 92 | #' @export 93 | validate_syntax <- function(string) { 94 | .Call(sourcetools_validate_syntax, as.character(string)) 95 | } 96 | 97 | #' @export 98 | print.RTokens <- function(x, ...) { 99 | print.data.frame(x, ...) 
100 | } 101 | 102 | parse_string <- function(string) { 103 | .Call(sourcetools_parse_string, string) 104 | } 105 | 106 | parse_file <- function(file) { 107 | parse_string(read(file)) 108 | } 109 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | search_objects <- function() { 2 | lapply(seq_along(search()), function(i) { 3 | ls(pos = i, all.names = TRUE) 4 | }) 5 | } 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | ```{r setup, include=FALSE} 2 | library(sourcetools) 3 | library(microbenchmark) 4 | ``` 5 | 6 | 7 | [![R-CMD-check](https://github.com/kevinushey/sourcetools/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kevinushey/sourcetools/actions/workflows/R-CMD-check.yaml) 8 | 9 | 10 | # sourcetools 11 | 12 | Tools for reading, tokenizing, and (eventually) parsing `R` code. 13 | 14 | ## Getting Started 15 | 16 | You can install `sourcetools` from CRAN with: 17 | 18 | ```{r, eval=FALSE} 19 | install.packages("sourcetools") 20 | ``` 21 | 22 | Or, you can install the development version from GitHub with: 23 | 24 | ```{r, eval=FALSE} 25 | devtools::install_github("kevinushey/sourcetools") 26 | ``` 27 | 28 | ## Reading 29 | 30 | `sourcetools` comes with a couple fast functions for reading 31 | files into `R`. 32 | 33 | Use `read()` and `read_lines()` to quickly read a file into 34 | `R` as character vectors. `read_lines()` handles both 35 | Windows style `\r\n` line endings, as well as Unix-style 36 | `\n` endings. Performance is on par with the readers 37 | provided by the 38 | [readr](https://cran.r-project.org/package=readr) package. 
39 | 40 | ```{r} 41 | text <- replicate(10000, { 42 | paste(sample(letters, 200, TRUE), collapse = "") 43 | }) 44 | file <- tempfile() 45 | cat(text, file = file, sep = "\n") 46 | mb <- microbenchmark::microbenchmark(times = 10, 47 | base::readLines(file), 48 | readr::read_lines(file), 49 | sourcetools::read_lines(file) 50 | ) 51 | sm <- summary(mb) 52 | print(sm[c("expr", "mean", "median")], digits = 3) 53 | unlink(file) 54 | ``` 55 | 56 | ## Tokenization 57 | 58 | `sourcetools` provides the `tokenize_string()` and 59 | `tokenize_file()` functions for generating a tokenized 60 | representation of R code. These produce 'raw' tokenized 61 | representations of the code, with each token's value as a 62 | string, and a recorded row, column, and type: 63 | 64 | ```{r} 65 | tokenize_string("if (x < 10) 20") 66 | ``` 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![R-CMD-check](https://github.com/kevinushey/sourcetools/workflows/R-CMD-check/badge.svg)](https://github.com/kevinushey/sourcetools/actions) 5 | 6 | 7 | # sourcetools 8 | 9 | Tools for reading, tokenizing, and (eventually) parsing `R` code. 10 | 11 | ## Getting Started 12 | 13 | You can install `sourcetools` from CRAN with: 14 | 15 | 16 | ```r 17 | install.packages("sourcetools") 18 | ``` 19 | 20 | Or, you can install the development version from GitHub with: 21 | 22 | 23 | ```r 24 | devtools::install_github("kevinushey/sourcetools") 25 | ``` 26 | 27 | ## Reading 28 | 29 | `sourcetools` comes with a couple fast functions for reading 30 | files into `R`. 31 | 32 | Use `read()` and `read_lines()` to quickly read a file into 33 | `R` as character vectors. `read_lines()` handles both 34 | Windows style `\r\n` line endings, as well as Unix-style 35 | `\n` endings. 
Performance is on par with the readers 36 | provided by the 37 | [readr](https://cran.r-project.org/package=readr) package. 38 | 39 | 40 | ```r 41 | text <- replicate(10000, { 42 | paste(sample(letters, 200, TRUE), collapse = "") 43 | }) 44 | file <- tempfile() 45 | cat(text, file = file, sep = "\n") 46 | mb <- microbenchmark::microbenchmark(times = 10, 47 | base::readLines(file), 48 | readr::read_lines(file), 49 | sourcetools::read_lines(file) 50 | ) 51 | sm <- summary(mb) 52 | print(sm[c("expr", "mean", "median")], digits = 3) 53 | ``` 54 | 55 | ``` 56 | ## expr mean median 57 | ## 1 base::readLines(file) 17.29 16.22 58 | ## 2 readr::read_lines(file) 30.70 8.11 59 | ## 3 sourcetools::read_lines(file) 6.67 6.43 60 | ``` 61 | 62 | ```r 63 | unlink(file) 64 | ``` 65 | 66 | ## Tokenization 67 | 68 | `sourcetools` provides the `tokenize_string()` and 69 | `tokenize_file()` functions for generating a tokenized 70 | representation of R code. These produce 'raw' tokenized 71 | representations of the code, with each token's value as a 72 | string, and a recorded row, column, and type: 73 | 74 | 75 | ```r 76 | tokenize_string("if (x < 10) 20") 77 | ``` 78 | 79 | ``` 80 | ## value row column type 81 | ## 1 if 1 1 keyword 82 | ## 2 1 3 whitespace 83 | ## 3 ( 1 4 bracket 84 | ## 4 x 1 5 symbol 85 | ## 5 1 6 whitespace 86 | ## 6 < 1 7 operator 87 | ## 7 1 8 whitespace 88 | ## 8 10 1 9 number 89 | ## 9 ) 1 11 bracket 90 | ## 10 1 12 whitespace 91 | ## 11 20 1 13 number 92 | ``` 93 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | Parser 2 | ====== 3 | 4 | - `::` and `:::` are only permitted within certain contexts; the parser is currently permissive about where these tokens are found. 5 | 6 | - Equality operators (`<`, `<=`, `>`, `>=`, `=`, `!=`) can only occur once within the same level of an expression. 
7 | 8 | - `->` and `->>` need to be translated into `<-` and `<<-` when generating the R parse tree. 9 | 10 | 11 | 12 | Syntax Validator 13 | ================ 14 | 15 | Remove it? It really just tries to check for parse errors but the parser itself is equipped to do that. 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /benchmark/benchmark-parser.R: -------------------------------------------------------------------------------- 1 | library(sourcetools) 2 | library(microbenchmark) 3 | 4 | files <- list.files("R", full.names = TRUE) 5 | for (file in files) { 6 | 7 | mb <- microbenchmark( 8 | R = base::parse(file, keep.source = FALSE), 9 | ST = sourcetools:::parse_file(file) 10 | ) 11 | 12 | print(mb) 13 | 14 | contents <- sourcetools:::read(file) 15 | 16 | mb <- microbenchmark( 17 | R = base::parse(text = contents, keep.source = FALSE), 18 | ST = sourcetools:::parse_string(contents) 19 | ) 20 | 21 | sourcetools:::check_parse(contents) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /benchmark/benchmark-read.R: -------------------------------------------------------------------------------- 1 | library(sourcetools) 2 | library(microbenchmark) 3 | 4 | file <- tempfile() 5 | 6 | n <- 1024 7 | junk <- replicate(1E4, { 8 | paste(sample(letters, n, TRUE), collapse = "") 9 | }) 10 | writeLines(junk, con = file) 11 | 12 | stopifnot(identical( 13 | read(file), 14 | readChar(file, file.info(file)$size, TRUE) 15 | )) 16 | 17 | stopifnot(identical( 18 | readLines(file), 19 | read_lines(file) 20 | )) 21 | 22 | # read a file into a string 23 | mb <- microbenchmark( 24 | sourcetools::read(file), 25 | base::readChar(file, file.info(file)$size, TRUE), 26 | readr::read_file(file) 27 | ) 28 | print(mb) 29 | 30 | # read a file, splitting on newline characters 31 | mb <- microbenchmark( 32 | sourcetools::read_lines(file), 33 | base::readLines(file), 34 | readr::read_lines(file, 
progress = FALSE) 35 | ) 36 | print(mb) 37 | 38 | unlink(file) 39 | -------------------------------------------------------------------------------- /benchmark/benchmark-tokenizer.R: -------------------------------------------------------------------------------- 1 | library(microbenchmark) 2 | library(sourcetools) 3 | 4 | # Obviously not fair to compare R's parser to a tokenizer but it 5 | # helps establish a baseline for the tokenizer + how much 'wiggle 6 | # room' we have in our parser 7 | file <- "R/sourcetools.R" 8 | microbenchmark( 9 | tokenize_file(file), 10 | parse(file, keep.source = FALSE) 11 | ) 12 | 13 | contents <- read(file) 14 | mb <- microbenchmark( 15 | tokenize_string(contents), 16 | parse(text = contents, keep.source = FALSE) 17 | ) 18 | print(mb) 19 | -------------------------------------------------------------------------------- /configure.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | knitr::render_markdown(strict = FALSE) 3 | knitr::knit("README.Rmd", output = "README.md") 4 | tools:::package_native_routine_registration_skeleton(".", "src/sourcetools-init.c", character_only = FALSE) 5 | -------------------------------------------------------------------------------- /inst/include/sourcetools.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCE_TOOLS_H 2 | #define SOURCE_TOOLS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /inst/include/sourcetools/collection/Position.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_COLLECTION_POSITION_H 2 | #define SOURCETOOLS_COLLECTION_POSITION_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 
| namespace collections { 11 | 12 | struct Position 13 | { 14 | Position() 15 | : row(0), column(0) 16 | { 17 | } 18 | 19 | Position(index_type row, index_type column) 20 | : row(row), column(column) 21 | { 22 | } 23 | 24 | friend std::ostream& operator<<(std::ostream& os, 25 | const Position& position) 26 | { 27 | os << position.row << ":" << position.column; 28 | return os; 29 | } 30 | 31 | friend bool operator <(const Position& lhs, const Position& rhs) 32 | { 33 | return 34 | lhs.row < rhs.row || 35 | (lhs.row == rhs.row && lhs.column < rhs.column); 36 | } 37 | 38 | friend bool operator <=(const Position& lhs, const Position& rhs) 39 | { 40 | return 41 | lhs.row < rhs.row || 42 | (lhs.row == rhs.row && lhs.column <= rhs.column); 43 | } 44 | 45 | friend bool operator ==(const Position& lhs, const Position& rhs) 46 | { 47 | return 48 | lhs.row == rhs.row && 49 | lhs.column == rhs.column; 50 | } 51 | 52 | friend bool operator >(const Position& lhs, const Position& rhs) 53 | { 54 | return 55 | lhs.row > rhs.row || 56 | (lhs.row == rhs.row && lhs.column > rhs.column); 57 | } 58 | 59 | friend bool operator >=(const Position& lhs, const Position& rhs) 60 | { 61 | return 62 | lhs.row > rhs.row || 63 | (lhs.row == rhs.row && lhs.column >= rhs.column); 64 | } 65 | 66 | friend Position operator +(const Position& lhs, index_type rhs) 67 | { 68 | return Position(lhs.row, lhs.column + rhs); 69 | } 70 | 71 | index_type row; 72 | index_type column; 73 | 74 | }; 75 | 76 | } // namespace collections 77 | } // namespace sourcetools 78 | 79 | #endif /* SOURCETOOLS_COLLECTION_POSITION_H */ 80 | -------------------------------------------------------------------------------- /inst/include/sourcetools/collection/Range.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_COLLECTION_RANGE_H 2 | #define SOURCETOOLS_COLLECTION_RANGE_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace collections { 9 | 10 | 
// A span between two Positions, stored as an inclusive [start, end] pair;
// ordering/overlap semantics are whatever the caller imposes.
class Range
{
public:
  Range(const Position& start, const Position& end)
    : start_(start), end_(end)
  {
  }

  // Stream as "[start-end]".
  friend std::ostream& operator <<(std::ostream& os, const Range& range)
  {
    os << "[" << range.start() << "-" << range.end() << "]";
    return os;
  }

  // NOTE(review): accessors return by value; the top-level 'const' on the
  // return type has no effect (returning 'const Position&' would avoid
  // the copy).
  const Position start() const { return start_; }
  const Position end() const { return end_; }

private:
  Position start_;
  Position end_;
};
} // namespace collections
} // namespace sourcetools

#endif /* SOURCETOOLS_COLLECTION_RANGE_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/collection/collection.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COLLECTION_COLLECTION_H
#define SOURCETOOLS_COLLECTION_COLLECTION_H

// Convenience header for the collection module (Position + Range).
// NOTE(review): #include targets stripped in this snapshot.
#include
#include

#endif /* SOURCETOOLS_COLLECTION_COLLECTION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/completion/CodeCompletion.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COMPLETION_CODE_COMPLETION_H
#define SOURCETOOLS_COMPLETION_CODE_COMPLETION_H

// NOTE(review): #include targets stripped in this snapshot.
#include
#include

#include

namespace sourcetools {
namespace completion {

// Kind of a completion candidate; currently only a placeholder value.
enum CompletionType
{
  CompletionTypeUnknown
};

// A single completion candidate: the text to insert plus its kind.
// (No accessors yet -- the completion engine below is a stub.)
class Completion
{
public:
  Completion(const std::string& value, CompletionType type)
    : value_(value), type_(type)
  {
  }

private:
  std::string value_;
  CompletionType type_;
};

// Compute completion candidates for 'code' (length 'n') at 'position'.
// Currently a stub that always returns an empty collection; the TODO
// below sketches the intended pipeline.
// NOTE(review): the template argument of std::vector was stripped in
// this snapshot (presumably std::vector<Completion>) -- confirm upstream.
std::vector completions(const char* code,
                        index_type n,
                        const collections::Position& position)
{
  std::vector completions;

  // TODO:
  //
  // 1) produce parse tree
  // 2) get node at position (note: token
immediately before position?)
  // 3) figure out completion context type
  //    ('$', '@', file, identifier, special context, etc)
  // 4) dispatch to appropriate completer for context
  // 5) return completions

  return completions;
}

} // namespace completion
} // namespace sourcetools

#endif /* SOURCETOOLS_COMPLETION_CODE_COMPLETION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/completion/completion.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COMPLETION_COMPLETION_H
#define SOURCETOOLS_COMPLETION_COMPLETION_H

// Module header: currently just CodeCompletion.h.
// NOTE(review): #include target stripped in this snapshot.
#include

#endif /* SOURCETOOLS_COMPLETION_COMPLETION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/core/config.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_CONFIG_H
#define SOURCETOOLS_CORE_CONFIG_H

namespace sourcetools {

// Integer type used for all offsets, sizes and indices throughout the
// library; overridable at compile time by defining
// SOURCETOOLS_CONFIG_INDEX_TYPE before inclusion. Defaults to 'int'.
#ifndef SOURCETOOLS_CONFIG_INDEX_TYPE
# define SOURCETOOLS_CONFIG_INDEX_TYPE int
#endif

typedef SOURCETOOLS_CONFIG_INDEX_TYPE index_type;

} // namespace sourcetools

#endif /* SOURCETOOLS_CORE_CONFIG_H */

--------------------------------------------------------------------------------
/inst/include/sourcetools/core/core.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_CORE_H
#define SOURCETOOLS_CORE_CORE_H

// Module header for the core utilities (config, macros, util).
// NOTE(review): #include targets stripped in this snapshot.
#include
#include
#include

#endif /* SOURCETOOLS_CORE_CORE_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/core/macros.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_MACROS_H
#define SOURCETOOLS_CORE_MACROS_H
| 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | /* Utility */ 11 | #ifdef __GNUC__ 12 | # define LIKELY(x) __builtin_expect(!!(x), 1) 13 | # define UNLIKELY(x) __builtin_expect(!!(x), 0) 14 | #else 15 | # define LIKELY(x) x 16 | # define UNLIKELY(x) x 17 | #endif 18 | 19 | #define SOURCE_TOOLS_CHECK_MASK(__SELF__, __MASK__) \ 20 | ((__MASK__ & __SELF__) == __MASK__) 21 | 22 | #define SOURCE_TOOLS_LOWER_BITS(__VALUE__, __BITS__) \ 23 | (((1 << __BITS__) - 1) & __VALUE__) 24 | 25 | #define SOURCE_TOOLS_PASTE(__X__, __Y__) __X__ ## __Y__ 26 | #define SOURCE_TOOLS_STRINGIFY(__X__) #__X__ 27 | 28 | /* Logging */ 29 | namespace sourcetools { 30 | namespace debug { 31 | 32 | inline std::string shortFilePath(const std::string& filePath) 33 | { 34 | std::string::size_type index = filePath.find_last_of("/"); 35 | if (index != std::string::npos) 36 | return filePath.substr(index + 1); 37 | return filePath; 38 | } 39 | 40 | inline std::string debugPosition(const char* filePath, int line) 41 | { 42 | static const int N = 1024; 43 | char buffer[N]; 44 | std::string shortPath = shortFilePath(filePath); 45 | if (shortPath.size() > N / 2) 46 | shortPath = shortPath.substr(0, N / 2); 47 | std::snprintf(buffer, N, "[%s:%4i]", shortPath.c_str(), line); 48 | return buffer; 49 | } 50 | 51 | } // namespace debug 52 | } // namespace sourcetools 53 | 54 | // Flip on/off as necessary 55 | #ifdef SOURCETOOLS_ENABLE_DEBUG_LOGGING 56 | 57 | #include 58 | 59 | #define DEBUG(__X__) \ 60 | std::cerr << ::sourcetools::debug::debugPosition(__FILE__, __LINE__) \ 61 | << ": " << __X__ << ::std::endl; 62 | #define DEBUG_BLOCK(x) 63 | 64 | #else 65 | 66 | #define DEBUG(x) 67 | #define DEBUG_BLOCK(x) if (false) 68 | 69 | #endif 70 | 71 | #endif /* SOURCETOOLS_CORE_MACROS_H */ 72 | -------------------------------------------------------------------------------- /inst/include/sourcetools/core/util.h: -------------------------------------------------------------------------------- 1 | 
#ifndef SOURCETOOLS_CORE_UTIL_H
#define SOURCETOOLS_CORE_UTIL_H

// NOTE(review): #include targets stripped in this snapshot -- confirm upstream.
#include
#include
#include
#include

#include

namespace sourcetools {
namespace detail {

// Inherit (privately) to delete copy construction/assignment, C++98-style:
// the copy operations are declared private and never defined.
class noncopyable
{
protected:
  noncopyable() {}
  ~noncopyable() {}

private:
  noncopyable(const noncopyable&);
  noncopyable& operator=(const noncopyable&);
};

} // namespace detail
typedef detail::noncopyable noncopyable;

// Minimal single-owner smart pointer (deletes with 'delete').
// NOTE(review): the template parameter list (<typename T>) was stripped in
// this snapshot on both class templates below -- confirm upstream.
template
class scoped_ptr : noncopyable
{
public:
  explicit scoped_ptr(T* pData) : pData_(pData) {}
  T& operator*() const { return *pData_; }
  T* operator->() const { return pData_; }
  operator T*() const { return pData_; }
  ~scoped_ptr() { delete pData_; }
private:
  T* pData_;
};

// Array variant of scoped_ptr (deletes with 'delete[]').
template
class scoped_array : noncopyable
{
public:
  explicit scoped_array(T* pData) : pData_(pData) {}
  T& operator*() const { return *pData_; }
  T* operator->() const { return pData_; }
  operator T*() const { return pData_; }
  ~scoped_array() { delete[] pData_; }
private:
  T* pData_;
};

namespace utils {

// Character predicates used by the tokenizer. These are deliberately
// hand-rolled (rather than <cctype>) so behavior is locale-independent.
inline bool isWhitespace(char ch)
{
  return
    ch == ' ' ||
    ch == '\f' ||
    ch == '\r' ||
    ch == '\n' ||
    ch == '\t' ||
    ch == '\v';
}

// Count leading whitespace bytes of 'data' into *pBytes; returns true when
// at least one whitespace byte was consumed.
// NOTE(review): relies on 'data' being NUL-terminated (isWhitespace('\0')
// is false, which stops the scan). Template parameter list stripped in
// this snapshot.
template
inline bool countWhitespaceBytes(const char* data,
                                 T* pBytes)
{
  T bytes = 0;
  while (isWhitespace(*data)) {
    ++data;
    ++bytes;
  }

  *pBytes = bytes;
  return bytes != 0;
}

inline bool isDigit(char ch)
{
  return
    (ch >= '0' && ch <= '9');
}

inline bool isAlphabetic(char ch)
{
  return
    (ch >= 'a' && ch <= 'z') ||
    (ch >= 'A' && ch <= 'Z');
}

inline bool isAlphaNumeric(char ch)
{
  return
    (ch >= 'a' && ch <= 'z') ||
    (ch >= 'A' && ch <= 'Z') ||
    (ch >= '0' && ch <= '9');
}

inline bool isHexDigit(char ch)
{
  return
    (ch >= '0' && ch <= '9') ||
    (ch >= 'a' && ch <= 'f') ||
    (ch >= 'A' && ch <= 'F');
}

// R symbols may start with a letter, '.', or any non-ASCII byte
// (static_cast<signed char>(ch) < 0 catches multibyte UTF-8 lead/continuation
// bytes when 'char' is signed -- presumably the intent here).
inline bool isValidForStartOfRSymbol(char ch)
{
  return
    isAlphabetic(ch) ||
    ch == '.' ||
    static_cast(ch) < 0;
}

// Subsequent R symbol characters additionally allow digits and '_'.
inline bool isValidForRSymbol(char ch)
{
  return
    isAlphaNumeric(ch) ||
    ch == '.' ||
    ch == '_' ||
    static_cast(ch) < 0;
}

// Render control characters as printable escapes for diagnostics output.
inline std::string escape(char ch)
{
  switch (ch) {
  case '\r':
    return "\\r";
  case '\n':
    return "\\n";
  case '\t':
    return "\\t";
  default:
    return std::string(1, ch);
  }
}

// size() as index_type, avoiding signed/unsigned comparison warnings at
// call sites. Template parameter list stripped in this snapshot.
template
index_type size(const T& object)
{
  return static_cast(object.size());
}

} // namespace utils
} // namespace sourcetools

#endif /* SOURCETOOLS_CORE_UTIL_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/cursor/TextCursor.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CURSOR_TEXT_CURSOR_H
#define SOURCETOOLS_CURSOR_TEXT_CURSOR_H

// NOTE(review): #include targets stripped in this snapshot.
#include
#include

namespace sourcetools {
namespace cursors {

// Cursor over a raw character buffer that tracks its (row, column)
// position as it advances.
class TextCursor
{
public:

  TextCursor(const char* text, index_type n)
    : text_(text),
      n_(n),
      offset_(0),
      position_(0, 0)
  {
  }

  // Look ahead without moving; returns '\0' past the end of the buffer.
  char peek(index_type offset = 0) const
  {
    index_type index = offset_ + offset;
    if (UNLIKELY(index >= n_))
      return '\0';
    return text_[index];
  }

  // Advance 'times' characters, updating row/column ('\n' starts a new row).
  void advance(index_type times = 1)
  {
    for (index_type i = 0; i < times; ++i) {
      if (peek() == '\n') {
        ++position_.row;
        position_.column = 0;
      } else {
        ++position_.column;
      }
++offset_; 40 | } 41 | } 42 | 43 | operator const char*() const { return text_ + offset_; } 44 | 45 | index_type offset() const { return offset_; } 46 | 47 | const collections::Position& position() const { return position_; } 48 | index_type row() const { return position_.row; } 49 | index_type column() const { return position_.column; } 50 | 51 | const char* begin() const { return text_; } 52 | const char* end() const { return text_ + n_; } 53 | 54 | private: 55 | const char* text_; 56 | index_type n_; 57 | index_type offset_; 58 | collections::Position position_; 59 | }; 60 | 61 | } // namespace cursors 62 | } // namespace sourcetools 63 | 64 | #endif /* SOURCETOOLS_CURSOR_TEXT_CURSOR_H */ 65 | -------------------------------------------------------------------------------- /inst/include/sourcetools/cursor/TokenCursor.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_CURSOR_TOKEN_CURSOR_H 2 | #define SOURCETOOLS_CURSOR_TOKEN_CURSOR_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace sourcetools { 15 | namespace cursors { 16 | 17 | class TokenCursor { 18 | 19 | private: 20 | typedef collections::Position Position; 21 | typedef tokens::Token Token; 22 | 23 | public: 24 | 25 | TokenCursor(const std::vector& tokens) 26 | : tokens_(tokens), 27 | offset_(0), 28 | n_(tokens.size()), 29 | noSuchToken_(tokens::END) 30 | {} 31 | 32 | bool moveToNextToken() 33 | { 34 | if (UNLIKELY(offset_ >= n_ - 1)) 35 | return false; 36 | 37 | ++offset_; 38 | return true; 39 | } 40 | 41 | bool moveToNextSignificantToken() 42 | { 43 | if (!moveToNextToken()) 44 | return false; 45 | 46 | if (!fwdOverWhitespaceAndComments()) 47 | return false; 48 | 49 | return true; 50 | } 51 | 52 | bool moveToPreviousToken() 53 | { 54 | if (UNLIKELY(offset_ == 0)) 55 | return false; 56 | 57 | --offset_; 58 | return true; 59 | } 60 | 61 | bool moveToPreviousSignificantToken() 
62 | { 63 | if (!moveToPreviousToken()) 64 | return false; 65 | 66 | if (!bwdOverWhitespaceAndComments()) 67 | return false; 68 | 69 | return true; 70 | } 71 | 72 | const Token& peekFwd(index_type offset = 1) const 73 | { 74 | index_type index = offset_ + offset; 75 | if (UNLIKELY(index >= n_)) 76 | return noSuchToken_; 77 | 78 | return tokens_[index]; 79 | } 80 | 81 | const Token& peekBwd(index_type offset = 1) const 82 | { 83 | if (UNLIKELY(offset > offset_)) 84 | return noSuchToken_; 85 | 86 | index_type index = offset_ - offset; 87 | return tokens_[index]; 88 | } 89 | 90 | const Token& currentToken() const 91 | { 92 | if (UNLIKELY(offset_ >= n_)) 93 | return noSuchToken_; 94 | return tokens_[offset_]; 95 | } 96 | 97 | operator const Token&() const { return currentToken(); } 98 | 99 | bool fwdOverWhitespace() 100 | { 101 | while (isType(tokens::WHITESPACE)) 102 | if (!moveToNextToken()) 103 | return false; 104 | return true; 105 | } 106 | 107 | bool bwdOverWhitespace() 108 | { 109 | while (isType(tokens::WHITESPACE)) 110 | if (!moveToPreviousToken()) 111 | return false; 112 | return true; 113 | } 114 | 115 | bool fwdOverComments() 116 | { 117 | while (isType(tokens::COMMENT)) 118 | if (!moveToNextToken()) 119 | return false; 120 | return true; 121 | } 122 | 123 | bool bwdOverComments() 124 | { 125 | while (isType(tokens::COMMENT)) 126 | if (!moveToPreviousToken()) 127 | return false; 128 | return true; 129 | } 130 | 131 | bool fwdOverWhitespaceAndComments() 132 | { 133 | while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE)) 134 | if (!moveToNextToken()) 135 | return false; 136 | return true; 137 | } 138 | 139 | bool bwdOverWhitespaceAndComments() 140 | { 141 | while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE)) 142 | if (!moveToPreviousToken()) 143 | return false; 144 | return true; 145 | } 146 | 147 | const Token& nextSignificantToken(index_type times = 1) const 148 | { 149 | TokenCursor clone(*this); 150 | for (index_type i = 0; i < times; 
++i) 151 | clone.moveToNextSignificantToken(); 152 | return clone; 153 | } 154 | 155 | const Token& previousSignificantToken(index_type times = 1) const 156 | { 157 | TokenCursor clone(*this); 158 | for (index_type i = 0; i < times; ++i) 159 | clone.moveToPreviousSignificantToken(); 160 | return clone; 161 | } 162 | 163 | bool moveToPosition(index_type row, index_type column) 164 | { 165 | return moveToPosition(Position(row, column)); 166 | } 167 | 168 | bool moveToPosition(const Position& target) 169 | { 170 | if (UNLIKELY(n_ == 0)) 171 | return false; 172 | 173 | if (UNLIKELY(tokens_[n_ - 1].position() <= target)) 174 | { 175 | offset_ = n_ - 1; 176 | return true; 177 | } 178 | 179 | index_type start = 0; 180 | index_type end = n_; 181 | 182 | index_type offset = 0; 183 | while (true) 184 | { 185 | offset = (start + end) / 2; 186 | const Position& current = tokens_[offset].position(); 187 | 188 | if (current == target || start == end) 189 | break; 190 | else if (current < target) 191 | start = offset + 1; 192 | else 193 | end = offset - 1; 194 | } 195 | 196 | offset_ = offset; 197 | return true; 198 | } 199 | 200 | template 201 | bool findFwd(F f) 202 | { 203 | do { 204 | if (f(this)) 205 | return true; 206 | } while (moveToNextToken()); 207 | 208 | return false; 209 | } 210 | 211 | template 212 | bool findBwd(F f) 213 | { 214 | do { 215 | if (f(this)) 216 | return true; 217 | } while (moveToPreviousToken()); 218 | 219 | return false; 220 | } 221 | 222 | bool findFwd(const char* contents) 223 | { 224 | return findFwd(std::string(contents, std::strlen(contents))); 225 | } 226 | 227 | bool findFwd(const std::string& contents) 228 | { 229 | do { 230 | if (currentToken().contentsEqual(contents)) 231 | return true; 232 | } while (moveToNextToken()); 233 | 234 | return false; 235 | } 236 | 237 | bool findBwd(const char* contents) 238 | { 239 | return findBwd(std::string(contents, std::strlen(contents))); 240 | } 241 | 242 | bool findBwd(const std::string& contents) 243 
| { 244 | do { 245 | if (currentToken().contentsEqual(contents)) 246 | return true; 247 | } while (moveToPreviousToken()); 248 | 249 | return false; 250 | } 251 | 252 | bool fwdToMatchingBracket() 253 | { 254 | using namespace tokens; 255 | if (!isLeftBracket(currentToken())) 256 | return false; 257 | 258 | TokenType lhs = currentToken().type(); 259 | TokenType rhs = complement(lhs); 260 | index_type balance = 1; 261 | 262 | while (moveToNextSignificantToken()) 263 | { 264 | TokenType type = currentToken().type(); 265 | balance += type == lhs; 266 | balance -= type == rhs; 267 | if (balance == 0) return true; 268 | } 269 | 270 | return false; 271 | } 272 | 273 | bool bwdToMatchingBracket() 274 | { 275 | using namespace tokens; 276 | if (!isRightBracket(currentToken())) 277 | return false; 278 | 279 | TokenType lhs = currentToken().type(); 280 | TokenType rhs = complement(lhs); 281 | index_type balance = 1; 282 | 283 | while (moveToPreviousSignificantToken()) 284 | { 285 | TokenType type = currentToken().type(); 286 | balance += type == lhs; 287 | balance -= type == rhs; 288 | if (balance == 0) return true; 289 | } 290 | 291 | return false; 292 | } 293 | 294 | friend std::ostream& operator<<(std::ostream& os, const TokenCursor& cursor) 295 | { 296 | return os << toString(cursor.currentToken()); 297 | } 298 | 299 | tokens::TokenType type() const { return currentToken().type(); } 300 | bool isType(tokens::TokenType type) const { return currentToken().isType(type); } 301 | collections::Position position() const { return currentToken().position(); } 302 | index_type offset() const { return offset_; } 303 | index_type row() const { return currentToken().row(); } 304 | index_type column() const { return currentToken().column(); } 305 | 306 | 307 | private: 308 | 309 | const std::vector& tokens_; 310 | index_type offset_; 311 | index_type n_; 312 | Token noSuchToken_; 313 | 314 | }; 315 | 316 | } // namespace cursors 317 | 318 | inline std::string toString(const 
cursors::TokenCursor& cursor) 319 | { 320 | return toString(cursor.currentToken()); 321 | } 322 | 323 | } // namespace sourcetools 324 | 325 | #endif /* SOURCETOOLS_CURSOR_TOKEN_CURSOR_H */ 326 | -------------------------------------------------------------------------------- /inst/include/sourcetools/cursor/cursor.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_CURSOR_CURSOR_H 2 | #define SOURCETOOLS_CURSOR_CURSOR_H 3 | 4 | #include 5 | #include 6 | 7 | #endif /* SOURCETOOLS_CURSOR_CURSOR_H */ 8 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/Checkers.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_CHECKERS_H 2 | #define SOURCETOOLS_DIAGNOSTICS_CHECKERS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace diagnostics { 14 | namespace checkers { 15 | 16 | class CheckerBase 17 | { 18 | public: 19 | typedef tokens::Token Token; 20 | typedef tokens::TokenType TokenType; 21 | typedef parser::ParseNode ParseNode; 22 | 23 | virtual void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) = 0; 24 | virtual ~CheckerBase() {} 25 | }; 26 | 27 | /** 28 | * Warn about code of the form: 29 | * 30 | * x == NULL 31 | * 32 | * The user likely intended to check if a value was NULL, 33 | * and in such a case should use `is.null()` instead. 
34 | */ 35 | class ComparisonWithNullChecker : public CheckerBase 36 | { 37 | public: 38 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 39 | { 40 | const Token& token = pNode->token(); 41 | bool isEquals = 42 | token.isType(tokens::OPERATOR_EQUAL) || 43 | token.isType(tokens::OPERATOR_NOT_EQUAL); 44 | 45 | if (!isEquals) 46 | return; 47 | 48 | if (pNode->children().size() != 2) 49 | return; 50 | 51 | ParseNode* pLhs = pNode->children()[0]; 52 | ParseNode* pRhs = pNode->children()[1]; 53 | 54 | if (pLhs->token().isType(tokens::KEYWORD_NULL) || 55 | pRhs->token().isType(tokens::KEYWORD_NULL)) 56 | { 57 | pDiagnostics->addWarning( 58 | "Use 'is.null()' to check if an object is NULL", 59 | pNode->range()); 60 | } 61 | } 62 | }; 63 | 64 | /** 65 | * Warn about code of the form: 66 | * 67 | * if (x = 1) { ... } 68 | * 69 | * The user likely intended to write 'if (x == 1)'. 70 | */ 71 | class AssignmentInIfChecker : public CheckerBase 72 | { 73 | public: 74 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 75 | { 76 | if (!pNode->token().isType(tokens::KEYWORD_IF)) 77 | return; 78 | 79 | if (pNode->children().size() < 1) 80 | return; 81 | 82 | ParseNode* pCondition = pNode->children()[0]; 83 | if (!pCondition->token().isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 84 | return; 85 | 86 | pDiagnostics->addWarning( 87 | "Using '=' for assignment in 'if' condition", 88 | pCondition->range()); 89 | 90 | } 91 | }; 92 | 93 | /** 94 | * Warn about vectorized '&' or '|' used in 95 | * 'if' statements. The scalar forms, '&&' and '||', 96 | * are likely preferred. 
97 | */ 98 | class ScalarOpsInIfChecker : public CheckerBase 99 | { 100 | public: 101 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 102 | { 103 | if (!pNode->token().isType(tokens::KEYWORD_IF)) 104 | return; 105 | 106 | if (pNode->children().size() < 1) 107 | return; 108 | 109 | ParseNode* pCondition = pNode->children()[0]; 110 | const Token& token = pCondition->token(); 111 | if (token.isType(tokens::OPERATOR_AND_VECTOR)) 112 | { 113 | pDiagnostics->addInfo( 114 | "Prefer '&&' to '&' in 'if' statement condition", 115 | pCondition->range()); 116 | } 117 | else if (token.isType(tokens::OPERATOR_OR_VECTOR)) 118 | { 119 | pDiagnostics->addInfo( 120 | "Prefer '||' to '|' in 'if' statement condition", 121 | pCondition->range()); 122 | } 123 | } 124 | }; 125 | 126 | /** 127 | * Warn about unused computations, e.g. 128 | * 129 | * foo <- function(x) { 130 | * x < 1 131 | * print(x) 132 | * } 133 | * 134 | * For example, in the above code, it's possible that the user 135 | * intended to assign 1 to x, or use that result elsewhere. 136 | * 137 | * Don't warn if the expression shows up as the last statement 138 | * within a parent function's body. 
139 | */ 140 | class UnusedResultChecker : public CheckerBase 141 | { 142 | public: 143 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 144 | { 145 | if (pNode->parent() == NULL) 146 | return; 147 | 148 | const Token& parentToken = pNode->parent()->token(); 149 | bool isTopLevelContext = 150 | parentToken.isType(tokens::ROOT) || 151 | parentToken.isType(tokens::LBRACE); 152 | 153 | if (!isTopLevelContext) 154 | return; 155 | 156 | if (parentToken.isType(tokens::LBRACE)) 157 | { 158 | const std::vector& siblings = pNode->parent()->children(); 159 | if (pNode == siblings[siblings.size() - 1]) 160 | return; 161 | } 162 | 163 | const Token& token = pNode->token(); 164 | if (!tokens::isOperator(token)) 165 | return; 166 | 167 | if (tokens::isAssignmentOperator(token)) 168 | return; 169 | 170 | 171 | pDiagnostics->addInfo( 172 | "result of computation is not used", 173 | pNode->range()); 174 | } 175 | }; 176 | 177 | class NoSymbolInScopeChecker : public CheckerBase 178 | { 179 | public: 180 | 181 | NoSymbolInScopeChecker() 182 | { 183 | stack_.push_back(Context(0)); 184 | objects_ = r::objectsOnSearchPath(); 185 | } 186 | 187 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 188 | { 189 | using namespace tokens; 190 | const Token& token = pNode->token(); 191 | 192 | // If we've left the last active scope, pop. 193 | if (depth < current().depth()) 194 | pop(); 195 | 196 | // Assignments update the current scope. 197 | if (token.isType(OPERATOR_ASSIGN_LEFT) || 198 | token.isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 199 | { 200 | const ParseNode* pChild = pNode->children()[0]; 201 | const Token& symbol = pChild->token(); 202 | if (symbol.isType(SYMBOL) || symbol.isType(STRING)) 203 | add(symbol); 204 | } 205 | 206 | // Check if a symbol has a definition in scope. 
207 | if (token.isType(SYMBOL)) 208 | check(token, pDiagnostics); 209 | 210 | // If we encounter a function definition, create a new scope 211 | // and make the function argument names present in that scope. 212 | if (token.isType(KEYWORD_FUNCTION)) 213 | push(pNode, depth); 214 | } 215 | 216 | private: 217 | 218 | class Context 219 | { 220 | public: 221 | explicit Context(index_type depth) 222 | : depth_(depth) 223 | { 224 | } 225 | 226 | void add(const Token& token) 227 | { 228 | values_.insert(token.contents()); 229 | } 230 | 231 | bool contains(const std::string& contents) const 232 | { 233 | return values_.count(contents); 234 | } 235 | 236 | index_type depth() const 237 | { 238 | return depth_; 239 | } 240 | 241 | private: 242 | std::set values_; 243 | index_type depth_; 244 | }; 245 | 246 | Context& current() 247 | { 248 | return stack_[stack_.size() - 1]; 249 | } 250 | 251 | void push(const ParseNode* pNode, index_type depth) 252 | { 253 | stack_.push_back(Context(depth)); 254 | 255 | ParseNode* pFormals = pNode->children()[0]; 256 | const std::vector& children = pFormals->children(); 257 | for (std::vector::const_iterator it = children.begin(); 258 | it != children.end(); 259 | ++it) 260 | { 261 | const Token& token = (*it)->token(); 262 | if (token.isType(tokens::SYMBOL)) 263 | add(token); 264 | else if (token.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 265 | { 266 | const Token& lhs = (*it)->children()[0]->token(); 267 | if (lhs.isType(tokens::SYMBOL)) 268 | add(lhs); 269 | } 270 | } 271 | } 272 | 273 | void pop() 274 | { 275 | stack_.pop_back(); 276 | } 277 | 278 | void add(const Token& token) 279 | { 280 | current().add(token); 281 | } 282 | 283 | void check(const Token& token, Diagnostics* pDiagnostics) 284 | { 285 | if (!token.isType(tokens::SYMBOL)) 286 | return; 287 | 288 | std::string contents = token.contents(); 289 | for (std::vector::const_iterator it = stack_.begin(); 290 | it != stack_.end(); 291 | ++it) 292 | { 293 | if 
(it->contains(contents)) 294 | { 295 | return; 296 | } 297 | } 298 | 299 | if (objects_.count(token.contents())) 300 | return; 301 | 302 | collections::Range range(token.position(), token.position() + token.size()); 303 | pDiagnostics->addWarning( 304 | "use of undefined symbol '" + token.contents() + "'", 305 | range); 306 | } 307 | 308 | std::vector stack_; 309 | std::set objects_; 310 | 311 | }; 312 | 313 | } // namespace checkers 314 | } // namespace diagnostics 315 | } // namespace sourcetools 316 | 317 | #endif /* SOURCETOOLS_DIAGNOSTICS_CHECKERS_H */ 318 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/Diagnostic.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace diagnostics { 12 | 13 | enum DiagnosticType 14 | { 15 | DIAGNOSTIC_ERROR, 16 | DIAGNOSTIC_WARNING, 17 | DIAGNOSTIC_INFO, 18 | DIAGNOSTIC_STYLE 19 | }; 20 | 21 | class Diagnostic 22 | { 23 | public: 24 | Diagnostic(DiagnosticType type, 25 | const std::string& message, 26 | const collections::Range& range) 27 | : type_(type), message_(message), range_(range) 28 | { 29 | } 30 | 31 | const std::string message() const { return message_; } 32 | DiagnosticType type() const { return type_; } 33 | collections::Range range() const { return range_; } 34 | collections::Position start() const { return range_.start(); } 35 | collections::Position end() const { return range_.end(); } 36 | 37 | private: 38 | DiagnosticType type_; 39 | std::string message_; 40 | collections::Range range_; 41 | }; 42 | 43 | class Diagnostics 44 | { 45 | typedef collections::Range Range; 46 | 47 | public: 48 | 49 | void add(DiagnosticType type, const std::string& message, const Range& range) 50 | { 51 | 
diagnostics_.push_back(Diagnostic(type, message, range)); 52 | } 53 | 54 | void addError(const std::string& message, const Range& range) 55 | { 56 | add(DIAGNOSTIC_ERROR, message, range); 57 | } 58 | 59 | void addWarning(const std::string& message, const Range& range) 60 | { 61 | add(DIAGNOSTIC_WARNING, message, range); 62 | } 63 | 64 | void addInfo(const std::string& message, const Range& range) 65 | { 66 | add(DIAGNOSTIC_INFO, message, range); 67 | } 68 | 69 | operator const std::vector&() const { return diagnostics_; } 70 | 71 | private: 72 | std::vector diagnostics_; 73 | }; 74 | 75 | } // namespace diagnostics 76 | 77 | namespace r { 78 | 79 | inline SEXP create(diagnostics::DiagnosticType type) 80 | { 81 | using namespace diagnostics; 82 | 83 | switch (type) 84 | { 85 | case DIAGNOSTIC_ERROR: return Rf_mkString("error"); 86 | case DIAGNOSTIC_WARNING: return Rf_mkString("warning"); 87 | case DIAGNOSTIC_INFO: return Rf_mkString("info"); 88 | case DIAGNOSTIC_STYLE: return Rf_mkString("style"); 89 | } 90 | 91 | // happy compiler 92 | return Rf_mkString("error"); 93 | } 94 | 95 | inline SEXP create(const diagnostics::Diagnostic& diagnostic) 96 | { 97 | using namespace diagnostics; 98 | 99 | ListBuilder builder; 100 | 101 | builder.add("type", create(diagnostic.type())); 102 | builder.add("file", Rf_mkString("")); 103 | builder.add("line", Rf_ScalarInteger(diagnostic.start().row)); 104 | builder.add("column", Rf_ScalarInteger(diagnostic.start().column)); 105 | builder.add("message", r::createString(diagnostic.message())); 106 | 107 | return builder; 108 | } 109 | 110 | inline SEXP create(const std::vector& diagnostics) 111 | { 112 | using namespace diagnostics; 113 | 114 | Protect protect; 115 | index_type n = diagnostics.size(); 116 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n)); 117 | for (index_type i = 0; i < n; ++i) 118 | SET_VECTOR_ELT(resultSEXP, i, create(diagnostics[i])); 119 | return resultSEXP; 120 | } 121 | 122 | } // namespace r 123 | 124 | } 
// namespace sourcetools 125 | 126 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H */ 127 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/DiagnosticsSet.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace diagnostics { 9 | 10 | class DiagnosticsSet 11 | { 12 | typedef std::vector Checkers; 13 | typedef checkers::CheckerBase CheckerBase; 14 | typedef parser::ParseNode ParseNode; 15 | 16 | public: 17 | 18 | void add(CheckerBase* pChecker) 19 | { 20 | checkers_.push_back(pChecker); 21 | } 22 | 23 | const std::vector& run(const ParseNode* pNode) 24 | { 25 | runImpl(pNode); 26 | return diagnostics_; 27 | } 28 | 29 | void report() 30 | { 31 | const std::vector& diagnostics = diagnostics_; 32 | for (index_type i = 0; i < utils::size(diagnostics); ++i) 33 | { 34 | Diagnostic diagnostic = diagnostics[i]; 35 | std::cerr << diagnostic.range() << ": " 36 | << diagnostic.message() 37 | << std::endl; 38 | } 39 | } 40 | 41 | ~DiagnosticsSet() 42 | { 43 | for (Checkers::const_iterator it = checkers_.begin(); 44 | it != checkers_.end(); 45 | ++it) 46 | { 47 | delete *it; 48 | } 49 | } 50 | 51 | private: 52 | void runImpl(const ParseNode* pNode, index_type depth = 0) 53 | { 54 | for (Checkers::iterator it = checkers_.begin(); 55 | it != checkers_.end(); 56 | ++it) 57 | { 58 | (*it)->apply(pNode, &diagnostics_, depth); 59 | } 60 | 61 | for (std::vector::const_iterator it = pNode->children().begin(); 62 | it != pNode->children().end(); 63 | ++it) 64 | { 65 | runImpl(*it, depth + 1); 66 | } 67 | } 68 | 69 | 70 | private: 71 | Checkers checkers_; 72 | Diagnostics diagnostics_; 73 | }; 74 | 75 | inline DiagnosticsSet* createDefaultDiagnosticsSet() 76 | { 77 | DiagnosticsSet* pSet = new DiagnosticsSet(); 
78 | pSet->add(new checkers::AssignmentInIfChecker); 79 | pSet->add(new checkers::ComparisonWithNullChecker); 80 | pSet->add(new checkers::ScalarOpsInIfChecker); 81 | pSet->add(new checkers::UnusedResultChecker); 82 | pSet->add(new checkers::NoSymbolInScopeChecker); 83 | return pSet; 84 | } 85 | 86 | } // namespace diagnostics 87 | } // namespace sourcetools 88 | 89 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H */ 90 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/diagnostics.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/multibyte/multibyte.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_MULTIBYTE_MULTIBYTE_H 2 | #define SOURCETOOLS_MULTIBYTE_MULTIBYTE_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace multibyte { 9 | 10 | template 11 | inline bool countWhitespaceBytes(const char* data, 12 | T* pBytes) 13 | { 14 | wchar_t ch; 15 | T bytes = 0; 16 | const char* it = data; 17 | 18 | while (true) { 19 | 20 | int status = std::mbtowc(&ch, it, MB_CUR_MAX); 21 | if (status == 0) { 22 | break; 23 | } else if (status == -1) { 24 | break; 25 | } 26 | 27 | if (!std::iswspace(ch)) 28 | break; 29 | 30 | bytes += status; 31 | it += status; 32 | } 33 | 34 | *pBytes = bytes; 35 | return bytes != 0; 36 | } 37 | 38 | } // namespace multibyte 39 | } // namespace sourcetools 40 | 41 | #endif /* SOURCETOOLS_MULTIBYTE_MULTIBYTE_H */ 42 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseError.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_ERROR_H 2 | #define SOURCETOOLS_PARSE_PARSE_ERROR_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace parser { 9 | 10 | class ParseError 11 | { 12 | typedef collections::Position Position; 13 | typedef tokens::Token Token; 14 | 15 | Position start_; 16 | Position end_; 17 | std::string message_; 18 | 19 | public: 20 | 21 | ParseError(const tokens::Token& token, 22 | const std::string& message) 23 | : start_(token.position()), 24 | end_(token.position()), 25 | message_(message) 26 | { 27 | end_.column += token.end() - token.begin(); 28 | } 29 | 30 | ParseError(const Position& start, 31 | const Position& end, 32 | const std::string& message) 33 | : start_(start), 34 | end_(end), 35 | message_(message) 36 | { 37 | } 38 | 39 | explicit ParseError(const std::string& message) 40 | : start_(0, 0), 41 | end_(0, 0), 42 | message_(message) 43 | { 44 | } 45 | 46 | const Position& start() const { return start_; } 47 | const Position& end() const { return end_; } 48 | const std::string& message() const { return message_; } 49 | }; 50 | 51 | } // namespace parser 52 | } // namespace sourcetools 53 | 54 | #endif /* SOURCETOOLS_PARSE_PARSE_ERROR_H */ 55 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseNode.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_NODE_H 2 | #define SOURCETOOLS_PARSE_PARSE_NODE_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace parser { 12 | 13 | class ParseNode 14 | { 15 | public: 16 | typedef collections::Position Position; 17 | typedef collections::Range Range; 18 | typedef tokens::Token Token; 19 | typedef tokens::TokenType TokenType; 20 | 21 | private: 22 | Token token_; 23 | ParseNode* parent_; 24 | std::vector 
children_; 25 | 26 | Token begin_; 27 | Token end_; 28 | 29 | public: 30 | 31 | explicit ParseNode(const Token& token) 32 | : token_(token), parent_(NULL), 33 | begin_(token), end_(token) 34 | { 35 | } 36 | 37 | static ParseNode* create(const Token& token) 38 | { 39 | return new ParseNode(token); 40 | } 41 | 42 | static ParseNode* create(const TokenType& type) 43 | { 44 | static std::map tokens; 45 | if (!tokens.count(type)) 46 | tokens[type] = Token(type); 47 | 48 | const Token& token = tokens[type]; 49 | return new ParseNode(token); 50 | } 51 | 52 | ~ParseNode() 53 | { 54 | for (std::vector::const_iterator it = children_.begin(); 55 | it != children_.end(); 56 | ++it) 57 | { 58 | delete *it; 59 | } 60 | } 61 | 62 | void remove(const ParseNode* pNode) 63 | { 64 | children_.erase( 65 | std::remove(children_.begin(), children_.end(), pNode), 66 | children_.end()); 67 | } 68 | 69 | void add(ParseNode* pNode) 70 | { 71 | if (pNode->parent_ != NULL) 72 | pNode->parent_->remove(pNode); 73 | pNode->parent_ = this; 74 | 75 | const Token& begin = pNode->begin(); 76 | const Token& end = pNode->end(); 77 | if (begin.offset() != -1 && end.offset() != -1) 78 | { 79 | for (ParseNode* pParent = this; pParent != NULL; pParent = pParent->parent_) 80 | { 81 | if (begin.begin() < pParent->begin().begin()) 82 | pParent->setBegin(begin); 83 | if (end.end() > pParent->end().end()) 84 | pParent->setEnd(end); 85 | } 86 | } 87 | 88 | children_.push_back(pNode); 89 | } 90 | 91 | const Token& begin() const 92 | { 93 | return begin_; 94 | } 95 | 96 | void setBegin(const Token& begin) 97 | { 98 | for (ParseNode* pNode = this; pNode != NULL; pNode = pNode->parent_) 99 | if (begin.begin() < pNode->begin().begin()) 100 | pNode->begin_ = begin; 101 | } 102 | 103 | const Token& end() const 104 | { 105 | return end_; 106 | } 107 | 108 | void setEnd(const Token& end) 109 | { 110 | end_ = end; 111 | for (ParseNode* pNode = this; pNode != NULL; pNode = pNode->parent_) 112 | if (end.end() > 
pNode->end().end()) 113 | pNode->end_ = end; 114 | } 115 | 116 | void bounds(const char** begin, const char** end) 117 | { 118 | *begin = begin_.begin(); 119 | *end = end_.end(); 120 | } 121 | 122 | Range range() const 123 | { 124 | return Range(begin_.position(), end_.position() + end_.size()); 125 | } 126 | 127 | const Token& token() const { return token_; } 128 | const ParseNode* parent() const { return parent_; } 129 | const std::vector& children() const { return children_; } 130 | }; 131 | 132 | } // namespace parser 133 | } // namespace sourcetools 134 | 135 | #endif /* SOURCETOOLS_PARSE_PARSE_NODE_H */ 136 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseStatus.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_STATUS_H 2 | #define SOURCETOOLS_PARSE_PARSE_STATUS_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace parser { 10 | 11 | class ParseNode; 12 | 13 | class ParseStatus 14 | { 15 | typedef collections::Position Position; 16 | 17 | public: 18 | ParseStatus() {} 19 | 20 | void recordNodeLocation(const Position& position, 21 | ParseNode* pNode) 22 | { 23 | map_[position] = pNode; 24 | } 25 | 26 | ParseNode* getNodeAtPosition(const Position& position) 27 | { 28 | return map_[position]; 29 | } 30 | 31 | void addError(const ParseError& error) 32 | { 33 | errors_.push_back(error); 34 | } 35 | 36 | const std::vector& getErrors() const 37 | { 38 | return errors_; 39 | } 40 | 41 | private: 42 | std::map map_; 43 | std::vector errors_; 44 | }; 45 | } // namespace parser 46 | } // namespace sourcetools 47 | 48 | #endif /* SOURCETOOLS_PARSE_PARSE_STATUS_H */ 49 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/Parser.h: -------------------------------------------------------------------------------- 1 | #ifndef 
SOURCETOOLS_PARSE_PARSER_H 2 | #define SOURCETOOLS_PARSE_PARSER_H 3 | #define SOURCE_TOOLS_PARSE_PARSER_H 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Defines that will go away once the parser is more tested / game ready 17 | // #define SOURCE_TOOLS_DEBUG_PARSER_TRACE 18 | // #define SOURCE_TOOLS_DEBUG_PARSER_PRINT_TOKEN_INFO 19 | // #define SOURCE_TOOLS_DEBUG_PARSER_STACK_OVERFLOW 20 | 21 | #ifdef SOURCE_TOOLS_DEBUG_PARSER_TRACE 22 | # define SOURCE_TOOLS_DEBUG_PARSER_LOG(__X__) std::cerr << __X__ << std::endl 23 | #else 24 | # define SOURCE_TOOLS_DEBUG_PARSER_LOG(__X__) 25 | #endif 26 | 27 | #ifdef SOURCE_TOOLS_DEBUG_PARSER_PRINT_TOKEN_INFO 28 | 29 | # define SOURCE_TOOLS_DEBUG_TOKEN(__TOKEN__) \ 30 | do \ 31 | { \ 32 | std::cout << __TOKEN__ << std::endl; \ 33 | } while (0) 34 | 35 | #else 36 | 37 | # define SOURCE_TOOLS_DEBUG_TOKEN(__TOKEN__) \ 38 | do \ 39 | { \ 40 | } while (0) \ 41 | 42 | #endif 43 | 44 | namespace sourcetools { 45 | namespace parser { 46 | 47 | class Parser 48 | { 49 | typedef tokenizer::Tokenizer Tokenizer; 50 | typedef tokens::Token Token; 51 | typedef tokens::TokenType TokenType; 52 | typedef collections::Position Position; 53 | 54 | enum ParseState 55 | { 56 | PARSE_STATE_TOP_LEVEL, 57 | PARSE_STATE_BRACE, 58 | PARSE_STATE_PAREN 59 | }; 60 | 61 | Tokenizer tokenizer_; 62 | Token token_; 63 | Token previous_; 64 | ParseState state_; 65 | ParseStatus* pStatus_; 66 | 67 | public: 68 | explicit Parser(const std::string& code) 69 | : tokenizer_(code.c_str(), code.size()), 70 | state_(PARSE_STATE_TOP_LEVEL) 71 | { 72 | advance(); 73 | } 74 | 75 | explicit Parser(const char* code, index_type n) 76 | : tokenizer_(code, n), 77 | state_(PARSE_STATE_TOP_LEVEL) 78 | { 79 | advance(); 80 | } 81 | 82 | private: 83 | 84 | // Error-related ---- 85 | 86 | void unexpectedEndOfInput() 87 | { 88 | ParseError error("unexpected end of input"); 89 | 
pStatus_->addError(error); 90 | } 91 | 92 | std::string unexpectedTokenString(const Token& token) 93 | { 94 | return std::string() + 95 | "unexpected token '" + token.contents() + "'"; 96 | } 97 | 98 | std::string unexpectedTokenString(const Token& token, 99 | TokenType expectedType) 100 | { 101 | return unexpectedTokenString(token) + 102 | "; expected type '" + toString(expectedType) + "'"; 103 | } 104 | 105 | void unexpectedToken(const Token& token) 106 | { 107 | unexpectedToken(token, unexpectedTokenString(token)); 108 | } 109 | 110 | void unexpectedToken(const Token& token, 111 | TokenType type) 112 | { 113 | unexpectedToken(token, unexpectedTokenString(token, type)); 114 | } 115 | 116 | void unexpectedToken(const Token& token, 117 | const std::string& message) 118 | { 119 | ParseError error(token, message); 120 | pStatus_->addError(error); 121 | } 122 | 123 | bool checkUnexpectedEnd(const Token& token) 124 | { 125 | if (UNLIKELY(token.isType(tokens::END))) 126 | { 127 | ParseError error(token, "unexpected end of input"); 128 | pStatus_->addError(error); 129 | return true; 130 | } 131 | 132 | return false; 133 | } 134 | 135 | // Parser sub-routines ---- 136 | 137 | ParseNode* parseFunctionArgumentListOne() 138 | { 139 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionArgument()"); 140 | using namespace tokens; 141 | 142 | check(SYMBOL); 143 | 144 | Token lookahead = peek(1); 145 | if (lookahead.isType(COMMA) || lookahead.isType(RPAREN)) 146 | return ParseNode::create(consume()); 147 | else if (lookahead.isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 148 | return parseExpression(); 149 | 150 | if (isOperator(lookahead)) 151 | unexpectedToken(lookahead, "expected '=', ',' or ')' following argument name"); 152 | 153 | return parseExpression(); 154 | } 155 | 156 | ParseNode* parseFunctionArgumentList() 157 | { 158 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionArgumentList()"); 159 | using namespace tokens; 160 | 161 | ParseNode* pNode = createNode(EMPTY); 162 | if 
(token_.isType(RPAREN)) 163 | return pNode; 164 | 165 | while (true) 166 | { 167 | if (checkUnexpectedEnd(current())) 168 | break; 169 | 170 | pNode->add(parseFunctionArgumentListOne()); 171 | if (current().isType(RPAREN)) 172 | return pNode; 173 | else if (current().isType(COMMA)) 174 | { 175 | advance(); 176 | continue; 177 | } 178 | 179 | // TODO: how should we recover here? For now, we 180 | // assume that there should have been a comma and 181 | // continue parsing. 182 | unexpectedToken(current(), "expected ',' or ')'"); 183 | continue; 184 | } 185 | 186 | return pNode; 187 | } 188 | 189 | ParseNode* parseFunctionDefinition() 190 | { 191 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionDefinition()"); 192 | using namespace tokens; 193 | ParseNode* pNode = createNode(current()); 194 | checkAndAdvance(KEYWORD_FUNCTION); 195 | checkAndAdvance(LPAREN, false); 196 | ParseState state = state_; 197 | state_ = PARSE_STATE_PAREN; 198 | pNode->add(parseFunctionArgumentList()); 199 | state_ = state; 200 | checkAndAdvance(RPAREN, false); 201 | pNode->add(parseNonEmptyExpression()); 202 | return pNode; 203 | } 204 | 205 | ParseNode* parseFor() 206 | { 207 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFor()"); 208 | using namespace tokens; 209 | ParseNode* pNode = createNode(current()); 210 | checkAndAdvance(KEYWORD_FOR); 211 | checkAndAdvance(LPAREN, false); 212 | ParseState state = state_; 213 | state_ = PARSE_STATE_PAREN; 214 | check(SYMBOL); 215 | pNode->add(createNode(consume())); 216 | checkAndAdvance(KEYWORD_IN, false); 217 | pNode->add(parseNonEmptyExpression()); 218 | state_ = state; 219 | checkAndAdvance(RPAREN, false); 220 | pNode->add(parseNonEmptyExpression()); 221 | return pNode; 222 | } 223 | 224 | ParseNode* parseIf() 225 | { 226 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseIf()"); 227 | using namespace tokens; 228 | ParseNode* pNode = createNode(current()); 229 | checkAndAdvance(KEYWORD_IF); 230 | checkAndAdvance(LPAREN, false); 231 | ParseState state = state_; 232 | state_ 
= PARSE_STATE_PAREN; 233 | pNode->add(parseNonEmptyExpression()); 234 | state_ = state; 235 | checkAndAdvance(RPAREN, false); 236 | pNode->add(parseNonEmptyExpression()); 237 | if (current().isType(KEYWORD_ELSE)) 238 | { 239 | advance(); 240 | pNode->add(parseNonEmptyExpression()); 241 | } 242 | return pNode; 243 | } 244 | 245 | ParseNode* parseWhile() 246 | { 247 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseWhile()"); 248 | using namespace tokens; 249 | ParseNode* pNode = createNode(current()); 250 | checkAndAdvance(KEYWORD_WHILE); 251 | checkAndAdvance(LPAREN, false); 252 | ParseState state = state_; 253 | state_ = PARSE_STATE_PAREN; 254 | pNode->add(parseNonEmptyExpression()); 255 | state_ = state; 256 | checkAndAdvance(RPAREN, false); 257 | pNode->add(parseNonEmptyExpression()); 258 | return pNode; 259 | } 260 | 261 | ParseNode* parseRepeat() 262 | { 263 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseRepeat()"); 264 | using namespace tokens; 265 | ParseNode* pNode = createNode(current()); 266 | checkAndAdvance(KEYWORD_REPEAT); 267 | pNode->add(parseNonEmptyExpression()); 268 | return pNode; 269 | } 270 | 271 | ParseNode* parseControlFlowKeyword() 272 | { 273 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseControlFlowKeyword('" << token_.contents() << "')"); 274 | using namespace tokens; 275 | 276 | const Token& token = current(); 277 | if (token.isType(KEYWORD_FUNCTION)) 278 | return parseFunctionDefinition(); 279 | else if (token.isType(KEYWORD_IF)) 280 | return parseIf(); 281 | else if (token.isType(KEYWORD_WHILE)) 282 | return parseWhile(); 283 | else if (token.isType(KEYWORD_FOR)) 284 | return parseFor(); 285 | else if (token.isType(KEYWORD_REPEAT)) 286 | return parseRepeat(); 287 | 288 | unexpectedToken(consume(), "expected control-flow keyword"); 289 | return createNode(INVALID); 290 | } 291 | 292 | ParseNode* parseBracedExpression() 293 | { 294 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseBracedExpression()"); 295 | using namespace tokens; 296 | ParseNode* pNode = createNode(current()); 
297 | 298 | checkAndAdvance(LBRACE); 299 | ParseState state = state_; 300 | state_ = PARSE_STATE_BRACE; 301 | skipSemicolons(); 302 | if (current().isType(RBRACE)) 303 | { 304 | pNode->add(createNode(EMPTY)); 305 | } 306 | else 307 | { 308 | while (!current().isType(RBRACE)) 309 | { 310 | if (checkUnexpectedEnd(current())) 311 | break; 312 | pNode->add(parseNonEmptyExpression()); 313 | skipSemicolons(); 314 | } 315 | } 316 | state_ = state; 317 | pNode->setEnd(current()); 318 | checkAndAdvance(RBRACE); 319 | 320 | return pNode; 321 | } 322 | 323 | ParseNode* parseParentheticalExpression() 324 | { 325 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseParentheticalExpression()"); 326 | using namespace tokens; 327 | ParseNode* pNode = createNode(current()); 328 | checkAndAdvance(LPAREN); 329 | ParseState state = state_; 330 | state_ = PARSE_STATE_PAREN; 331 | if (current().isType(RPAREN)) 332 | unexpectedToken(current()); 333 | else 334 | pNode->add(parseNonEmptyExpression()); 335 | state_ = state; 336 | pNode->setEnd(current()); 337 | checkAndAdvance(RPAREN); 338 | return pNode; 339 | } 340 | 341 | ParseNode* parseUnaryOperator() 342 | { 343 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseUnaryOperator()"); 344 | ParseNode* pNode = createNode(current()); 345 | pNode->add(parseNonEmptyExpression(precedence::unary(consume()))); 346 | return pNode; 347 | } 348 | 349 | ParseNode* parseExpressionStart() 350 | { 351 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpressionStart('" << current().contents() << "')"); 352 | SOURCE_TOOLS_DEBUG_PARSER_LOG("Type: " << toString(current().type())); 353 | using namespace tokens; 354 | 355 | skipSemicolons(); 356 | const Token& token = current(); 357 | 358 | if (isControlFlowKeyword(token)) 359 | return parseControlFlowKeyword(); 360 | else if (token.isType(LBRACE)) 361 | return parseBracedExpression(); 362 | else if (token.isType(LPAREN)) 363 | return parseParentheticalExpression(); 364 | else if (isUnaryOperator(token)) 365 | return parseUnaryOperator(); 366 | 
else if (isSymbolic(token) || isKeyword(token)) 367 | return createNode(consume()); 368 | else if (token.isType(END)) 369 | return NULL; 370 | 371 | unexpectedToken(consume()); 372 | return createNode(INVALID); 373 | } 374 | 375 | ParseNode* parseFunctionCallOne(TokenType rhsType) 376 | { 377 | using namespace tokens; 378 | 379 | const Token& token = current(); 380 | if (token.isType(COMMA) || token.isType(rhsType)) 381 | return createNode(Token(MISSING)); 382 | 383 | if (peek(1).isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 384 | { 385 | ParseNode* pLhs = createNode(consume()); 386 | ParseNode* pNode = createNode(consume()); 387 | pNode->add(pLhs); 388 | 389 | if (current().isType(COMMA) || current().isType(rhsType)) 390 | pNode->add(createNode(MISSING)); 391 | else 392 | pNode->add(parseNonEmptyExpression()); 393 | 394 | return pNode; 395 | } 396 | 397 | return parseNonEmptyExpression(); 398 | } 399 | 400 | // Parse a function call, e.g. 401 | // 402 | // ::= 403 | // 404 | // can be one of '(', '[' or '[[', 405 | // are (potentially named) comma-separated values 406 | // is the complement of the above. 407 | // 408 | // Parsing a function call is surprisingly tricky, due to the 409 | // nature of allowing a mixture of unnamed, named, and missing 410 | // arguments. 411 | ParseNode* parseFunctionCall(ParseNode* pLhs) 412 | { 413 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionCall('" << current().contents() << "')"); 414 | using namespace tokens; 415 | TokenType lhsType = current().type(); 416 | TokenType rhsType = complement(lhsType); 417 | 418 | ParseNode* pNode = createNode(current()); 419 | pNode->add(pLhs); 420 | 421 | checkAndAdvance(lhsType); 422 | 423 | ParseState state = state_; 424 | state_ = PARSE_STATE_PAREN; 425 | 426 | if (current().isType(rhsType)) 427 | { 428 | pNode->add(lhsType == LPAREN ? 
429 | createNode(Token(EMPTY)) : 430 | createNode(Token(MISSING))); 431 | } 432 | else 433 | { 434 | while (true) 435 | { 436 | if (checkUnexpectedEnd(current())) 437 | break; 438 | 439 | pNode->add(parseFunctionCallOne(rhsType)); 440 | 441 | const Token& token = current(); 442 | if (token.isType(COMMA)) 443 | { 444 | consume(); 445 | continue; 446 | } 447 | else if (token.isType(rhsType)) 448 | { 449 | break; 450 | } 451 | 452 | std::string message = std::string() + 453 | "expected ',' or '" + toString(rhsType) + "'"; 454 | unexpectedToken(current(), message); 455 | } 456 | } 457 | 458 | checkAndAdvance(rhsType); 459 | 460 | state_ = state; 461 | 462 | if (isCallOperator(current()) && canParseExpressionContinuation()) 463 | return parseFunctionCall(pNode); 464 | return pNode; 465 | } 466 | 467 | ParseNode* parseExpressionContinuation(ParseNode* pNode) 468 | { 469 | using namespace tokens; 470 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpressionContinuation('" << current().contents() << "')"); 471 | SOURCE_TOOLS_DEBUG_PARSER_LOG("Type: " << toString(current().type())); 472 | 473 | Token token = current(); 474 | if (isCallOperator(token)) 475 | return parseFunctionCall(pNode); 476 | else if (token.isType(END)) 477 | return createNode(token); 478 | 479 | ParseNode* pNew = createNode(token); 480 | pNew->add(pNode); 481 | 482 | advance(); 483 | int precedence = 484 | precedence::binary(token) - 485 | precedence::isRightAssociative(token); 486 | pNew->add(parseNonEmptyExpression(precedence)); 487 | 488 | return pNew; 489 | } 490 | 491 | bool canParseExpressionContinuation(int precedence = 0) 492 | { 493 | if (precedence >= precedence::binary(current())) 494 | return false; 495 | 496 | if (state_ == PARSE_STATE_PAREN) 497 | return true; 498 | 499 | index_type lhs = previous().row(); 500 | index_type rhs = current().row(); 501 | if (previous().isType(tokens::STRING)) 502 | { 503 | lhs += std::count(previous().begin(), previous().end(), '\n'); 504 | } 505 | 506 | return lhs == 
rhs; 507 | 508 | } 509 | 510 | ParseNode* parseExpression(int precedence = 0) 511 | { 512 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpression(" << precedence << ")"); 513 | using namespace tokens; 514 | ParseNode* pNode = parseExpressionStart(); 515 | while (canParseExpressionContinuation(precedence)) 516 | pNode = parseExpressionContinuation(pNode); 517 | return pNode; 518 | } 519 | 520 | ParseNode* parseNonEmptyExpression(int precedence = 0) 521 | { 522 | if (checkUnexpectedEnd(current())) 523 | return ParseNode::create(tokens::MISSING); 524 | return parseExpression(precedence); 525 | } 526 | 527 | // Tokenization ---- 528 | 529 | const Token& current() const { return token_; } 530 | const Token& previous() const { return previous_; } 531 | 532 | Token consume() 533 | { 534 | Token token = current(); 535 | advance(); 536 | return token; 537 | } 538 | 539 | bool advance() 540 | { 541 | previous_ = token_; 542 | using namespace tokens; 543 | 544 | bool success = tokenizer_.tokenize(&token_); 545 | while (success && (isComment(token_) || isWhitespace(token_))) 546 | success = tokenizer_.tokenize(&token_); 547 | return success; 548 | } 549 | 550 | bool check(TokenType type) 551 | { 552 | const Token& token = current(); 553 | bool success = token.isType(type); 554 | if (!success) 555 | unexpectedToken(token, type); 556 | return success; 557 | } 558 | 559 | bool checkAndAdvance(TokenType type, bool advanceOnError = true) 560 | { 561 | bool result = check(type); 562 | if (result || advanceOnError) advance(); 563 | return result; 564 | } 565 | 566 | Token peek(index_type lookahead = 0, 567 | bool skipWhitespace = true, 568 | bool skipComments = true) 569 | { 570 | index_type offset = lookahead; 571 | 572 | while (true) 573 | { 574 | Token result = tokenizer_.peek(offset); 575 | if ((skipWhitespace && result.isType(tokens::WHITESPACE)) || 576 | (skipComments && result.isType(tokens::COMMENT))) 577 | { 578 | ++offset; 579 | continue; 580 | } 581 | 582 | if (lookahead == 0) 
583 | return result; 584 | 585 | --lookahead; 586 | } 587 | 588 | } 589 | 590 | // Utils ---- 591 | 592 | ParseNode* createNode(TokenType type) 593 | { 594 | return ParseNode::create(type); 595 | } 596 | 597 | ParseNode* createNode(const Token& token) 598 | { 599 | ParseNode* pNode = ParseNode::create(token); 600 | pStatus_->recordNodeLocation(token.position(), pNode); 601 | return pNode; 602 | } 603 | 604 | void skipSemicolons() 605 | { 606 | while (current().isType(tokens::SEMI)) 607 | { 608 | if (state_ == PARSE_STATE_PAREN) 609 | unexpectedToken(consume()); 610 | else 611 | advance(); 612 | } 613 | } 614 | 615 | public: 616 | 617 | ParseNode* parse(ParseStatus* pStatus) 618 | { 619 | pStatus_ = pStatus; 620 | ParseNode* root = createNode(tokens::ROOT); 621 | 622 | while (true) 623 | { 624 | ParseNode* pNode = parseExpression(); 625 | if (!pNode) 626 | break; 627 | 628 | root->add(pNode); 629 | } 630 | 631 | return root; 632 | } 633 | 634 | }; 635 | 636 | } // namespace parser 637 | 638 | void log(parser::ParseNode* pNode, int depth = 0); 639 | 640 | } // namespace sourcetools 641 | 642 | #endif /* SOURCETOOLS_PARSE_PARSER_H */ 643 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/Precedence.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PRECEDENCE_H 2 | #define SOURCETOOLS_PARSE_PRECEDENCE_H 3 | 4 | #include 5 | 6 | namespace sourcetools { 7 | namespace parser { 8 | namespace precedence { 9 | 10 | inline int binary(const tokens::Token& token) 11 | { 12 | using namespace tokens; 13 | switch (token.type()) 14 | { 15 | case OPERATOR_HELP: 16 | return 10; 17 | case OPERATOR_ASSIGN_LEFT_COLON: 18 | return 20; 19 | case OPERATOR_ASSIGN_LEFT: 20 | case OPERATOR_ASSIGN_LEFT_EQUALS: 21 | case OPERATOR_ASSIGN_LEFT_PARENT: 22 | return 30; 23 | case OPERATOR_ASSIGN_RIGHT: 24 | case OPERATOR_ASSIGN_RIGHT_PARENT: 25 | return 40; 26 | case 
OPERATOR_FORMULA: 27 | return 50; 28 | case OPERATOR_PIPE: 29 | case OPERATOR_PIPE_BIND: 30 | return 55; 31 | case OPERATOR_OR_SCALAR: 32 | case OPERATOR_OR_VECTOR: 33 | return 60; 34 | case OPERATOR_AND_SCALAR: 35 | case OPERATOR_AND_VECTOR: 36 | return 70; 37 | case OPERATOR_NEGATION: 38 | return 80; 39 | case OPERATOR_LESS: 40 | case OPERATOR_LESS_OR_EQUAL: 41 | case OPERATOR_GREATER: 42 | case OPERATOR_GREATER_OR_EQUAL: 43 | case OPERATOR_EQUAL: 44 | case OPERATOR_NOT_EQUAL: 45 | return 90; 46 | case OPERATOR_PLUS: 47 | case OPERATOR_MINUS: 48 | return 100; 49 | case OPERATOR_MULTIPLY: 50 | case OPERATOR_DIVIDE: 51 | return 110; 52 | case OPERATOR_USER: 53 | return 120; 54 | case OPERATOR_SEQUENCE: 55 | return 130; 56 | case OPERATOR_EXPONENTATION_STARS: 57 | case OPERATOR_HAT: 58 | return 150; 59 | case LPAREN: 60 | case LBRACKET: 61 | case LDBRACKET: 62 | return 170; 63 | case OPERATOR_DOLLAR: 64 | case OPERATOR_AT: 65 | return 180; 66 | case OPERATOR_NAMESPACE_EXPORTS: 67 | case OPERATOR_NAMESPACE_ALL: 68 | return 190; 69 | 70 | default: 71 | return 0; 72 | } 73 | } 74 | 75 | inline int unary(const tokens::Token& token) 76 | { 77 | using namespace tokens; 78 | switch (token.type()) 79 | { 80 | case OPERATOR_HELP: 81 | return 10; 82 | case OPERATOR_FORMULA: 83 | return 50; 84 | case OPERATOR_NEGATION: 85 | return 80; 86 | case OPERATOR_PLUS: 87 | case OPERATOR_MINUS: 88 | return 140; 89 | default: 90 | return 0; 91 | } 92 | } 93 | 94 | inline bool isRightAssociative(const tokens::Token& token) 95 | { 96 | using namespace tokens; 97 | switch (token.type()) 98 | { 99 | case OPERATOR_ASSIGN_LEFT: 100 | case OPERATOR_ASSIGN_LEFT_PARENT: 101 | case OPERATOR_ASSIGN_LEFT_EQUALS: 102 | case OPERATOR_EXPONENTATION_STARS: 103 | case OPERATOR_HAT: 104 | case LPAREN: 105 | case LBRACKET: 106 | case LDBRACKET: 107 | return true; 108 | default: 109 | return false; 110 | } 111 | } 112 | 113 | } // namespace precedence 114 | } // namespace parser 115 | } // namespace 
sourcetools 116 | 117 | #endif /* SOURCETOOLS_PARSE_PRECEDENCE_H */ 118 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/parse.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_H 2 | #define SOURCETOOLS_PARSE_PARSE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #endif /* SOURCETOOLS_PARSE_PARSE_H */ 11 | -------------------------------------------------------------------------------- /inst/include/sourcetools/platform/platform.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PLATFORM_PLATFORM_H 2 | #define SOURCETOOLS_PLATFORM_PLATFORM_H 3 | 4 | #ifdef _WIN32 5 | # define SOURCETOOLS_PLATFORM_WINDOWS 6 | #endif 7 | 8 | #ifdef __APPLE__ 9 | # define SOURCETOOLS_PLATFORM_MACOS 10 | #endif 11 | 12 | #ifdef __linux__ 13 | # define SOURCETOOLS_PLATFORM_LINUX 14 | #endif 15 | 16 | #if defined(__sun) && defined(__SVR4) 17 | # define SOURCETOOLS_PLATFORM_SOLARIS 18 | #endif 19 | 20 | #if __cplusplus >= 201103L 21 | # define SOURCETOOLS_COMPILER_CXX11 22 | #endif 23 | 24 | #endif /* SOURCETOOLS_PLATFORM_PLATFORM_H */ 25 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RCallRecurser.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_CALL_RECURSER_H 2 | #define SOURCETOOLS_R_R_CALL_RECURSER_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | 12 | namespace sourcetools { 13 | namespace r { 14 | 15 | class CallRecurser : noncopyable 16 | { 17 | public: 18 | 19 | class Operation 20 | { 21 | public: 22 | virtual void apply(SEXP dataSEXP) = 0; 23 | virtual ~Operation() {} 24 | }; 25 | 26 | explicit CallRecurser(SEXP dataSEXP) 27 | { 28 | if (Rf_isPrimitive(dataSEXP)) 29 | dataSEXP_ = R_NilValue; 30 | else if 
(Rf_isFunction(dataSEXP)) 31 | dataSEXP_ = r::util::functionBody(dataSEXP); 32 | else if (TYPEOF(dataSEXP) == LANGSXP) 33 | dataSEXP_ = dataSEXP; 34 | else 35 | dataSEXP_ = R_NilValue; 36 | } 37 | 38 | void add(Operation* pOperation) 39 | { 40 | operations_.push_back(pOperation); 41 | } 42 | 43 | void run() 44 | { 45 | runImpl(dataSEXP_); 46 | } 47 | 48 | void runImpl(SEXP dataSEXP) 49 | { 50 | for (std::vector::iterator it = operations_.begin(); 51 | it != operations_.end(); 52 | ++it) 53 | { 54 | (*it)->apply(dataSEXP); 55 | } 56 | 57 | if (TYPEOF(dataSEXP) == LANGSXP) 58 | { 59 | while (dataSEXP != R_NilValue) 60 | { 61 | runImpl(CAR(dataSEXP)); 62 | dataSEXP = CDR(dataSEXP); 63 | } 64 | } 65 | } 66 | 67 | private: 68 | SEXP dataSEXP_; 69 | std::vector operations_; 70 | }; 71 | 72 | } // namespace r 73 | } // namespace sourcetools 74 | 75 | #endif /* SOURCETOOLS_R_R_CALL_RECURSER_H */ 76 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_CONVERTER_H 2 | #define SOURCETOOLS_R_R_CONVERTER_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace r { 12 | 13 | inline SEXP createChar(const std::string& data) 14 | { 15 | return Rf_mkCharLenCE(data.c_str(), data.size(), CE_UTF8); 16 | } 17 | 18 | inline SEXP createString(const std::string& data) 19 | { 20 | Protect protect; 21 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1)); 22 | SET_STRING_ELT(resultSEXP, 0, createChar(data)); 23 | return resultSEXP; 24 | } 25 | 26 | inline SEXP create(const std::vector& vector) 27 | { 28 | Protect protect; 29 | index_type n = vector.size(); 30 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n)); 31 | for (index_type i = 0; i < n; ++i) 32 | SET_STRING_ELT(resultSEXP, i, createChar(vector[i])); 33 | return resultSEXP; 34 | } 35 | 36 | } // 
namespace r 37 | } // namespace sourcetools 38 | 39 | #endif /* SOURCETOOLS_R_R_CONVERTER_H */ 40 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_FUNCTIONS_H 2 | #define SOURCETOOLS_R_R_FUNCTIONS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 | namespace r { 11 | 12 | inline SEXP eval(const std::string& fn, SEXP envSEXP = NULL) 13 | { 14 | Protect protect; 15 | if (envSEXP == NULL) 16 | { 17 | SEXP strSEXP = protect(Rf_mkString("sourcetools")); 18 | envSEXP = R_FindNamespace(strSEXP); 19 | } 20 | 21 | SEXP callSEXP = protect(Rf_lang1(Rf_install(fn.c_str()))); 22 | SEXP resultSEXP = protect(Rf_eval(callSEXP, envSEXP)); 23 | return resultSEXP; 24 | } 25 | 26 | inline std::set objectsOnSearchPath() 27 | { 28 | std::set results; 29 | Protect protect; 30 | 31 | SEXP objectsSEXP; 32 | protect(objectsSEXP = eval("search_objects")); 33 | 34 | for (R_xlen_t i = 0; i < Rf_length(objectsSEXP); ++i) 35 | { 36 | SEXP strSEXP = VECTOR_ELT(objectsSEXP, i); 37 | for (R_xlen_t j = 0; j < Rf_length(strSEXP); ++j) 38 | { 39 | SEXP charSEXP = STRING_ELT(strSEXP, j); 40 | std::string element(CHAR(charSEXP), Rf_length(charSEXP)); 41 | results.insert(element); 42 | } 43 | } 44 | 45 | return results; 46 | } 47 | 48 | namespace util { 49 | 50 | inline void setNames(SEXP dataSEXP, const char** names, index_type n) 51 | { 52 | RObjectFactory factory; 53 | SEXP namesSEXP = factory.create(STRSXP, n); 54 | for (index_type i = 0; i < n; ++i) 55 | SET_STRING_ELT(namesSEXP, i, Rf_mkChar(names[i])); 56 | Rf_setAttrib(dataSEXP, R_NamesSymbol, namesSEXP); 57 | } 58 | 59 | inline void listToDataFrame(SEXP listSEXP, int n) 60 | { 61 | r::Protect protect; 62 | SEXP classSEXP = protect(Rf_mkString("data.frame")); 63 | Rf_setAttrib(listSEXP, R_ClassSymbol, classSEXP); 64 | 65 | SEXP 
rownamesSEXP = protect(Rf_allocVector(INTSXP, 2)); 66 | INTEGER(rownamesSEXP)[0] = NA_INTEGER; 67 | INTEGER(rownamesSEXP)[1] = -n; 68 | Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP); 69 | } 70 | 71 | inline SEXP functionBody(SEXP fnSEXP) 72 | { 73 | SEXP bodyFunctionSEXP = Rf_findFun(Rf_install("body"), R_BaseNamespace); 74 | 75 | r::Protect protect; 76 | SEXP callSEXP = protect(Rf_lang2(bodyFunctionSEXP, fnSEXP)); 77 | return Rf_eval(callSEXP, R_BaseNamespace); 78 | } 79 | 80 | } // namespace util 81 | 82 | } // namespace r 83 | } // namespace sourcetools 84 | 85 | #endif /* SOURCETOOLS_R_R_FUNCTIONS_H */ 86 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RHeaders.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_HEADERS_H 2 | #define SOURCETOOLS_R_R_HEADERS_H 3 | 4 | #define R_NO_REMAP 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_R_R_HEADERS_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RNonStandardEvaluation.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H 2 | #define SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace r { 12 | namespace nse { 13 | 14 | namespace detail { 15 | 16 | inline std::set makeNsePrimitives() 17 | { 18 | std::set instance; 19 | 20 | instance.insert("quote"); 21 | instance.insert("substitute"); 22 | instance.insert("eval"); 23 | instance.insert("evalq"); 24 | instance.insert("lazy_dots"); 25 | 26 | return instance; 27 | } 28 | 29 | inline std::set& nsePrimitives() 30 | { 31 | static std::set instance = makeNsePrimitives(); 32 | return instance; 33 | } 34 | 35 | class PerformsNonStandardEvaluationOperation 36 | : public 
r::CallRecurser::Operation 37 | { 38 | public: 39 | 40 | PerformsNonStandardEvaluationOperation() 41 | : status_(false) 42 | { 43 | } 44 | 45 | virtual void apply(SEXP dataSEXP) 46 | { 47 | if (status_ || TYPEOF(dataSEXP) != LANGSXP) 48 | return; 49 | 50 | if ((status_ = checkCall(dataSEXP))) 51 | return; 52 | 53 | SEXP fnSEXP = CAR(dataSEXP); 54 | if (TYPEOF(fnSEXP) == SYMSXP) 55 | status_ = nsePrimitives().count(CHAR(PRINTNAME(fnSEXP))); 56 | else if (TYPEOF(fnSEXP) == STRSXP) 57 | status_ = nsePrimitives().count(CHAR(STRING_ELT(fnSEXP, 0))); 58 | 59 | } 60 | 61 | bool status() const { return status_; } 62 | 63 | private: 64 | 65 | bool checkCall(SEXP callSEXP) 66 | { 67 | index_type n = Rf_length(callSEXP); 68 | if (n == 0) 69 | return false; 70 | 71 | SEXP fnSEXP = CAR(callSEXP); 72 | if (fnSEXP == Rf_install("::") || fnSEXP == Rf_install(":::")) 73 | { 74 | SEXP lhsSEXP = CADR(callSEXP); 75 | SEXP rhsSEXP = CADDR(callSEXP); 76 | 77 | if (lhsSEXP == Rf_install("lazyeval") && rhsSEXP == Rf_install("lazy_dots")) 78 | return true; 79 | } 80 | 81 | return false; 82 | } 83 | 84 | private: 85 | bool status_; 86 | }; 87 | 88 | } // namespace detail 89 | 90 | class Database 91 | { 92 | public: 93 | bool check(SEXP dataSEXP) 94 | { 95 | if (contains(dataSEXP)) 96 | return get(dataSEXP); 97 | 98 | typedef detail::PerformsNonStandardEvaluationOperation Operation; 99 | scoped_ptr operation(new Operation); 100 | 101 | r::CallRecurser recurser(dataSEXP); 102 | recurser.add(operation); 103 | recurser.run(); 104 | 105 | set(dataSEXP, operation->status()); 106 | return operation->status(); 107 | } 108 | 109 | private: 110 | 111 | bool contains(SEXP dataSEXP) 112 | { 113 | return map_.count(address(dataSEXP)); 114 | } 115 | 116 | bool get(SEXP dataSEXP) 117 | { 118 | return map_[address(dataSEXP)]; 119 | } 120 | 121 | void set(SEXP dataSEXP, bool value) 122 | { 123 | map_[address(dataSEXP)] = value; 124 | } 125 | 126 | std::size_t address(SEXP dataSEXP) 127 | { 128 | return 
reinterpret_cast(dataSEXP); 129 | } 130 | 131 | std::map map_; 132 | }; 133 | 134 | inline Database& database() 135 | { 136 | static Database instance; 137 | return instance; 138 | } 139 | 140 | inline bool performsNonStandardEvaluation(SEXP fnSEXP) 141 | { 142 | return database().check(fnSEXP); 143 | } 144 | 145 | } // namespace nse 146 | } // namespace r 147 | } // namespace sourcetools 148 | 149 | #endif /* SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H */ 150 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RProtect.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_RPROTECT_H 2 | #define SOURCETOOLS_R_RPROTECT_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace r { 10 | 11 | class Protect : noncopyable 12 | { 13 | public: 14 | Protect(): n_(0) {} 15 | ~Protect() { UNPROTECT(n_); } 16 | 17 | SEXP operator()(SEXP objectSEXP) 18 | { 19 | ++n_; 20 | return PROTECT(objectSEXP); 21 | } 22 | 23 | private: 24 | int n_; 25 | }; 26 | 27 | } // end namespace r 28 | } // end namespace sourcetools 29 | 30 | #endif /* SOURCETOOLS_R_RPROTECT_H */ 31 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_UTILS_H 2 | #define SOURCETOOLS_R_R_UTILS_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace r { 14 | 15 | class RObjectFactory : noncopyable 16 | { 17 | public: 18 | 19 | RObjectFactory() 20 | : n_(0) 21 | { 22 | } 23 | 24 | template 25 | SEXP create(SEXPTYPE type, const std::vector& vector, F f) 26 | { 27 | ++n_; 28 | index_type n = vector.size(); 29 | SEXP resultSEXP = PROTECT(Rf_allocVector(type, n)); 30 | for (index_type i = 0; i < n; ++i) 31 | f(resultSEXP, i, vector[i]); 32 | 
return resultSEXP; 33 | } 34 | 35 | SEXP create(SEXPTYPE type, index_type n) 36 | { 37 | ++n_; 38 | return PROTECT(Rf_allocVector(type, n)); 39 | } 40 | 41 | ~RObjectFactory() 42 | { 43 | UNPROTECT(n_); 44 | } 45 | 46 | private: 47 | index_type n_; 48 | }; 49 | 50 | class ListBuilder : noncopyable 51 | { 52 | public: 53 | 54 | void add(const std::string& name, SEXP value) 55 | { 56 | names_.push_back(name); 57 | data_.push_back(protect_(value)); 58 | } 59 | 60 | operator SEXP() const 61 | { 62 | index_type n = data_.size(); 63 | 64 | SEXP resultSEXP = protect_(Rf_allocVector(VECSXP, n)); 65 | SEXP namesSEXP = protect_(Rf_allocVector(STRSXP, n)); 66 | 67 | for (index_type i = 0; i < n; ++i) 68 | { 69 | SET_VECTOR_ELT(resultSEXP, i, data_[i]); 70 | SET_STRING_ELT(namesSEXP, i, createChar(names_[i])); 71 | } 72 | 73 | Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP); 74 | return resultSEXP; 75 | } 76 | 77 | private: 78 | std::vector names_; 79 | std::vector data_; 80 | mutable Protect protect_; 81 | }; 82 | 83 | } // namespace r 84 | } // namespace sourcetools 85 | 86 | #endif /* SOURCETOOLS_R_R_UTILS_H */ 87 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/r.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_H 2 | #define SOURCETOOLS_R_R_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #endif /* SOURCETOOLS_R_R_H */ 13 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/MemoryMappedReader.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_MEMORY_MAPPED_READER_H 2 | #define SOURCETOOLS_READ_MEMORY_MAPPED_READER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #ifndef _WIN32 14 | # include 15 | # include 16 | 
#else 17 | # include 18 | # include 19 | #endif 20 | 21 | namespace sourcetools { 22 | namespace detail { 23 | 24 | class MemoryMappedReader 25 | { 26 | public: 27 | 28 | class VectorReader 29 | { 30 | public: 31 | 32 | explicit VectorReader(std::vector* pData) 33 | : pData_(pData) 34 | { 35 | } 36 | 37 | template 38 | void operator()(const T& lhs, const T& rhs) 39 | { 40 | pData_->push_back(std::string(lhs, rhs)); 41 | } 42 | 43 | private: 44 | std::vector* pData_; 45 | }; 46 | 47 | static bool read(const char* path, std::string* pContent) 48 | { 49 | // Open file connection 50 | FileConnection conn(path); 51 | if (!conn.open()) 52 | return false; 53 | 54 | // Get size of file 55 | index_type size; 56 | if (!conn.size(&size)) 57 | return false; 58 | 59 | // Early return for empty files 60 | if (UNLIKELY(size == 0)) 61 | return true; 62 | 63 | // mmap the file 64 | MemoryMappedConnection map(conn, size); 65 | if (!map.open()) 66 | return false; 67 | 68 | pContent->assign(map, size); 69 | return true; 70 | } 71 | 72 | template 73 | static bool read_lines(const char* path, F f) 74 | { 75 | FileConnection conn(path); 76 | if (!conn.open()) 77 | return false; 78 | 79 | // Get size of file 80 | index_type size; 81 | if (!conn.size(&size)) 82 | return false; 83 | 84 | // Early return for empty files 85 | if (UNLIKELY(size == 0)) 86 | return true; 87 | 88 | // mmap the file 89 | MemoryMappedConnection map(conn, size); 90 | if (!map.open()) 91 | return false; 92 | 93 | // special case: just a '\n' 94 | bool endsWithNewline = 95 | map[size - 1] == '\n' || 96 | map[size - 1] == '\r'; 97 | 98 | if (size == 1 && endsWithNewline) 99 | return true; 100 | 101 | // Search for newlines 102 | const char* lower = map; 103 | const char* end = map + size; 104 | 105 | for (const char* it = lower; it != end; it++) 106 | { 107 | // check for newline 108 | char ch = *it; 109 | bool isNewline = ch == '\r' || ch == '\n'; 110 | if (!isNewline) 111 | continue; 112 | 113 | // found a newline; 
call functor 114 | f(lower, it); 115 | 116 | // update iterator, handling '\r\n' specially 117 | if (it[0] == '\r' && 118 | it[1] == '\n') 119 | { 120 | it += 1; 121 | } 122 | 123 | // update lower iterator 124 | lower = it + 1; 125 | 126 | } 127 | 128 | // If this file ended with a newline, we're done 129 | if (endsWithNewline) 130 | return true; 131 | 132 | // Otherwise, consume one more string, then we're done 133 | f(lower, end); 134 | return true; 135 | } 136 | 137 | static bool read_lines(const char* path, std::vector* pContent) 138 | { 139 | VectorReader reader(pContent); 140 | return read_lines(path, reader); 141 | } 142 | 143 | }; 144 | 145 | } // namespace detail 146 | } // namespace sourcetools 147 | 148 | #endif /* SOURCETOOLS_READ_MEMORY_MAPPED_READER_H */ 149 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/posix/FileConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 2 | #define SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace detail { 12 | 13 | class FileConnection 14 | { 15 | public: 16 | 17 | typedef int FileDescriptor; 18 | 19 | FileConnection(const char* path, int flags = O_RDONLY) 20 | { 21 | fd_ = ::open(path, flags); 22 | } 23 | 24 | ~FileConnection() 25 | { 26 | if (open()) 27 | ::close(fd_); 28 | } 29 | 30 | bool open() 31 | { 32 | return fd_ != -1; 33 | } 34 | 35 | bool size(index_type* pSize) 36 | { 37 | struct stat info; 38 | if (::fstat(fd_, &info) == -1) 39 | return false; 40 | 41 | *pSize = info.st_size; 42 | return true; 43 | } 44 | 45 | operator FileDescriptor() const 46 | { 47 | return fd_; 48 | } 49 | 50 | private: 51 | FileDescriptor fd_; 52 | }; 53 | 54 | 55 | } // namespace detail 56 | } // namespace sourcetools 57 | 58 | #endif /* SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 
*/ 59 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/posix/MemoryMappedConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H 2 | #define SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace detail { 12 | 13 | class MemoryMappedConnection 14 | { 15 | public: 16 | 17 | MemoryMappedConnection(int fd, index_type size) 18 | : size_(size) 19 | { 20 | #ifdef MAP_POPULATE 21 | map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0); 22 | #else 23 | map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); 24 | #endif 25 | 26 | #if defined(POSIX_MADV_SEQUENTIAL) && defined(POSIX_MADV_WILLNEED) 27 | ::posix_madvise((void*) map_, size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED); 28 | #endif 29 | } 30 | 31 | ~MemoryMappedConnection() 32 | { 33 | if (map_ != MAP_FAILED) 34 | ::munmap(map_, size_); 35 | } 36 | 37 | bool open() 38 | { 39 | return map_ != MAP_FAILED; 40 | } 41 | 42 | operator char*() const 43 | { 44 | return map_; 45 | } 46 | 47 | private: 48 | char* map_; 49 | index_type size_; 50 | }; 51 | 52 | } // namespace detail 53 | } // namespace sourcetools 54 | 55 | #endif /* SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H */ 56 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/read.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_READ_H 2 | #define SOURCETOOLS_READ_READ_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 | 11 | inline bool read(const std::string& absolutePath, std::string* pContent) 12 | { 13 | return detail::MemoryMappedReader::read(absolutePath.c_str(), pContent); 14 | } 15 | 16 | inline bool 
read_lines(const std::string& absolutePath, 17 | std::vector* pLines) 18 | { 19 | return detail::MemoryMappedReader::read_lines(absolutePath.c_str(), pLines); 20 | } 21 | 22 | } // namespace sourcetools 23 | 24 | #endif /* SOURCETOOLS_READ_READ_H */ 25 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/windows/FileConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H 2 | #define SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H 3 | 4 | #undef Realloc 5 | #undef Free 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace detail { 10 | 11 | class FileConnection 12 | { 13 | public: 14 | typedef HANDLE FileDescriptor; 15 | 16 | FileConnection(const char* path, int flags = GENERIC_READ) 17 | { 18 | handle_ = ::CreateFile(path, flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); 19 | } 20 | 21 | ~FileConnection() 22 | { 23 | if (open()) 24 | ::CloseHandle(handle_); 25 | } 26 | 27 | bool open() 28 | { 29 | return handle_ != INVALID_HANDLE_VALUE; 30 | } 31 | 32 | bool size(index_type* pSize) 33 | { 34 | *pSize = ::GetFileSize(handle_, NULL); 35 | return true; 36 | } 37 | 38 | operator FileDescriptor() const 39 | { 40 | return handle_; 41 | } 42 | 43 | private: 44 | FileDescriptor handle_; 45 | }; 46 | 47 | } // namespace detail 48 | } // namespace sourcetools 49 | 50 | #endif /* SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H */ 51 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/windows/MemoryMappedConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H 2 | #define SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H 3 | 4 | #undef Realloc 5 | #undef Free 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace sourcetools { 12 | namespace detail { 13 | 14 | 
class MemoryMappedConnection 15 | { 16 | public: 17 | 18 | MemoryMappedConnection(HANDLE handle, index_type size) 19 | : map_(NULL), size_(size) 20 | { 21 | handle_ = ::CreateFileMapping(handle, NULL, PAGE_READONLY, 0, 0, NULL); 22 | if (handle_ == NULL) 23 | return; 24 | 25 | map_ = (char*) ::MapViewOfFile(handle_, FILE_MAP_READ, 0, 0, size); 26 | } 27 | 28 | ~MemoryMappedConnection() 29 | { 30 | if (handle_ != INVALID_HANDLE_VALUE) 31 | ::CloseHandle(handle_); 32 | 33 | if (map_ != NULL) 34 | ::UnmapViewOfFile(map_); 35 | } 36 | 37 | bool open() 38 | { 39 | return map_ != NULL; 40 | } 41 | 42 | operator char*() const 43 | { 44 | return map_; 45 | } 46 | 47 | private: 48 | char* map_; 49 | index_type size_; 50 | HANDLE handle_; 51 | }; 52 | 53 | } // namespace detail 54 | } // namespace sourcetools 55 | 56 | #endif /* SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H */ 57 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Registration.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_REGISTRATION_H 2 | #define SOURCETOOLS_TOKENIZATION_REGISTRATION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace tokens { 10 | 11 | typedef unsigned int TokenType; 12 | 13 | // Simple, non-nestable types. 
14 | #define SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(__NAME__, __TYPE__) \ 15 | static const TokenType __NAME__ = __TYPE__ 16 | 17 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(INVALID, (1 << 31)); 18 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(END, (1 << 30)); 19 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(EMPTY, (1 << 29)); 20 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(MISSING, (1 << 28)); 21 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(ROOT, (1 << 27)); 22 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SEMI, (1 << 26)); 23 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMA, (1 << 25)); 24 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SYMBOL, (1 << 24)); 25 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMENT, (1 << 23)); 26 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(WHITESPACE, (1 << 22)); 27 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(STRING, (1 << 21)); 28 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(NUMBER, (1 << 20)); 29 | 30 | /* Brackets */ 31 | #define SOURCE_TOOLS_BRACKET_BIT (1 << 19) 32 | #define SOURCE_TOOLS_BRACKET_RIGHT_BIT (1 << 5) 33 | #define SOURCE_TOOLS_BRACKET_LEFT_BIT (1 << 4) 34 | #define SOURCE_TOOLS_BRACKET_MASK SOURCE_TOOLS_BRACKET_BIT 35 | #define SOURCE_TOOLS_BRACKET_LEFT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT) 36 | #define SOURCE_TOOLS_BRACKET_RIGHT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT) 37 | 38 | #define SOURCE_TOOLS_REGISTER_BRACKET(__NAME__, __SIDE__, __INDEX__) \ 39 | static const TokenType __NAME__ = \ 40 | SOURCE_TOOLS_BRACKET_BIT | __SIDE__ | __INDEX__ 41 | 42 | SOURCE_TOOLS_REGISTER_BRACKET(LPAREN, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 0)); 43 | SOURCE_TOOLS_REGISTER_BRACKET(LBRACE, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 1)); 44 | SOURCE_TOOLS_REGISTER_BRACKET(LBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 2)); 45 | SOURCE_TOOLS_REGISTER_BRACKET(LDBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 3)); 46 | 47 | SOURCE_TOOLS_REGISTER_BRACKET(RPAREN, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 0)); 48 | SOURCE_TOOLS_REGISTER_BRACKET(RBRACE, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 1)); 49 | 
SOURCE_TOOLS_REGISTER_BRACKET(RBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 2)); 50 | SOURCE_TOOLS_REGISTER_BRACKET(RDBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 3)); 51 | 52 | /* Operators */ 53 | #define SOURCE_TOOLS_OPERATOR_BIT (1 << 18) 54 | #define SOURCE_TOOLS_OPERATOR_UNARY_BIT (1 << 6) 55 | #define SOURCE_TOOLS_OPERATOR_MASK (SOURCE_TOOLS_OPERATOR_BIT) 56 | #define SOURCE_TOOLS_OPERATOR_UNARY_MASK (SOURCE_TOOLS_OPERATOR_MASK | SOURCE_TOOLS_OPERATOR_UNARY_BIT) 57 | 58 | #define SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, __MASKS__) \ 59 | \ 60 | static const TokenType OPERATOR_ ## __NAME__ = \ 61 | SOURCE_TOOLS_OPERATOR_BIT | __MASKS__; \ 62 | \ 63 | static const char* const \ 64 | OPERATOR_ ## __NAME__ ## _STRING = __STRING__ 65 | 66 | #define SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(__NAME__, __STRING__, __INDEX__) \ 67 | SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, SOURCE_TOOLS_OPERATOR_UNARY_BIT | __INDEX__) 68 | 69 | // See ?"Syntax" for details on R's operators. 70 | // Note: All operators registered work in a binary context, but only 71 | // some will work as unary operators. (Occurring to the left of the token). 72 | // 73 | // In other words, -1 is parsed as `-`(1). 74 | // 75 | // Note that although brackets are operators we tokenize them separately, 76 | // since we need to later check for their paired complement. 
77 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(PLUS, "+", 0); 78 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(MINUS, "-", 1); 79 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(HELP, "?", 2); 80 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(NEGATION, "!", 3); 81 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(FORMULA, "~", 4); 82 | 83 | SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_EXPORTS, "::", 5); 84 | SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_ALL, ":::", 6); 85 | SOURCE_TOOLS_REGISTER_OPERATOR(DOLLAR, "$", 7); 86 | SOURCE_TOOLS_REGISTER_OPERATOR(AT, "@", 8); 87 | SOURCE_TOOLS_REGISTER_OPERATOR(HAT, "^", 9); 88 | SOURCE_TOOLS_REGISTER_OPERATOR(EXPONENTATION_STARS, "**", 10); 89 | SOURCE_TOOLS_REGISTER_OPERATOR(SEQUENCE, ":", 11); 90 | SOURCE_TOOLS_REGISTER_OPERATOR(MULTIPLY, "*", 12); 91 | SOURCE_TOOLS_REGISTER_OPERATOR(DIVIDE, "/", 13); 92 | SOURCE_TOOLS_REGISTER_OPERATOR(LESS, "<", 14); 93 | SOURCE_TOOLS_REGISTER_OPERATOR(LESS_OR_EQUAL, "<=", 15); 94 | SOURCE_TOOLS_REGISTER_OPERATOR(GREATER, ">", 16); 95 | SOURCE_TOOLS_REGISTER_OPERATOR(GREATER_OR_EQUAL, ">=", 17); 96 | SOURCE_TOOLS_REGISTER_OPERATOR(EQUAL, "==", 18); 97 | SOURCE_TOOLS_REGISTER_OPERATOR(NOT_EQUAL, "!=", 19); 98 | SOURCE_TOOLS_REGISTER_OPERATOR(AND_VECTOR, "&", 20); 99 | SOURCE_TOOLS_REGISTER_OPERATOR(AND_SCALAR, "&&", 21); 100 | SOURCE_TOOLS_REGISTER_OPERATOR(OR_VECTOR, "|", 22); 101 | SOURCE_TOOLS_REGISTER_OPERATOR(OR_SCALAR, "||", 23); 102 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT, "<-", 24); 103 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_PARENT, "<<-", 25); 104 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT, "->", 26); 105 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT_PARENT, "->>", 27); 106 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_EQUALS, "=", 28); 107 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_COLON, ":=", 29); 108 | SOURCE_TOOLS_REGISTER_OPERATOR(USER, "%%", 30); 109 | SOURCE_TOOLS_REGISTER_OPERATOR(PIPE, "|>", 31); 110 | SOURCE_TOOLS_REGISTER_OPERATOR(PIPE_BIND, ">=", 32); 111 | 112 | /* Keywords and symbols */ 113 | 
#define SOURCE_TOOLS_KEYWORD_BIT (1 << 17) 114 | #define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT (1 << 7) 115 | #define SOURCE_TOOLS_KEYWORD_MASK SOURCE_TOOLS_KEYWORD_BIT 116 | #define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK (SOURCE_TOOLS_KEYWORD_MASK | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT) 117 | 118 | #define SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__) \ 119 | static const TokenType KEYWORD_ ## __NAME__ = \ 120 | __MASKS__ | SOURCE_TOOLS_KEYWORD_MASK 121 | 122 | #define SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(__NAME__, __MASKS__) \ 123 | SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__ | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK) 124 | 125 | // See '?Reserved' for a list of reversed R symbols. 126 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(IF, 1); 127 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FOR, 2); 128 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(WHILE, 3); 129 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(REPEAT, 4); 130 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FUNCTION, 5); 131 | 132 | SOURCE_TOOLS_REGISTER_KEYWORD(ELSE, 6); 133 | SOURCE_TOOLS_REGISTER_KEYWORD(IN, 7); 134 | SOURCE_TOOLS_REGISTER_KEYWORD(NEXT, 8); 135 | SOURCE_TOOLS_REGISTER_KEYWORD(BREAK, 9); 136 | SOURCE_TOOLS_REGISTER_KEYWORD(TRUE, 10); 137 | SOURCE_TOOLS_REGISTER_KEYWORD(FALSE, 11); 138 | SOURCE_TOOLS_REGISTER_KEYWORD(NULL, 12); 139 | SOURCE_TOOLS_REGISTER_KEYWORD(Inf, 13); 140 | SOURCE_TOOLS_REGISTER_KEYWORD(NaN, 14); 141 | SOURCE_TOOLS_REGISTER_KEYWORD(NA, 15); 142 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_integer_, 16); 143 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_real_, 17); 144 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_complex_, 18); 145 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_character_, 19); 146 | 147 | inline TokenType symbolType(const char* string, index_type n) 148 | { 149 | // TODO: Is this insanity really an optimization or am I just silly? 
150 | if (n < 2 || n > 13) { 151 | return SYMBOL; 152 | } else if (n == 2) { 153 | if (!std::memcmp(string, "in", n)) return KEYWORD_IN; 154 | if (!std::memcmp(string, "if", n)) return KEYWORD_IF; 155 | if (!std::memcmp(string, "NA", n)) return KEYWORD_NA; 156 | } else if (n == 3) { 157 | if (!std::memcmp(string, "for", n)) return KEYWORD_FOR; 158 | if (!std::memcmp(string, "Inf", n)) return KEYWORD_Inf; 159 | if (!std::memcmp(string, "NaN", n)) return KEYWORD_NaN; 160 | } else if (n == 4) { 161 | if (!std::memcmp(string, "else", n)) return KEYWORD_ELSE; 162 | if (!std::memcmp(string, "next", n)) return KEYWORD_NEXT; 163 | if (!std::memcmp(string, "TRUE", n)) return KEYWORD_TRUE; 164 | if (!std::memcmp(string, "NULL", n)) return KEYWORD_NULL; 165 | } else if (n == 5) { 166 | if (!std::memcmp(string, "while", n)) return KEYWORD_WHILE; 167 | if (!std::memcmp(string, "break", n)) return KEYWORD_BREAK; 168 | if (!std::memcmp(string, "FALSE", n)) return KEYWORD_FALSE; 169 | } else if (n == 6) { 170 | if (!std::memcmp(string, "repeat", n)) return KEYWORD_REPEAT; 171 | } else if (n == 8) { 172 | if (!std::memcmp(string, "function", n)) return KEYWORD_FUNCTION; 173 | if (!std::memcmp(string, "NA_real_", n)) return KEYWORD_NA_real_; 174 | } else if (n == 11) { 175 | if (!std::memcmp(string, "NA_integer_", n)) return KEYWORD_NA_integer_; 176 | if (!std::memcmp(string, "NA_complex_", n)) return KEYWORD_NA_complex_; 177 | } else if (n == 13) { 178 | if (!std::memcmp(string, "NA_character_", n)) return KEYWORD_NA_character_; 179 | } 180 | 181 | return SYMBOL; 182 | } 183 | 184 | inline TokenType symbolType(const std::string& symbol) 185 | { 186 | return symbolType(symbol.data(), symbol.size()); 187 | } 188 | 189 | } // namespace tokens 190 | } // namespace sourcetools 191 | 192 | #endif /* SOURCETOOLS_TOKENIZATION_REGISTRATION_H */ 193 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Token.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKEN_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKEN_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace sourcetools { 17 | namespace tokens { 18 | 19 | class Token 20 | { 21 | private: 22 | typedef cursors::TextCursor TextCursor; 23 | typedef collections::Position Position; 24 | 25 | public: 26 | 27 | Token() 28 | : begin_(NULL), 29 | end_(NULL), 30 | offset_(-1), 31 | position_(-1, -1), 32 | type_(INVALID) 33 | { 34 | } 35 | 36 | explicit Token(TokenType type) 37 | : begin_(NULL), 38 | end_(NULL), 39 | offset_(-1), 40 | position_(-1, -1), 41 | type_(type) 42 | { 43 | } 44 | 45 | Token(const Position& position) 46 | : begin_(NULL), 47 | end_(NULL), 48 | offset_(-1), 49 | position_(position), 50 | type_(INVALID) 51 | { 52 | } 53 | 54 | Token(const TextCursor& cursor, TokenType type, index_type length) 55 | : begin_(cursor.begin() + cursor.offset()), 56 | end_(cursor.begin() + cursor.offset() + length), 57 | offset_(cursor.offset()), 58 | position_(cursor.position()), 59 | type_(type) 60 | { 61 | } 62 | 63 | const char* begin() const { return begin_; } 64 | const char* end() const { return end_; } 65 | index_type offset() const { return offset_; } 66 | index_type size() const { return end_ - begin_; } 67 | 68 | std::string contents() const 69 | { 70 | return std::string(begin_, end_); 71 | } 72 | 73 | bool contentsEqual(const char* string) 74 | { 75 | return std::strcmp(begin_, string); 76 | } 77 | 78 | bool contentsEqual(const std::string& string) const 79 | { 80 | if (utils::size(string) != size()) 81 | return false; 82 | 83 | return std::memcmp(begin_, string.c_str(), size()) == 0; 84 | } 85 | 86 | const Position& position() const { return position_; } 87 | index_type row() const { return position_.row; } 88 | index_type column() const { return 
position_.column; } 89 | 90 | TokenType type() const { return type_; } 91 | bool isType(TokenType type) const { return type_ == type; } 92 | 93 | private: 94 | const char* begin_; 95 | const char* end_; 96 | index_type offset_; 97 | 98 | Position position_; 99 | TokenType type_; 100 | }; 101 | 102 | inline bool isBracket(const Token& token) 103 | { 104 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_MASK); 105 | } 106 | 107 | inline bool isLeftBracket(const Token& token) 108 | { 109 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_LEFT_MASK); 110 | } 111 | 112 | inline bool isRightBracket(const Token& token) 113 | { 114 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_RIGHT_MASK); 115 | } 116 | 117 | inline bool isComplement(TokenType lhs, TokenType rhs) 118 | { 119 | static const TokenType mask = 120 | SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT; 121 | 122 | if (SOURCE_TOOLS_CHECK_MASK((lhs | rhs), mask)) 123 | return SOURCE_TOOLS_LOWER_BITS(lhs, 4) == SOURCE_TOOLS_LOWER_BITS(rhs, 4); 124 | 125 | return false; 126 | } 127 | 128 | inline TokenType complement(TokenType type) 129 | { 130 | static const TokenType mask = 131 | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT; 132 | 133 | return type ^ mask; 134 | } 135 | 136 | inline bool isKeyword(const Token& token) 137 | { 138 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_MASK); 139 | } 140 | 141 | inline bool isControlFlowKeyword(const Token& token) 142 | { 143 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK); 144 | } 145 | 146 | inline bool isOperator(const Token& token) 147 | { 148 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_MASK); 149 | } 150 | 151 | inline bool isUnaryOperator(const Token& token) 152 | { 153 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_UNARY_MASK); 154 | } 155 | 156 | 
inline bool isNonUnaryOperator(const Token& token) 157 | { 158 | return isOperator(token) && !isUnaryOperator(token); 159 | } 160 | 161 | inline bool isComparisonOperator(const Token& token) 162 | { 163 | switch (token.type()) 164 | { 165 | case OPERATOR_AND_SCALAR: 166 | case OPERATOR_AND_VECTOR: 167 | case OPERATOR_OR_SCALAR: 168 | case OPERATOR_OR_VECTOR: 169 | case OPERATOR_EQUAL: 170 | case OPERATOR_NOT_EQUAL: 171 | case OPERATOR_LESS: 172 | case OPERATOR_LESS_OR_EQUAL: 173 | case OPERATOR_GREATER: 174 | case OPERATOR_GREATER_OR_EQUAL: 175 | return true; 176 | default: 177 | return false; 178 | } 179 | } 180 | 181 | inline bool isWhitespace(const Token& token) 182 | { 183 | return token.type() == WHITESPACE; 184 | } 185 | 186 | inline bool isComment(const Token& token) 187 | { 188 | return token.type() == COMMENT; 189 | } 190 | 191 | inline bool isSymbol(const Token& token) 192 | { 193 | return token.type() == SYMBOL; 194 | } 195 | 196 | inline bool isEnd(const Token& token) 197 | { 198 | return token.type() == END; 199 | } 200 | 201 | inline bool isString(const Token& token) 202 | { 203 | return token.type() == STRING; 204 | } 205 | 206 | inline bool isSymbolic(const Token& token) 207 | { 208 | static const TokenType mask = SYMBOL | NUMBER | STRING; 209 | return (token.type() & mask) != 0; 210 | } 211 | 212 | inline bool isNumeric(const Token& token) 213 | { 214 | return (token.type() & NUMBER) != 0; 215 | } 216 | 217 | inline bool isCallOperator(const Token& token) 218 | { 219 | return token.type() == LPAREN || 220 | token.type() == LBRACKET || 221 | token.type() == LDBRACKET; 222 | } 223 | 224 | inline bool isAssignmentOperator(const Token& token) 225 | { 226 | switch (token.type()) 227 | { 228 | case OPERATOR_ASSIGN_LEFT: 229 | case OPERATOR_ASSIGN_LEFT_COLON: 230 | case OPERATOR_ASSIGN_LEFT_EQUALS: 231 | case OPERATOR_ASSIGN_LEFT_PARENT: 232 | case OPERATOR_ASSIGN_RIGHT: 233 | case OPERATOR_ASSIGN_RIGHT_PARENT: 234 | return true; 235 | default: 236 | 
return false; 237 | } 238 | } 239 | 240 | namespace detail { 241 | 242 | inline bool isHexDigit(char c) 243 | { 244 | if (c >= '0' && c <= '9') 245 | return true; 246 | else if (c >= 'a' && c <= 'f') 247 | return true; 248 | else if (c >= 'A' && c <= 'F') 249 | return true; 250 | return false; 251 | } 252 | 253 | inline int hexValue(char c) 254 | { 255 | if (c >= '0' && c <= '9') 256 | return c - '0'; 257 | else if (c >= 'a' && c <= 'f') 258 | return c - 'a' + 10; 259 | else if (c >= 'A' && c <= 'F') 260 | return c - 'A' + 10; 261 | 262 | return 0; 263 | } 264 | 265 | // Parses an octal escape sequence, e.g. '\012'. 266 | inline bool parseOctal(const char*& it, char*& output) 267 | { 268 | // Check for opening escape 269 | if (*it != '\\') 270 | return false; 271 | 272 | // Check for number following 273 | char lookahead = *(it + 1); 274 | if (lookahead < '0' || lookahead > '7') 275 | return false; 276 | ++it; 277 | 278 | // Begin parsing. Consume up to three numbers. 279 | unsigned char result = 0; 280 | const char* end = it + 3; 281 | for (; it != end; ++it) 282 | { 283 | char ch = *it; 284 | if ('0' <= ch && ch <= '7') 285 | result = 8 * result + ch - '0'; 286 | else 287 | break; 288 | } 289 | 290 | // Assign result, and return. 291 | *output++ = result; 292 | return true; 293 | } 294 | 295 | // Parse a hex escape sequence, e.g. '\xFF'. 296 | inline bool parseHex(const char*& it, char*& output) 297 | { 298 | // Check for opening escape. 299 | if (*it != '\\') 300 | return false; 301 | 302 | if (*(it + 1) != 'x') 303 | return false; 304 | 305 | if (!isHexDigit(*(it + 2))) 306 | return false; 307 | 308 | // Begin parsing. 309 | it += 2; 310 | unsigned char value = 0; 311 | const char* end = it + 2; 312 | for (; it != end; ++it) 313 | { 314 | int result = hexValue(*it); 315 | if (result == 0) 316 | break; 317 | value = 16 * value + result; 318 | } 319 | 320 | *output++ = value; 321 | return true; 322 | } 323 | 324 | // Parse a unicode escape sequence. 
325 | inline bool parseUnicode(const char*& it, char*& output) 326 | { 327 | if (*it != '\\') 328 | return false; 329 | 330 | char lookahead = *(it + 1); 331 | int size; 332 | if (lookahead == 'u') 333 | size = 4; 334 | else if (lookahead == 'U') 335 | size = 8; 336 | else 337 | return false; 338 | 339 | // Clone the input iterator (only set it on success) 340 | const char* clone = it; 341 | clone += 2; 342 | 343 | // Check for e.g. '\u{...}' 344 | // ^ 345 | bool delimited = *clone == '{'; 346 | clone += delimited; 347 | 348 | // Check for a hex digit. 349 | if (!isHexDigit(*clone)) 350 | return false; 351 | 352 | // Begin parsing hex digits 353 | wchar_t value = 0; 354 | const char* end = clone + size; 355 | for (; clone != end; ++clone) 356 | { 357 | if (!isHexDigit(*clone)) 358 | break; 359 | 360 | int hex = hexValue(*clone); 361 | value = 16 * value + hex; 362 | } 363 | 364 | // Eat a closing '}' if we had a starting '{'. 365 | if (delimited) 366 | { 367 | if (*clone != '}') 368 | return false; 369 | ++clone; 370 | } 371 | 372 | std::mbstate_t state; 373 | std::memset(&state, 0, sizeof(state)); 374 | index_type bytes = std::wcrtomb(output, value, &state); 375 | if (bytes == static_cast(-1)) 376 | return false; 377 | 378 | // Update iterator state 379 | it = clone; 380 | output += bytes; 381 | return true; 382 | } 383 | 384 | } // namespace detail 385 | 386 | inline std::string stringValue(const char* begin, const char* end) 387 | { 388 | if (begin == end) 389 | return std::string(); 390 | 391 | index_type n = end - begin; 392 | scoped_array buffer(new char[n + 1]); 393 | 394 | const char* it = begin; 395 | char* output = buffer; 396 | 397 | while (it < end) 398 | { 399 | if (*it == '\\') 400 | { 401 | if (detail::parseOctal(it, output) || 402 | detail::parseHex(it, output) || 403 | detail::parseUnicode(it, output)) 404 | { 405 | continue; 406 | } 407 | 408 | // Handle the rest 409 | ++it; 410 | switch (*it) 411 | { 412 | case 'a': *output++ = '\a'; break; 413 
| case 'b': *output++ = '\b'; break; 414 | case 'f': *output++ = '\f'; break; 415 | case 'n': *output++ = '\n'; break; 416 | case 'r': *output++ = '\r'; break; 417 | case 't': *output++ = '\t'; break; 418 | case 'v': *output++ = '\v'; break; 419 | case '\\': *output++ = '\\'; break; 420 | default: *output++ = *it; break; 421 | } 422 | ++it; 423 | } 424 | else 425 | { 426 | *output++ = *it++; 427 | } 428 | } 429 | 430 | // Ensure null termination, just in case 431 | *output++ = '\0'; 432 | 433 | // Construct the result string and return 434 | std::string result(buffer, output - buffer); 435 | return result; 436 | } 437 | 438 | inline std::string stringValue(const Token& token) 439 | { 440 | switch (token.type()) 441 | { 442 | case STRING: 443 | return stringValue(token.begin() + 1, token.end() - 1); 444 | case SYMBOL: 445 | if (*token.begin() == '`') 446 | return stringValue(token.begin() + 1, token.end() - 1); 447 | default: 448 | return stringValue(token.begin(), token.end()); 449 | } 450 | } 451 | 452 | } // namespace tokens 453 | 454 | inline std::string toString(tokens::TokenType type) 455 | { 456 | using namespace tokens; 457 | 458 | if (type == INVALID) return "invalid"; 459 | else if (type == END) return "end"; 460 | else if (type == EMPTY) return "empty"; 461 | else if (type == MISSING) return "missing"; 462 | else if (type == SEMI) return "semi"; 463 | else if (type == COMMA) return "comma"; 464 | else if (type == SYMBOL) return "symbol"; 465 | else if (type == COMMENT) return "comment"; 466 | else if (type == WHITESPACE) return "whitespace"; 467 | else if (type == STRING) return "string"; 468 | else if (type == NUMBER) return "number"; 469 | 470 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_BRACKET_MASK)) 471 | return "bracket"; 472 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_KEYWORD_MASK)) 473 | return "keyword"; 474 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_OPERATOR_MASK)) 475 | return "operator"; 476 | 477 | return 
"unknown"; 478 | } 479 | 480 | inline std::string toString(const tokens::Token& token) 481 | { 482 | std::string contents; 483 | if (token.isType(tokens::END)) 484 | contents = ""; 485 | else if (token.isType(tokens::EMPTY)) 486 | contents = ""; 487 | else if (token.isType(tokens::MISSING)) 488 | contents = ""; 489 | else 490 | contents = token.contents(); 491 | 492 | static const int N = 1024; 493 | if (contents.size() > N / 2) 494 | contents = contents.substr(0, N / 2); 495 | char buff[N]; 496 | std::snprintf(buff, 497 | N, 498 | "[%4ld:%4ld]: %s", 499 | static_cast(token.row()), 500 | static_cast(token.column()), 501 | contents.c_str()); 502 | return buff; 503 | } 504 | 505 | inline std::ostream& operator<<(std::ostream& os, const tokens::Token& token) 506 | { 507 | return os << toString(token); 508 | } 509 | 510 | inline std::ostream& operator<<(std::ostream& os, const std::vector& tokens) 511 | { 512 | for (std::vector::const_iterator it = tokens.begin(); 513 | it != tokens.end(); 514 | ++it) 515 | { 516 | os << *it << std::endl; 517 | } 518 | 519 | return os; 520 | } 521 | 522 | } // namespace sourcetools 523 | 524 | #endif /* SOURCETOOLS_TOKENIZATION_TOKEN_H */ 525 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKENIZER_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace tokenizer { 14 | 15 | class Tokenizer 16 | { 17 | private: 18 | typedef tokens::Token Token; 19 | typedef cursors::TextCursor TextCursor; 20 | typedef tokens::TokenType TokenType; 21 | 22 | private: 23 | 24 | // Tokenization ---- 25 | 26 | void consumeToken(TokenType type, 27 | index_type length, 28 | Token* pToken) 29 | { 30 | *pToken = Token(cursor_, 
type, length); 31 | cursor_.advance(length); 32 | } 33 | 34 | template 35 | void consumeUntil(char ch, 36 | TokenType type, 37 | Token* pToken) 38 | { 39 | TextCursor lookahead = cursor_; 40 | 41 | bool success = false; 42 | index_type distance = 0; 43 | 44 | while (lookahead != lookahead.end()) { 45 | lookahead.advance(); 46 | ++distance; 47 | 48 | if (SkipEscaped && lookahead.peek() == '\\') { 49 | lookahead.advance(); 50 | ++distance; 51 | continue; 52 | } 53 | 54 | if (lookahead.peek() == ch) { 55 | success = true; 56 | break; 57 | } 58 | } 59 | 60 | if (success) { 61 | consumeToken(type, distance + 1, pToken); 62 | } else { 63 | consumeToken( 64 | InvalidOnError ? tokens::INVALID : type, 65 | distance, 66 | pToken 67 | ); 68 | } 69 | } 70 | 71 | void consumeUserOperator(Token* pToken) 72 | { 73 | consumeUntil('%', tokens::OPERATOR_USER, pToken); 74 | } 75 | 76 | void consumeComment(Token* pToken) 77 | { 78 | consumeUntil('\n', tokens::COMMENT, pToken); 79 | } 80 | 81 | void consumeQuotedSymbol(Token* pToken) 82 | { 83 | consumeUntil('`', tokens::SYMBOL, pToken); 84 | } 85 | 86 | void consumeQString(Token* pToken) 87 | { 88 | consumeUntil('\'', tokens::STRING, pToken); 89 | } 90 | 91 | void consumeQQString(Token* pToken) 92 | { 93 | consumeUntil('"', tokens::STRING, pToken); 94 | } 95 | 96 | void consumeRawString(Token* pToken) 97 | { 98 | // clone cursor 99 | TextCursor cursor = cursor_; 100 | 101 | // save current position 102 | index_type start = cursor.offset(); 103 | 104 | // consume a leading 'r' or 'R' 105 | char ch = cursor.peek(); 106 | bool ok = ch == 'r' || ch == 'R'; 107 | if (!ok) 108 | return consumeToken(tokens::INVALID, 1, pToken); 109 | cursor.advance(); 110 | 111 | // consume a quote, saving what we saw 112 | char quote; 113 | switch (cursor.peek()) 114 | { 115 | case '"': 116 | quote = '"'; 117 | break; 118 | case '\'': 119 | quote = '\''; 120 | break; 121 | default: 122 | return consumeToken(tokens::INVALID, 2, pToken); 123 | } 124 | 
cursor.advance(); 125 | 126 | // consume dashes, counting the number of dashes seen 127 | int dashes = 0; 128 | while (cursor.peek() == '-') 129 | { 130 | dashes += 1; 131 | cursor.advance(); 132 | } 133 | 134 | // consume the delimiter, saving what we saw 135 | char lhs; 136 | switch (cursor.peek()) 137 | { 138 | case '(': 139 | case '{': 140 | case '[': 141 | lhs = cursor.peek(); 142 | break; 143 | default: 144 | return consumeToken(tokens::INVALID, 145 | cursor.offset() - start + 1, 146 | pToken); 147 | } 148 | cursor.advance(); 149 | 150 | // compute complement for delimiter 151 | char rhs; 152 | switch (lhs) 153 | { 154 | case '(': rhs = ')'; break; 155 | case '{': rhs = '}'; break; 156 | case '[': rhs = ']'; break; 157 | default: 158 | return consumeToken(tokens::INVALID, 159 | cursor.offset() - start + 1, 160 | pToken); 161 | } 162 | 163 | // start consuming things until we find the closing delimiter 164 | for (; cursor.peek() != '\0'; cursor.advance()) 165 | { 166 | // check for right delimiter 167 | if (cursor.peek() != rhs) 168 | goto AGAIN; 169 | cursor.advance(); 170 | 171 | // consume dashes 172 | for (int i = 0; i < dashes; i++) 173 | { 174 | if (cursor.peek() != '-') 175 | goto AGAIN; 176 | cursor.advance(); 177 | } 178 | 179 | // check for matching quote 180 | if (cursor.peek() != quote) 181 | goto AGAIN; 182 | cursor.advance(); 183 | 184 | // if we got this far, we successfully matched the raw string 185 | return consumeToken( 186 | tokens::STRING, 187 | cursor.offset() - start, 188 | pToken 189 | ); 190 | 191 | // if we got here, we need to restart the loop 192 | AGAIN: ; 193 | } 194 | 195 | // if we got here, we failed to match 196 | return consumeToken( 197 | tokens::INVALID, 198 | cursor.offset() - start, 199 | pToken 200 | ); 201 | 202 | } 203 | 204 | bool isStartOfRawString(const TextCursor& cursor) 205 | { 206 | char ch = '\0'; 207 | 208 | // check for leading 'r' or 'R' 209 | ch = cursor.peek(0); 210 | bool ok = ch == 'r' || ch == 'R'; 211 
| if (!ok) 212 | return false; 213 | 214 | // check for quote 215 | ch = cursor.peek(1); 216 | return ch == '\'' || ch == '"'; 217 | } 218 | 219 | // NOTE: Don't tokenize '-' or '+' as part of number; instead 220 | // it's parsed as a unary operator. 221 | bool isStartOfNumber() 222 | { 223 | char ch = cursor_.peek(); 224 | if (utils::isDigit(ch)) 225 | return true; 226 | if (ch == '.') 227 | return utils::isDigit(cursor_.peek(1)); 228 | return false; 229 | } 230 | 231 | bool isStartOfSymbol() 232 | { 233 | return utils::isValidForStartOfRSymbol(cursor_.peek()); 234 | } 235 | 236 | bool consumeHexadecimalNumber(Token* pToken) 237 | { 238 | index_type distance = 0; 239 | 240 | // Detect the leading '0'. 241 | if (cursor_.peek(distance) != '0') 242 | return false; 243 | ++distance; 244 | 245 | // Detect a 'x' or 'X'. 246 | if (!(cursor_.peek(distance) == 'x' || cursor_.peek(distance) == 'X')) 247 | return false; 248 | ++distance; 249 | 250 | // Check and consume all alphanumeric characters. 251 | // The number is valid if the characters are valid 252 | // hexadecimal characters (0-9, a-f, A-F). The number 253 | // can also end with an 'i' (for an imaginary number) 254 | // or with an 'L' for an integer. 255 | if (!utils::isHexDigit(cursor_.peek(distance))) 256 | { 257 | consumeToken(tokens::INVALID, distance, pToken); 258 | return false; 259 | } 260 | 261 | bool success = true; 262 | char peek = cursor_.peek(distance); 263 | while (utils::isAlphaNumeric(peek) && peek != '\0') { 264 | 265 | // If we encounter an 'i' or an 'L', assume 266 | // that this ends the identifier. 267 | if (peek == 'i' || peek == 'L') 268 | { 269 | ++distance; 270 | break; 271 | } 272 | 273 | if (!utils::isHexDigit(peek)) 274 | success = false; 275 | 276 | ++distance; 277 | peek = cursor_.peek(distance); 278 | } 279 | 280 | consumeToken(success ? 
tokens::NUMBER : tokens::INVALID, distance, pToken); 281 | return true; 282 | } 283 | 284 | void consumeNumber(Token* pToken) 285 | { 286 | bool success = true; 287 | index_type distance = 0; 288 | 289 | // NOTE: A leading '-' or '+' is not consumed as part of 290 | // the number. 291 | 292 | // Try parsing this as a hexadecimal number first (e.g. '0xabc'). 293 | if (consumeHexadecimalNumber(pToken)) 294 | return; 295 | 296 | // Consume digits 297 | while (utils::isDigit(cursor_.peek(distance))) 298 | ++distance; 299 | 300 | // Consume a dot for decimals 301 | // Note: '.5' is a valid specification for a number 302 | // So is '100.'; ie, with a trailing decimal. 303 | if (cursor_.peek(distance) == '.') { 304 | ++distance; 305 | while (utils::isDigit(cursor_.peek(distance))) 306 | ++distance; 307 | } 308 | 309 | // Consume 'e', 'E' for exponential notation 310 | if (cursor_.peek(distance) == 'e' || cursor_.peek(distance) == 'E') { 311 | ++distance; 312 | 313 | // Consume a '-' or a '+' for a negative number 314 | if (cursor_.peek(distance) == '-' || cursor_.peek(distance) == '+') 315 | ++distance; 316 | 317 | // Parse another set of numbers following the E 318 | success = utils::isDigit(cursor_.peek(distance)); 319 | while (utils::isDigit(cursor_.peek(distance))) 320 | ++distance; 321 | 322 | // Consume '.' and following numbers. Note that this is 323 | // not really a valid number for R but it's better to tokenize 324 | // this is a single entity (and then report failure later) 325 | if (cursor_.peek(distance) == '.') { 326 | success = false; 327 | ++distance; 328 | while (utils::isDigit(cursor_.peek(distance))) 329 | ++distance; 330 | } 331 | } 332 | 333 | // Consume a final 'L' for integer literals, 334 | // or a final 'i' for complex numbers. 335 | if (cursor_.peek(distance) == 'L' || 336 | cursor_.peek(distance) == 'i') 337 | { 338 | ++distance; 339 | } 340 | 341 | consumeToken(success ? 
tokens::NUMBER : tokens::INVALID, distance, pToken); 342 | } 343 | 344 | void consumeSymbol(Token* pToken) 345 | { 346 | index_type distance = 1; 347 | char ch = cursor_.peek(distance); 348 | while (utils::isValidForRSymbol(ch)) { 349 | ++distance; 350 | ch = cursor_.peek(distance); 351 | } 352 | 353 | const char* ptr = &*(cursor_.begin() + cursor_.offset()); 354 | consumeToken(tokens::symbolType(ptr, distance), distance, pToken); 355 | } 356 | 357 | public: 358 | 359 | Tokenizer(const char* code, index_type n) 360 | : cursor_(code, n) 361 | { 362 | } 363 | 364 | bool tokenize(Token* pToken) 365 | { 366 | if (cursor_ >= cursor_.end()) 367 | { 368 | *pToken = Token(tokens::END); 369 | return false; 370 | } 371 | 372 | char ch = cursor_.peek(); 373 | int n = 0; 374 | 375 | // Block-related tokens 376 | if (ch == '{') 377 | consumeToken(tokens::LBRACE, 1, pToken); 378 | else if (ch == '}') 379 | consumeToken(tokens::RBRACE, 1, pToken); 380 | else if (ch == '(') 381 | consumeToken(tokens::LPAREN, 1, pToken); 382 | else if (ch == ')') 383 | consumeToken(tokens::RPAREN, 1, pToken); 384 | else if (ch == '[') { 385 | if (cursor_.peek(1) == '[') { 386 | tokenStack_.push(tokens::LDBRACKET); 387 | consumeToken(tokens::LDBRACKET, 2, pToken); 388 | } else { 389 | tokenStack_.push(tokens::LBRACKET); 390 | consumeToken(tokens::LBRACKET, 1, pToken); 391 | } 392 | } else if (ch == ']') { 393 | if (tokenStack_.empty()) { 394 | consumeToken(tokens::INVALID, 1, pToken); 395 | } else if (tokenStack_.top() == tokens::LDBRACKET) { 396 | tokenStack_.pop(); 397 | if (cursor_.peek(1) == ']') 398 | consumeToken(tokens::RDBRACKET, 2, pToken); 399 | else 400 | consumeToken(tokens::INVALID, 1, pToken); 401 | } else { 402 | tokenStack_.pop(); 403 | consumeToken(tokens::RBRACKET, 1, pToken); 404 | } 405 | } 406 | 407 | // Operators 408 | else if (ch == '<') // <<-, <=, <-, < 409 | { 410 | char next = cursor_.peek(1); 411 | if (next == '-') // <- 412 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT, 2, 
pToken); 413 | else if (next == '=') // <= 414 | consumeToken(tokens::OPERATOR_LESS_OR_EQUAL, 2, pToken); 415 | else if (next == '<' && cursor_.peek(2) == '-') 416 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT_PARENT, 3, pToken); 417 | else 418 | consumeToken(tokens::OPERATOR_LESS, 1, pToken); 419 | } 420 | 421 | else if (ch == '>') // >=, > 422 | { 423 | if (cursor_.peek(1) == '=') 424 | consumeToken(tokens::OPERATOR_GREATER_OR_EQUAL, 2, pToken); 425 | else 426 | consumeToken(tokens::OPERATOR_GREATER, 1, pToken); 427 | } 428 | else if (ch == '=') // '==', '=>', '=' 429 | { 430 | char next = cursor_.peek(1); 431 | if (next == '>') 432 | consumeToken(tokens::OPERATOR_PIPE_BIND, 2, pToken); 433 | else if (next == '=') 434 | consumeToken(tokens::OPERATOR_EQUAL, 2, pToken); 435 | else 436 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT_EQUALS, 1, pToken); 437 | } 438 | else if (ch == '|') // '||', '|>', '|' 439 | { 440 | char next = cursor_.peek(1); 441 | if (next == '>') 442 | consumeToken(tokens::OPERATOR_PIPE, 2, pToken); 443 | else if (next == '|') 444 | consumeToken(tokens::OPERATOR_OR_SCALAR, 2, pToken); 445 | else 446 | consumeToken(tokens::OPERATOR_OR_VECTOR, 1, pToken); 447 | } 448 | else if (ch == '&') // '&&', '&' 449 | { 450 | if (cursor_.peek(1) == '&') 451 | consumeToken(tokens::OPERATOR_AND_SCALAR, 2, pToken); 452 | else 453 | consumeToken(tokens::OPERATOR_AND_VECTOR, 1, pToken); 454 | } 455 | else if (ch == '*') // **, * 456 | { 457 | if (cursor_.peek(1) == '*') 458 | consumeToken(tokens::OPERATOR_EXPONENTATION_STARS, 2, pToken); 459 | else 460 | consumeToken(tokens::OPERATOR_MULTIPLY, 1, pToken); 461 | } 462 | else if (ch == ':') // ':::', '::', ':=', ':' 463 | { 464 | if (cursor_.peek(1) == ':') 465 | { 466 | if (cursor_.peek(2) == ':') 467 | consumeToken(tokens::OPERATOR_NAMESPACE_ALL, 3, pToken); 468 | else 469 | consumeToken(tokens::OPERATOR_NAMESPACE_EXPORTS, 2, pToken); 470 | } 471 | else if (cursor_.peek(1) == '=') 472 | 
consumeToken(tokens::OPERATOR_ASSIGN_LEFT_COLON, 2, pToken); 473 | else 474 | consumeToken(tokens::OPERATOR_SEQUENCE, 1, pToken); 475 | } 476 | else if (ch == '!') 477 | { 478 | if (cursor_.peek(1) == '=') 479 | consumeToken(tokens::OPERATOR_NOT_EQUAL, 2, pToken); 480 | else 481 | consumeToken(tokens::OPERATOR_NEGATION, 1, pToken); 482 | } 483 | else if (ch == '-') // '->>', '->', '-' 484 | { 485 | if (cursor_.peek(1) == '>') 486 | { 487 | if (cursor_.peek(2) == '>') 488 | consumeToken(tokens::OPERATOR_ASSIGN_RIGHT_PARENT, 3, pToken); 489 | else 490 | consumeToken(tokens::OPERATOR_ASSIGN_RIGHT, 2, pToken); 491 | } 492 | else 493 | consumeToken(tokens::OPERATOR_MINUS, 1, pToken); 494 | } 495 | else if (ch == '+') 496 | consumeToken(tokens::OPERATOR_PLUS, 1, pToken); 497 | else if (ch == '~') 498 | consumeToken(tokens::OPERATOR_FORMULA, 1, pToken); 499 | else if (ch == '?') 500 | consumeToken(tokens::OPERATOR_HELP, 1, pToken); 501 | else if (ch == '/') 502 | consumeToken(tokens::OPERATOR_DIVIDE, 1, pToken); 503 | else if (ch == '@') 504 | consumeToken(tokens::OPERATOR_AT, 1, pToken); 505 | else if (ch == '$') 506 | consumeToken(tokens::OPERATOR_DOLLAR, 1, pToken); 507 | else if (ch == '^') 508 | consumeToken(tokens::OPERATOR_HAT, 1, pToken); 509 | 510 | // User operators 511 | else if (ch == '%') 512 | consumeUserOperator(pToken); 513 | 514 | // Punctuation-related tokens 515 | else if (ch == ',') 516 | consumeToken(tokens::COMMA, 1, pToken); 517 | else if (ch == ';') 518 | consumeToken(tokens::SEMI, 1, pToken); 519 | 520 | // Whitespace 521 | else if (utils::countWhitespaceBytes(cursor_, &n)) 522 | consumeToken(tokens::WHITESPACE, n, pToken); 523 | 524 | // Strings and symbols 525 | else if (ch == '\'') 526 | consumeQString(pToken); 527 | else if (ch == '"') 528 | consumeQQString(pToken); 529 | else if (ch == '`') 530 | consumeQuotedSymbol(pToken); 531 | else if (isStartOfRawString(cursor_)) 532 | consumeRawString(pToken); 533 | 534 | // Comments 535 | else if (ch 
== '#') 536 | consumeComment(pToken); 537 | 538 | // Number 539 | else if (isStartOfNumber()) 540 | consumeNumber(pToken); 541 | 542 | // Symbol 543 | else if (isStartOfSymbol()) 544 | consumeSymbol(pToken); 545 | 546 | // Nothing matched -- error 547 | else 548 | consumeToken(tokens::INVALID, 1, pToken); 549 | 550 | return true; 551 | } 552 | 553 | Token peek(index_type lookahead = 1) 554 | { 555 | Tokenizer clone(*this); 556 | 557 | Token result(tokens::END); 558 | for (index_type i = 0; i < lookahead; ++i) { 559 | if (!clone.tokenize(&result)) { 560 | break; 561 | } 562 | } 563 | 564 | return result; 565 | } 566 | 567 | private: 568 | TextCursor cursor_; 569 | std::stack > tokenStack_; 570 | }; 571 | 572 | } // namespace tokenizer 573 | 574 | inline std::vector tokenize(const char* code, index_type n) 575 | { 576 | typedef tokenizer::Tokenizer Tokenizer; 577 | typedef tokens::Token Token; 578 | 579 | std::vector tokens; 580 | if (n == 0) 581 | return tokens; 582 | 583 | Token token; 584 | Tokenizer tokenizer(code, n); 585 | while (tokenizer.tokenize(&token)) 586 | tokens.push_back(token); 587 | 588 | return tokens; 589 | } 590 | 591 | inline std::vector tokenize(const std::string& code) 592 | { 593 | return tokenize(code.data(), code.size()); 594 | } 595 | 596 | } // namespace sourcetools 597 | 598 | #endif /* SOURCETOOLS_TOKENIZATION_TOKENIZER_H */ 599 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/tokenization.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKENIZATION_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKENIZATION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_TOKENIZATION_TOKENIZATION_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/utf8/utf8.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_UTF8_UTF8_H 2 | #define SOURCETOOLS_UTF8_UTF8_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace utf8 { 10 | 11 | namespace detail { 12 | static const unsigned char mask[] = { 13 | 0, // 00000000 14 | 0x7F, // 01111111 15 | 0x1F, // 00011111 16 | 0x0F, // 00001111 17 | 0x07, // 00000111 18 | 0x03, // 00000011 19 | 0x01 // 00000001 20 | }; 21 | } // namespace detail 22 | 23 | class iterator 24 | { 25 | public: 26 | iterator(const char* data) 27 | : data_(reinterpret_cast(data)), 28 | offset_(0) 29 | { 30 | } 31 | 32 | iterator(const iterator& other) 33 | : data_(other.data_), 34 | offset_(other.offset_) 35 | { 36 | } 37 | 38 | wchar_t operator*() 39 | { 40 | index_type n = size(); 41 | if (n == 0 || n > 6) 42 | return -1; 43 | 44 | const unsigned char* it = data_ + offset_; 45 | wchar_t ch = (*it++) & detail::mask[n]; 46 | for (index_type i = 1; i < n; ++i) 47 | { 48 | ch <<= 6; 49 | ch |= (*it++) & 0x3F; 50 | } 51 | 52 | return ch; 53 | } 54 | 55 | iterator& operator++() 56 | { 57 | offset_ += size(); 58 | return *this; 59 | } 60 | 61 | iterator operator++(int) 62 | { 63 | iterator copy(*this); 64 | operator++(); 65 | return copy; 66 | } 67 | 68 | bool operator==(const iterator& it) 69 | { 70 | return 71 | data_ + offset_ == 72 | it.data_ + it.offset_; 73 | } 74 | 75 | bool operator!=(const iterator& it) 76 | { 77 | return 78 | data_ + offset_ != 79 | it.data_ + it.offset_; 80 | } 81 | 82 | private: 83 | 84 | int size() 85 | { 86 | unsigned char ch = data_[offset_]; 87 | if (ch == 0) 88 | return 0; 89 | else if (ch < 192) 90 | return 1; 91 | else if (ch < 224) 92 | return 2; 93 | else if (ch < 240) 94 | return 3; 95 | else if (ch < 248) 96 | return 4; 97 | else if (ch < 252) 98 | return 5; 99 | else if (ch < 254) 100 | return 6; 101 | 102 | // TODO: on error? 
103 | return 1; 104 | } 105 | 106 | private: 107 | 108 | const unsigned char* data_; 109 | index_type offset_; 110 | }; 111 | 112 | } // namespace utf8 113 | } // namespace sourcetools 114 | 115 | #endif /* SOURCETOOLS_UTF8_UTF8_H */ 116 | -------------------------------------------------------------------------------- /inst/include/sourcetools/validation/SyntaxValidator.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H 2 | #define SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace sourcetools { 12 | namespace validators { 13 | 14 | class SyntaxError { 15 | private: 16 | typedef collections::Position Position; 17 | typedef cursors::TokenCursor TokenCursor; 18 | typedef tokens::Token Token; 19 | 20 | public: 21 | 22 | explicit SyntaxError(const Position& position, 23 | const std::string& message) 24 | : position_(position), 25 | message_(message) 26 | {} 27 | 28 | std::string report() 29 | { 30 | std::ostringstream os; 31 | os << "[" << position_.row << ":" << position_.column << "]: " 32 | << message_; 33 | 34 | return os.str(); 35 | } 36 | 37 | index_type row() const { return position_.row; } 38 | index_type column() const { return position_.column; } 39 | const Position& position() const { return position_; } 40 | const std::string& message() const { return message_; } 41 | 42 | private: 43 | Position position_; 44 | std::string message_; 45 | }; 46 | 47 | class SyntaxValidator { 48 | 49 | private: 50 | typedef tokens::Token Token; 51 | typedef cursors::TokenCursor TokenCursor; 52 | typedef tokens::TokenType TokenType; 53 | 54 | void unexpectedToken(const Token& token, const std::string& expected = std::string()) 55 | { 56 | std::string message = "unexpected token '" + token.contents() + "'"; 57 | if (!expected.empty()) 58 | message += " (expected '" + expected + "')"; 59 | 60 | 
errors_.push_back(SyntaxError(token.position(), message)); 61 | } 62 | 63 | void updateBracketStack(const Token& token, std::vector* pStack) 64 | { 65 | using namespace tokens; 66 | 67 | // Update brace state 68 | if (isLeftBracket(token)) { 69 | pStack->push_back(token.type()); 70 | } else if (isRightBracket(token)) { 71 | index_type size = pStack->size(); 72 | TokenType last = pStack->at(size - 1); 73 | if (size == 1) { 74 | unexpectedToken(token); 75 | } else { 76 | if (!isComplement(token.type(), last)) 77 | unexpectedToken(token, toString(complement(last))); 78 | pStack->pop_back(); 79 | } 80 | } 81 | } 82 | 83 | public: 84 | 85 | explicit SyntaxValidator(const std::vector& tokens) 86 | { 87 | if (tokens.empty()) 88 | return; 89 | 90 | TokenCursor cursor(tokens); 91 | std::vector stack; 92 | stack.push_back(tokens::INVALID); 93 | 94 | const Token* pThisToken = &(cursor.currentToken()); 95 | const Token* pPrevToken = pThisToken; 96 | 97 | while (cursor.moveToNextSignificantToken()) { 98 | 99 | pPrevToken = pThisToken; 100 | pThisToken = &(cursor.currentToken()); 101 | 102 | updateBracketStack(cursor.currentToken(), &stack); 103 | executeValidators(*pPrevToken, *pThisToken); 104 | 105 | } 106 | } 107 | 108 | const std::vector& errors() const { return errors_; } 109 | 110 | private: 111 | 112 | void executeValidators(const tokens::Token& prevToken, 113 | const tokens::Token& thisToken) 114 | { 115 | using namespace tokens; 116 | 117 | if (isOperator(prevToken)) { 118 | 119 | // Operator followed non-unary operator. 120 | if (isNonUnaryOperator(thisToken)) 121 | unexpectedToken(thisToken); 122 | 123 | // Operator (other than =) followed by any kind of right bracket. 124 | // We need to allow e.g. 'parse(text = )'. 125 | if (isRightBracket(thisToken) && !prevToken.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 126 | unexpectedToken(thisToken); 127 | 128 | // Operator followed by '[' or '[['. 
129 | if (thisToken.isType(tokens::LBRACKET) || 130 | thisToken.isType(tokens::LDBRACKET)) 131 | unexpectedToken(thisToken); 132 | } 133 | 134 | else if (isSymbolic(prevToken)) { 135 | 136 | // Two symbols on the same line. 137 | if (isSymbolic(thisToken) && prevToken.row() == thisToken.row()) 138 | unexpectedToken(thisToken); 139 | } 140 | 141 | } 142 | 143 | std::vector errors_; 144 | 145 | }; 146 | 147 | } // namespace validators 148 | } // namespace sourcetools 149 | 150 | #endif /* SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H */ 151 | -------------------------------------------------------------------------------- /inst/include/sourcetools/validation/validation.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_VALIDATION_VALIDATION_H 2 | #define SOURCETOOLS_VALIDATION_VALIDATION_H 3 | 4 | #include 5 | 6 | #endif /* SOURCETOOLS_VALIDATION_VALIDATION_H */ 7 | -------------------------------------------------------------------------------- /man/read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{read} 4 | \alias{read} 5 | \alias{read_lines} 6 | \alias{read_bytes} 7 | \alias{read_lines_bytes} 8 | \title{Read the Contents of a File} 9 | \usage{ 10 | read(path) 11 | 12 | read_lines(path) 13 | 14 | read_bytes(path) 15 | 16 | read_lines_bytes(path) 17 | } 18 | \arguments{ 19 | \item{path}{A file path.} 20 | } 21 | \description{ 22 | Read the contents of a file into a string (or, in the case of 23 | \code{read_lines}, a vector of strings). 
24 | } 25 | -------------------------------------------------------------------------------- /man/register_routines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/register.R 3 | \name{register_routines} 4 | \alias{register_routines} 5 | \title{Register Native Routines} 6 | \usage{ 7 | register_routines(package = ".", prefix = "C_", dynamic.symbols = FALSE) 8 | } 9 | \arguments{ 10 | \item{package}{The path to an \R package.} 11 | 12 | \item{prefix}{The prefix to assign to the \R objects 13 | generated that map to each routine.} 14 | 15 | \item{dynamic.symbols}{Boolean; should dynamic symbol lookup 16 | be enabled?} 17 | } 18 | \description{ 19 | Discover and register native routines in a package. 20 | Functions to be registered should be prefixed with the 21 | `// [[export()]]` attribute. 22 | } 23 | -------------------------------------------------------------------------------- /man/tokenize-methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{tokenize_file} 4 | \alias{tokenize_file} 5 | \alias{tokenize_string} 6 | \alias{tokenize} 7 | \title{Tokenize R Code} 8 | \usage{ 9 | tokenize_file(path) 10 | 11 | tokenize_string(string) 12 | 13 | tokenize(file = "", text = NULL) 14 | } 15 | \arguments{ 16 | \item{file, path}{A file path.} 17 | 18 | \item{text, string}{\R code as a character vector of length one.} 19 | } 20 | \value{ 21 | A \code{data.frame} with the following columns: 22 | 23 | \tabular{ll}{ 24 | \code{value} \tab The token's contents, as a string. \cr 25 | \code{row} \tab The row where the token is located. \cr 26 | \code{column} \tab The column where the token is located. \cr 27 | \code{type} \tab The token type, as a string. 
\cr 28 | } 29 | } 30 | \description{ 31 | Tools for tokenizing \R code. 32 | } 33 | \note{ 34 | Line numbers are determined by existence of the \code{\\n} 35 | line feed character, under the assumption that code being tokenized 36 | will use either \code{\\n} to indicate newlines (as on modern 37 | Unix systems), or \code{\\r\\n} as on Windows. 38 | } 39 | \examples{ 40 | tokenize_string("x <- 1 + 2") 41 | } 42 | -------------------------------------------------------------------------------- /man/validate_syntax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{validate_syntax} 4 | \alias{validate_syntax} 5 | \title{Find Syntax Errors} 6 | \usage{ 7 | validate_syntax(string) 8 | } 9 | \arguments{ 10 | \item{string}{A character vector (of length one).} 11 | } 12 | \description{ 13 | Find syntax errors in a string of \R code. 14 | } 15 | -------------------------------------------------------------------------------- /notes/notes-tdop.R: -------------------------------------------------------------------------------- 1 | ## An adaptation of http://effbot.org/zone/simple-top-down-parsing.htm 2 | ## for R. We examine a super simple language (calculator) 3 | ## that consists only of '+', '*', and single-digit numbers 4 | ## (no whitespace). For example, "1+2*3+4" is a valid 5 | ## program, doing what you expect. More comments are 6 | ## included inline to help make sense of what's going on. 7 | ## 8 | ## While simple, this example showcases the main points 9 | ## needed to understand top-down operator precedence 10 | ## parsing. 11 | 12 | ## First, some simple utility functions.
13 | 14 | # Check whether a token is an operator (ie, '+' or '*') 15 | is_operator <- function(token) { 16 | token == "+" || token == "*" 17 | } 18 | 19 | # Single-digit numbers 20 | is_number <- function(token) { 21 | token %in% as.character(0:9) 22 | } 23 | 24 | # We 'tokenize' a program (string of code) just by splitting 25 | # it. So "1+2" becomes c("1", "+", "2"). Obviously a 'real' 26 | # tokenizer would tokenize incrementally and separate words 27 | # etc. but tokenization is not the interesting part of this 28 | # example, so we just keep it simple. 29 | tokenize <- function(program) { 30 | strsplit(program, "", fixed = TRUE)[[1]] 31 | } 32 | 33 | # A simple tokenizer 'class' that accepts a program, 34 | # tokenizes it, and returns a method that accesses the next 35 | # token (if available). Elements postfixed with '_' are 36 | # 'private' (hidden in the closure); access to them is made 37 | # available through 'public' functions exported as part of 38 | # the list object. 39 | Tokenizer <- function(program) { 40 | tokens_ <- tokenize(program) 41 | index_ <- 0 42 | n_ <- length(tokens_) 43 | list( 44 | tokenize = function() { 45 | index_ <<- index_ + 1 46 | if (index_ <= n_) 47 | tokens_[[index_]] 48 | else 49 | "" 50 | } 51 | ) 52 | } 53 | 54 | # Our 'Parser' class will be used to construct 55 | # our parse tree (an AST). 56 | Parser <- function(tokenizer) { 57 | 58 | tokenizer_ <- tokenizer 59 | 60 | # We save a lookahead token, to help inform what action we 61 | # should take as we parse. It needs to exist as a private 62 | # variable so that the various recursing functions see the 63 | # correct 'state' of the program. 64 | lookahead_ <- tokenizer_$tokenize() 65 | 66 | # A hacky helper function for printing debug output when 67 | # running our parser. Don't worry too much about this. 
68 | indent <- function() { 69 | paste( 70 | paste(character(length(sys.calls())), collapse = "-"), 71 | "-> ", 72 | sep = "" 73 | ) 74 | } 75 | 76 | # The left-binding precedence for a token. The important 77 | # thing is that '*' has a higher precedence than '+'. This 78 | # function either receives operators, or a special 'end of 79 | # line' token, implying that there is nothing left to 80 | # parse. We give it a left-binding precedence of 0, to 81 | # indicate that parsing should end now. 82 | precedence <- function(token) { 83 | if (token == "+") 84 | 10 85 | else if (token == "*") 86 | 20 87 | else if (token == "") 88 | 0 89 | else 90 | stop("unexpected token '", token, "'; expected operator or end-of-parse") 91 | } 92 | 93 | # Handling of 'null denotation' tokens. This is for tokens 94 | # that are discovered at the start of an expression; ie, 95 | # unary operators, or regular old numbers. Note how for 96 | # the '+' operator, we simply construct a single-element 97 | # node with "+" on the left-hand side, and the new 98 | # expression on the right-hand side. 99 | parsePrefixExpression <- function(token) { 100 | if (token == "+") 101 | call(token, parseTopLevelExpression(100)) 102 | else if (is_number(token)) 103 | as.numeric(token) 104 | else 105 | stop("unexpected token '", token, "'") 106 | } 107 | 108 | # 'led', for 'left denotation', is used when a token 109 | # appears within a construct (ie, when a binary operator 110 | # is encountered). This function will be called once a 111 | # binary operator is encountered, with 'lhs' being that 112 | # operator, and 'rhs' being the current parse tree. Each 113 | # call to 'led' constructs a new node, with our 'lhs' 114 | # operator as the parent, the current parse tree ('rhs') 115 | # as the left child, and the next part of the expression 116 | # as the right child. 
117 | parseInfixExpression <- function(lhs, rhs) { 118 | if (!is_operator(lhs)) 119 | stop("unexpected token '", lhs, "'; expecting an operator") 120 | call(lhs, rhs, parseTopLevelExpression(precedence(lhs))) 121 | } 122 | 123 | # This is the entry-point that parses a whole expression. 124 | parseTopLevelExpression <- function(rbp = 0) { 125 | 126 | # Save the current token in 'token', and advance to the 127 | # next token. 128 | # 129 | # Why do we need to save the token in a 'global' 130 | # variable? When the various parse recursions end, we 131 | # need to make sure those routines are seeing the 132 | # current state, rather than their own state. 133 | token <- lookahead_ 134 | lookahead_ <<- tokenizer_$tokenize() 135 | 136 | # Parse the 'null denotation' expression. This 137 | # represents tokens that are discovered at the beginning 138 | # of an expression. We expect this to handle both unary 139 | # operators (wherein 'nud' will recurse until 140 | # discovering a non-operator token), and numeric tokens 141 | # (which end the recursion). 142 | cat(indent(), "lhs <- parsePrefixExpression(", format(token), ")\n", sep = "") 143 | node <- parsePrefixExpression(token) 144 | 145 | # Now, we need to construct the right-hand side of this 146 | # expression. The 'lbp' tells us whether we can continue 147 | # 'joining' expressions into the current parse tree. 148 | # TODO: make this more clear 149 | while (rbp < precedence(lookahead_)) { 150 | 151 | # Save the current token, and get the next token. 152 | token <- lookahead_ 153 | lookahead_ <<- tokenizer_$tokenize() # NOTE(review): fixed — was 'tokenizer$tokenize()', which reached past the closure to a same-named global that only exists in the demo script below; use the captured 'tokenizer_' as on line 134 154 | 155 | # Construct a new 'node' for our tree. Notice how we 156 | # 'grow' the left-hand side here. 157 | cat(indent(), "lhs <- parseInfixExpression(", format(token), ", ", format(node), ")\n", sep = "") 158 | node <- parseInfixExpression(token, node) 159 | } 160 | 161 | # Return our parse tree.
162 | node 163 | } 164 | 165 | list(parse = parseTopLevelExpression) 166 | } 167 | 168 | # Let's test it out! 169 | program <- "1+2*3*4+5" 170 | tokens <- tokenize(program) 171 | tokenizer <- Tokenizer(program) 172 | parser <- Parser(tokenizer) 173 | expr <- parser$parse() 174 | stopifnot(eval(parse(text = program)) == eval(expr)) 175 | 176 | ## Other materials: 177 | ## 178 | ## http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/ 179 | ## http://eli.thegreenplace.net/2010/01/02/top-down-operator-precedence-parsing 180 | ## 181 | -------------------------------------------------------------------------------- /sourcetools.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -I../inst/include 2 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -I../inst/include 2 | -------------------------------------------------------------------------------- /src/NSE.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace sourcetools; 3 | using namespace sourcetools::r; 4 | 5 | extern "C" SEXP 
sourcetools_performs_nse(SEXP fnSEXP) 6 | { 7 | if (TYPEOF(fnSEXP) == VECSXP || TYPEOF(fnSEXP) == EXPRSXP) 8 | { 9 | Protect protect; 10 | index_type n = Rf_length(fnSEXP); 11 | SEXP resultSEXP = protect(Rf_allocVector(LGLSXP, n)); 12 | for (index_type i = 0; i < n; ++i) 13 | { 14 | SEXP elSEXP = VECTOR_ELT(fnSEXP, i); 15 | LOGICAL(resultSEXP)[i] = Rf_isFunction(elSEXP) 16 | ? nse::performsNonStandardEvaluation(elSEXP) 17 | : 0; 18 | } 19 | return resultSEXP; 20 | } 21 | 22 | bool result = Rf_isFunction(fnSEXP) 23 | ? nse::performsNonStandardEvaluation(fnSEXP) 24 | : false; 25 | 26 | return Rf_ScalarLogical(result); 27 | } 28 | -------------------------------------------------------------------------------- /src/Parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define R_NO_REMAP 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | 9 | void log(parser::ParseNode* pNode, int depth) 10 | { 11 | if (!pNode) 12 | return; 13 | 14 | for (int i = 0; i < depth; ++i) 15 | Rprintf(" "); 16 | 17 | Rprintf("%s\n", toString(pNode->token()).c_str()); 18 | 19 | using parser::ParseNode; 20 | const std::vector& children = pNode->children(); 21 | for (std::vector::const_iterator it = children.begin(); 22 | it != children.end(); 23 | ++it) 24 | { 25 | log(*it, depth + 1); 26 | } 27 | } 28 | 29 | namespace { 30 | 31 | class SEXPConverter 32 | { 33 | private: 34 | typedef parser::ParseNode ParseNode; 35 | 36 | static SEXP asKeywordSEXP(const tokens::Token& token) 37 | { 38 | using namespace tokens; 39 | 40 | switch (token.type()) 41 | { 42 | case KEYWORD_FALSE: return Rf_ScalarLogical(0); 43 | case KEYWORD_TRUE: return Rf_ScalarLogical(1); 44 | case KEYWORD_Inf: return Rf_ScalarReal(R_PosInf); 45 | case KEYWORD_NA: return Rf_ScalarLogical(NA_LOGICAL); 46 | case KEYWORD_NA_character_: return Rf_ScalarString(NA_STRING); 47 | // case KEYWORD_NA_complex_: return NA_COM 48 | case KEYWORD_NA_integer_: return 
Rf_ScalarInteger(NA_INTEGER); 49 | case KEYWORD_NA_real_: return Rf_ScalarReal(NA_REAL); 50 | case KEYWORD_NaN: return Rf_ScalarReal(R_NaN); 51 | case KEYWORD_NULL: return R_NilValue; 52 | default: return Rf_install(token.contents().c_str()); 53 | } 54 | } 55 | 56 | static SEXP asFunctionCallSEXP(const ParseNode* pNode) 57 | { 58 | using namespace tokens; 59 | 60 | const Token& token = pNode->token(); 61 | 62 | // Figure out the 'head' of this language object. 63 | // '[' and '[[' get these tokens as-is, while '(' 64 | // instead uses the name of the first child. 65 | SEXP langSEXP; 66 | if (token.isType(LBRACKET)) 67 | langSEXP = Rf_lang1(Rf_install("[")); 68 | else if (token.isType(LDBRACKET)) 69 | langSEXP = Rf_lang1(Rf_install("[[")); 70 | else 71 | langSEXP = Rf_lang1(R_NilValue); 72 | 73 | // Start appending the child nodes to our list. 74 | r::Protect protect; 75 | SEXP headSEXP = protect(langSEXP); 76 | for (std::vector::const_iterator it = pNode->children().begin(); 77 | it != pNode->children().end(); 78 | ++it) 79 | { 80 | const ParseNode* node = *it; 81 | const Token& token = node->token(); 82 | if (token.isType(EMPTY)) 83 | break; 84 | else if (token.isType(MISSING)) 85 | SETCDR(langSEXP, Rf_lang1(R_MissingArg)); 86 | 87 | else if (token.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 88 | { 89 | const ParseNode* lhs = node->children()[0]; 90 | const ParseNode* rhs = node->children()[1]; 91 | 92 | if (rhs->token().isType(MISSING)) 93 | SETCDR(langSEXP, Rf_lang1(R_MissingArg)); 94 | else 95 | SETCDR(langSEXP, Rf_lang1(asSEXP(rhs))); 96 | 97 | const Token& token = lhs->token(); 98 | SEXP nameSEXP = Rf_install(tokens::stringValue(token).c_str()); 99 | SET_TAG(CDR(langSEXP), nameSEXP); 100 | } 101 | else 102 | { 103 | SETCDR(langSEXP, Rf_lang1(asSEXP(node))); 104 | } 105 | 106 | langSEXP = CDR(langSEXP); 107 | } 108 | 109 | SEXP resultSEXP = CAR(headSEXP) == R_NilValue 110 | ? 
CDR(headSEXP) 111 | : headSEXP; 112 | 113 | // Convert strings to symbols at head position 114 | if (TYPEOF(CAR(resultSEXP)) == STRSXP) 115 | SETCAR(resultSEXP, Rf_install(CHAR(STRING_ELT(CAR(resultSEXP), 0)))); 116 | 117 | return resultSEXP; 118 | } 119 | 120 | static SEXP asFunctionArgumentListSEXP(const ParseNode* pNode) 121 | { 122 | index_type n = pNode->children().size(); 123 | if (n == 0) 124 | return R_NilValue; 125 | 126 | r::Protect protect; 127 | SEXP listSEXP = protect(Rf_allocList(n)); 128 | SEXP headSEXP = listSEXP; 129 | for (std::vector::const_iterator it = pNode->children().begin(); 130 | it != pNode->children().end(); 131 | ++it) 132 | { 133 | const ParseNode* pChild = *it; 134 | const tokens::Token& token = pChild->token(); 135 | 136 | if (tokens::isOperator(token)) 137 | { 138 | const ParseNode* pLhs = pChild->children()[0]; 139 | const ParseNode* pRhs = pChild->children()[1]; 140 | 141 | if (pLhs->token().isType(tokens::SYMBOL)) 142 | SET_TAG(headSEXP, Rf_install(tokens::stringValue(pLhs->token()).c_str())); 143 | SETCAR(headSEXP, asSEXP(pRhs)); 144 | } 145 | else if (token.isType(tokens::SYMBOL)) 146 | { 147 | SETCAR(headSEXP, R_MissingArg); 148 | SET_TAG(headSEXP, Rf_install(tokens::stringValue(token).c_str())); 149 | } 150 | 151 | headSEXP = CDR(headSEXP); 152 | } 153 | 154 | return listSEXP; 155 | } 156 | 157 | static SEXP asFunctionDeclSEXP(const ParseNode* pNode) 158 | { 159 | if (pNode->children().size() != 2) 160 | return R_NilValue; 161 | 162 | r::Protect protect; 163 | SEXP argsSEXP = protect(asFunctionArgumentListSEXP(pNode->children()[0])); 164 | SEXP bodySEXP = protect(asSEXP(pNode->children()[1])); 165 | SEXP fnSEXP = Rf_install("function"); 166 | SEXP resultSEXP = Rf_lang4(fnSEXP, argsSEXP, bodySEXP, R_NilValue); 167 | return resultSEXP; 168 | } 169 | 170 | static SEXP asNumericSEXP(const tokens::Token& token) 171 | { 172 | if (*(token.end() - 1) == 'L') 173 | return Rf_ScalarInteger(::atof(token.begin())); 174 | else 175 | 
return Rf_ScalarReal(::atof(token.begin())); 176 | } 177 | 178 | static bool isFunctionCall(const ParseNode* pNode) 179 | { 180 | const tokens::Token& token = pNode->token(); 181 | if (token.isType(tokens::LBRACKET) || token.isType(tokens::LDBRACKET)) 182 | return true; 183 | 184 | // Differentiate between '(a)' and 'a()'. 185 | if (token.isType(tokens::LPAREN)) 186 | return pNode->children().size() > 1; 187 | 188 | return false; 189 | } 190 | 191 | public: 192 | static SEXP asSEXP(const ParseNode* pNode) 193 | { 194 | using namespace tokens; 195 | 196 | if (!pNode) 197 | return R_NilValue; 198 | 199 | if (pNode->token().isType(tokens::ROOT)) 200 | { 201 | const std::vector& children = pNode->children(); 202 | index_type n = pNode->children().size(); 203 | r::Protect protect; 204 | SEXP exprSEXP = protect(Rf_allocVector(EXPRSXP, n)); 205 | for (index_type i = 0; i < n; ++i) 206 | SET_VECTOR_ELT(exprSEXP, i, asSEXP(children[i])); 207 | return exprSEXP; 208 | } 209 | 210 | // Handle function calls specially 211 | if (isFunctionCall(pNode)) 212 | return asFunctionCallSEXP(pNode); 213 | 214 | const tokens::Token& token = pNode->token(); 215 | if (token.isType(KEYWORD_FUNCTION)) 216 | return asFunctionDeclSEXP(pNode); 217 | 218 | SEXP elSEXP; 219 | r::Protect protect; 220 | if (token.isType(MISSING)) 221 | elSEXP = R_MissingArg; 222 | else if (token.isType(OPERATOR_EXPONENTATION_STARS)) 223 | elSEXP = Rf_install("^"); 224 | else if (token.isType(KEYWORD_BREAK)) 225 | elSEXP = Rf_lang1(Rf_install("break")); 226 | else if (token.isType(KEYWORD_NEXT)) 227 | elSEXP = Rf_lang1(Rf_install("next")); 228 | else if (isKeyword(token)) 229 | elSEXP = asKeywordSEXP(token); 230 | else if (isOperator(token) || isLeftBracket(token)) 231 | elSEXP = Rf_install(token.contents().c_str()); 232 | else if (isNumeric(token)) 233 | elSEXP = asNumericSEXP(token); 234 | else if (isSymbol(token)) 235 | elSEXP = Rf_install(tokens::stringValue(token).c_str()); 236 | else if (isString(token)) 237 | 
elSEXP = Rf_mkString(tokens::stringValue(token).c_str()); 238 | else 239 | elSEXP = Rf_mkString(token.contents().c_str()); 240 | 241 | if (pNode->children().empty()) 242 | return elSEXP; 243 | 244 | SEXP headSEXP = protect(Rf_lang1(protect(elSEXP))); 245 | SEXP listSEXP = headSEXP; 246 | for (std::vector::const_iterator it = pNode->children().begin(); 247 | it != pNode->children().end(); 248 | ++it) 249 | { 250 | const ParseNode* child = *it; 251 | if (!child->token().isType(EMPTY)) 252 | listSEXP = SETCDR(listSEXP, Rf_lang1(asSEXP(child))); 253 | } 254 | 255 | return headSEXP; 256 | } 257 | 258 | static SEXP asSEXP(const std::vector& expression) 259 | { 260 | index_type n = expression.size(); 261 | r::Protect protect; 262 | SEXP exprSEXP = protect(Rf_allocVector(EXPRSXP, n)); 263 | for (index_type i = 0; i < n; ++i) 264 | SET_VECTOR_ELT(exprSEXP, i, asSEXP(expression[i])); 265 | return exprSEXP; 266 | } 267 | 268 | }; 269 | 270 | void reportErrors(const std::vector& errors) 271 | { 272 | if (errors.empty()) 273 | return; 274 | 275 | std::stringstream ss; 276 | ss << "\n "; 277 | typedef std::vector::const_iterator Iterator; 278 | for (Iterator it = errors.begin(); 279 | it != errors.end(); 280 | ++it) 281 | { 282 | ss << "[" << it->start().row << ":" << it->start().column << "]: " 283 | << it->message() << std::endl << " "; 284 | } 285 | 286 | std::string message = ss.str(); 287 | Rf_warning("%s", message.c_str()); 288 | } 289 | 290 | } // anonymous namespace 291 | } // namespace sourcetools 292 | 293 | extern "C" SEXP sourcetools_parse_string(SEXP programSEXP) 294 | { 295 | using namespace sourcetools; 296 | using parser::ParseStatus; 297 | using parser::Parser; 298 | using parser::ParseNode; 299 | 300 | SEXP charSEXP = STRING_ELT(programSEXP, 0); 301 | Parser parser(CHAR(charSEXP), Rf_length(charSEXP)); 302 | 303 | ParseStatus status; 304 | scoped_ptr pRoot(parser.parse(&status)); 305 | 306 | sourcetools::reportErrors(status.getErrors()); 307 | 308 | return 
sourcetools::SEXPConverter::asSEXP(pRoot); 309 | } 310 | 311 | extern "C" SEXP sourcetools_diagnose_string(SEXP strSEXP) 312 | { 313 | using namespace sourcetools; 314 | using parser::Parser; 315 | using parser::ParseStatus; 316 | using parser::ParseNode; 317 | using r::Protect; 318 | 319 | SEXP charSEXP = STRING_ELT(strSEXP, 0); 320 | Parser parser(CHAR(charSEXP), Rf_length(charSEXP)); 321 | 322 | ParseStatus status; 323 | scoped_ptr pNode(parser.parse(&status)); 324 | 325 | using namespace diagnostics; 326 | scoped_ptr pDiagnostics(createDefaultDiagnosticsSet()); 327 | std::vector diagnostics = pDiagnostics->run(pNode); 328 | return r::create(diagnostics); 329 | } 330 | -------------------------------------------------------------------------------- /src/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #define R_NO_REMAP 7 | #include 8 | #include 9 | 10 | extern "C" SEXP sourcetools_read(SEXP absolutePathSEXP) 11 | { 12 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 13 | 14 | std::string contents; 15 | bool result = sourcetools::read(absolutePath, &contents); 16 | if (!result) 17 | { 18 | Rf_warning("Failed to read file"); 19 | return R_NilValue; 20 | } 21 | 22 | sourcetools::r::Protect protect; 23 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1)); 24 | SET_STRING_ELT(resultSEXP, 0, sourcetools::r::createChar(contents)); 25 | return resultSEXP; 26 | } 27 | 28 | extern "C" SEXP sourcetools_read_lines(SEXP absolutePathSEXP) 29 | { 30 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 31 | 32 | std::vector lines; 33 | bool result = sourcetools::read_lines(absolutePath, &lines); 34 | if (!result) 35 | { 36 | Rf_warning("Failed to read file"); 37 | return R_NilValue; 38 | } 39 | 40 | sourcetools::index_type n = lines.size(); 41 | sourcetools::r::Protect protect; 42 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n)); 43 | for 
(sourcetools::index_type i = 0; i < n; ++i) 44 | { 45 | SEXP charSEXP = sourcetools::r::createChar(lines[i]); 46 | SET_STRING_ELT(resultSEXP, i, charSEXP); 47 | } 48 | return resultSEXP; 49 | } 50 | 51 | extern "C" SEXP sourcetools_read_bytes(SEXP absolutePathSEXP) 52 | { 53 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 54 | 55 | std::string contents; 56 | bool result = sourcetools::read(absolutePath, &contents); 57 | if (!result) 58 | { 59 | Rf_warning("Failed to read file"); 60 | return R_NilValue; 61 | } 62 | 63 | sourcetools::r::Protect protect; 64 | SEXP resultSEXP = protect(Rf_allocVector(RAWSXP, contents.size())); 65 | std::memcpy(RAW(resultSEXP), contents.c_str(), contents.size()); 66 | return resultSEXP; 67 | } 68 | 69 | extern "C" SEXP sourcetools_read_lines_bytes(SEXP absolutePathSEXP) 70 | { 71 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 72 | 73 | std::vector lines; 74 | bool result = sourcetools::read_lines(absolutePath, &lines); 75 | if (!result) 76 | { 77 | Rf_warning("Failed to read file"); 78 | return R_NilValue; 79 | } 80 | 81 | sourcetools::index_type n = lines.size(); 82 | sourcetools::r::Protect protect; 83 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n)); 84 | for (sourcetools::index_type i = 0; i < n; ++i) 85 | { 86 | SEXP rawSEXP = Rf_allocVector(RAWSXP, lines[i].size()); 87 | std::memcpy(RAW(rawSEXP), lines[i].c_str(), lines[i].size()); 88 | SET_VECTOR_ELT(resultSEXP, i, rawSEXP); 89 | } 90 | return resultSEXP; 91 | } 92 | -------------------------------------------------------------------------------- /src/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define R_NO_REMAP 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace { 9 | 10 | void asDataFrame(SEXP listSEXP, int n) 11 | { 12 | r::Protect protect; 13 | SEXP classSEXP = protect(Rf_mkString("data.frame")); 14 | Rf_setAttrib(listSEXP, 
R_ClassSymbol, classSEXP); 15 | 16 | SEXP rownamesSEXP = protect(Rf_allocVector(INTSXP, 2)); 17 | INTEGER(rownamesSEXP)[0] = NA_INTEGER; 18 | INTEGER(rownamesSEXP)[1] = -n; 19 | Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP); 20 | } 21 | 22 | SEXP asSEXP(const std::vector& tokens) 23 | { 24 | r::Protect protect; 25 | index_type n = tokens.size(); 26 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, 4)); 27 | 28 | // Set vector elements 29 | SEXP valueSEXP = protect(Rf_allocVector(STRSXP, n)); 30 | SET_VECTOR_ELT(resultSEXP, 0, valueSEXP); 31 | for (index_type i = 0; i < n; ++i) { 32 | const std::string& contents = tokens[i].contents(); 33 | SET_STRING_ELT(valueSEXP, i, r::createChar(contents)); 34 | } 35 | 36 | SEXP rowSEXP = protect(Rf_allocVector(INTSXP, n)); 37 | SET_VECTOR_ELT(resultSEXP, 1, rowSEXP); 38 | for (index_type i = 0; i < n; ++i) 39 | INTEGER(rowSEXP)[i] = tokens[i].row() + 1; 40 | 41 | SEXP columnSEXP = protect(Rf_allocVector(INTSXP, n)); 42 | SET_VECTOR_ELT(resultSEXP, 2, columnSEXP); 43 | for (index_type i = 0; i < n; ++i) 44 | INTEGER(columnSEXP)[i] = tokens[i].column() + 1; 45 | 46 | SEXP typeSEXP = protect(Rf_allocVector(STRSXP, n)); 47 | SET_VECTOR_ELT(resultSEXP, 3, typeSEXP); 48 | for (index_type i = 0; i < n; ++i) { 49 | const std::string& type = toString(tokens[i].type()); 50 | SET_STRING_ELT(typeSEXP, i, r::createChar(type)); 51 | } 52 | 53 | // Set names 54 | SEXP namesSEXP = protect(Rf_allocVector(STRSXP, 4)); 55 | 56 | SET_STRING_ELT(namesSEXP, 0, Rf_mkChar("value")); 57 | SET_STRING_ELT(namesSEXP, 1, Rf_mkChar("row")); 58 | SET_STRING_ELT(namesSEXP, 2, Rf_mkChar("column")); 59 | SET_STRING_ELT(namesSEXP, 3, Rf_mkChar("type")); 60 | 61 | Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP); 62 | 63 | asDataFrame(resultSEXP, n); 64 | 65 | return resultSEXP; 66 | } 67 | 68 | } // anonymous namespace 69 | } // namespace sourcetools 70 | 71 | extern "C" SEXP sourcetools_tokenize_file(SEXP absolutePathSEXP) 72 | { 73 | typedef 
sourcetools::tokens::Token Token; 74 | 75 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 76 | std::string contents; 77 | if (!sourcetools::read(absolutePath, &contents)) 78 | { 79 | Rf_warning("Failed to read file"); 80 | return R_NilValue; 81 | } 82 | 83 | if (contents.empty()) return R_NilValue; 84 | const std::vector& tokens = sourcetools::tokenize(contents); 85 | return sourcetools::asSEXP(tokens); 86 | } 87 | 88 | extern "C" SEXP sourcetools_tokenize_string(SEXP stringSEXP) 89 | { 90 | typedef sourcetools::tokens::Token Token; 91 | 92 | if (Rf_length(stringSEXP) == 0) 93 | return sourcetools::asSEXP(std::vector()); 94 | 95 | SEXP charSEXP = STRING_ELT(stringSEXP, 0); 96 | const std::vector& tokens = 97 | sourcetools::tokenize(CHAR(charSEXP), Rf_length(charSEXP)); 98 | return sourcetools::asSEXP(tokens); 99 | } 100 | -------------------------------------------------------------------------------- /src/ValidateSyntax.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace sourcetools; 3 | 4 | namespace { 5 | 6 | typedef sourcetools::validators::SyntaxError Error; 7 | struct RowSetter 8 | { 9 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 10 | { 11 | INTEGER(dataSEXP)[i] = error.row() + 1; 12 | } 13 | }; 14 | 15 | struct ColSetter 16 | { 17 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 18 | { 19 | INTEGER(dataSEXP)[i] = error.column() + 1; 20 | } 21 | }; 22 | 23 | struct ErrSetter 24 | { 25 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 26 | { 27 | const std::string& msg = error.message(); 28 | SEXP charSEXP = sourcetools::r::createChar(msg); 29 | SET_STRING_ELT(dataSEXP, i, charSEXP); 30 | } 31 | }; 32 | 33 | } // anonymous namespace 34 | 35 | extern "C" SEXP sourcetools_validate_syntax(SEXP contentsSEXP) { 36 | using namespace sourcetools; 37 | using namespace sourcetools::tokens; 38 | using namespace 
sourcetools::validators; 39 | 40 | r::Protect protect; 41 | if (Rf_length(contentsSEXP) == 0) 42 | contentsSEXP = protect(Rf_mkString("")); 43 | 44 | const char* contents = CHAR(STRING_ELT(contentsSEXP, 0)); 45 | const std::vector& tokens = sourcetools::tokenize(contents); 46 | 47 | SyntaxValidator validator(tokens); 48 | const std::vector& errors = validator.errors(); 49 | index_type n = errors.size(); 50 | 51 | r::RObjectFactory factory; 52 | SEXP resultSEXP = factory.create(VECSXP, 3); 53 | SET_VECTOR_ELT(resultSEXP, 0, factory.create(INTSXP, errors, RowSetter())); 54 | SET_VECTOR_ELT(resultSEXP, 1, factory.create(INTSXP, errors, ColSetter())); 55 | SET_VECTOR_ELT(resultSEXP, 2, factory.create(STRSXP, errors, ErrSetter())); 56 | 57 | const char* names[] = {"row", "column", "error"}; 58 | r::util::setNames(resultSEXP, names, 3); 59 | r::util::listToDataFrame(resultSEXP, n); 60 | 61 | return resultSEXP; 62 | } 63 | -------------------------------------------------------------------------------- /src/sourcetools-init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP run_testthat_tests(); 12 | extern SEXP sourcetools_diagnose_string(SEXP); 13 | extern SEXP sourcetools_parse_string(SEXP); 14 | extern SEXP sourcetools_performs_nse(SEXP); 15 | extern SEXP sourcetools_read(SEXP); 16 | extern SEXP sourcetools_read_bytes(SEXP); 17 | extern SEXP sourcetools_read_lines(SEXP); 18 | extern SEXP sourcetools_read_lines_bytes(SEXP); 19 | extern SEXP sourcetools_tokenize_file(SEXP); 20 | extern SEXP sourcetools_tokenize_string(SEXP); 21 | extern SEXP sourcetools_validate_syntax(SEXP); 22 | 23 | static const R_CallMethodDef CallEntries[] = { 24 | {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, 25 | {"sourcetools_diagnose_string", (DL_FUNC) &sourcetools_diagnose_string, 1}, 26 | {"sourcetools_parse_string", (DL_FUNC) &sourcetools_parse_string, 1}, 27 | {"sourcetools_performs_nse", (DL_FUNC) &sourcetools_performs_nse, 1}, 28 | {"sourcetools_read", (DL_FUNC) &sourcetools_read, 1}, 29 | {"sourcetools_read_bytes", (DL_FUNC) &sourcetools_read_bytes, 1}, 30 | {"sourcetools_read_lines", (DL_FUNC) &sourcetools_read_lines, 1}, 31 | {"sourcetools_read_lines_bytes", (DL_FUNC) &sourcetools_read_lines_bytes, 1}, 32 | {"sourcetools_tokenize_file", (DL_FUNC) &sourcetools_tokenize_file, 1}, 33 | {"sourcetools_tokenize_string", (DL_FUNC) &sourcetools_tokenize_string, 1}, 34 | {"sourcetools_validate_syntax", (DL_FUNC) &sourcetools_validate_syntax, 1}, 35 | {NULL, NULL, 0} 36 | }; 37 | 38 | void R_init_sourcetools(DllInfo *dll) 39 | { 40 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 41 | R_useDynamicSymbols(dll, FALSE); 42 | } 43 | -------------------------------------------------------------------------------- /src/test-Parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | using namespace sourcetools::parser; 6 | using namespace sourcetools::cursors; 7 | using namespace 
sourcetools::collections; 8 | 9 | typedef sourcetools::tokens::Token Token; 10 | 11 | context("Parser") { 12 | 13 | test_that("we can extract partial parse trees from code") 14 | { 15 | std::string code = "foo <- function(a = {1 + 2}) {}"; 16 | 17 | std::vector tokens = tokenize(code); 18 | Parser parser(code); 19 | 20 | ParseStatus status; 21 | scoped_ptr pRoot(parser.parse(&status)); 22 | 23 | TokenCursor cursor(tokens); 24 | expect_true(cursor.findFwd("=")); 25 | 26 | Position position = cursor.currentToken().position(); 27 | ParseNode* pTarget = status.getNodeAtPosition(position); 28 | expect_true((pTarget != NULL)); 29 | if (pTarget == NULL) 30 | return; 31 | 32 | const char* begin; 33 | const char* end; 34 | pTarget->bounds(&begin, &end); 35 | 36 | std::string contents(begin, end); 37 | expect_true(contents == "a = {1 + 2}"); 38 | 39 | expect_true(cursor.findFwd("{")); 40 | pTarget = status.getNodeAtPosition(cursor.position()); 41 | expect_true((pTarget != NULL)); 42 | expect_true((pTarget->token().contentsEqual("{"))); 43 | if (pTarget == NULL) 44 | return; 45 | 46 | pTarget->bounds(&begin, &end); 47 | contents = std::string(begin, end); 48 | expect_true(contents == "{1 + 2}"); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test-Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | using namespace sourcetools::cursors; 6 | 7 | typedef sourcetools::tokens::Token Token; 8 | 9 | namespace { 10 | 11 | class OpenBracketLocator 12 | { 13 | public: 14 | inline bool operator()(TokenCursor* pCursor) const { 15 | 16 | if (pCursor->bwdToMatchingBracket()) 17 | return false; 18 | 19 | return tokens::isLeftBracket(pCursor->currentToken()); 20 | } 21 | }; 22 | 23 | } // anonymous namespace 24 | 25 | context("Tokenizer") { 26 | 27 | test_that("Complements are detected correctly") { 28 | 29 | using 
namespace sourcetools::tokens; 30 | 31 | expect_true(complement(LPAREN) == RPAREN); 32 | expect_true(complement(LBRACE) == RBRACE); 33 | expect_true(complement(LBRACKET) == RBRACKET); 34 | expect_true(complement(LDBRACKET) == RDBRACKET); 35 | 36 | expect_true(complement(RPAREN) == LPAREN); 37 | expect_true(complement(RBRACE) == LBRACE); 38 | expect_true(complement(RBRACKET) == LBRACKET); 39 | expect_true(complement(RDBRACKET) == LDBRACKET); 40 | 41 | expect_true(isComplement(LPAREN, RPAREN)); 42 | expect_true(isComplement(LBRACE, RBRACE)); 43 | expect_true(isComplement(LBRACKET, RBRACKET)); 44 | expect_true(isComplement(LDBRACKET, RDBRACKET)); 45 | 46 | expect_true(isComplement(RPAREN, LPAREN)); 47 | expect_true(isComplement(RBRACE, LBRACE)); 48 | expect_true(isComplement(RBRACKET, LBRACKET)); 49 | expect_true(isComplement(RDBRACKET, LDBRACKET)); 50 | } 51 | 52 | test_that("Keywords are detected correctly") { 53 | std::string code = "if for while break repeat"; 54 | const std::vector& tokens = sourcetools::tokenize(code); 55 | for (std::vector::const_iterator it = tokens.begin(); 56 | it != tokens.end(); 57 | ++it) 58 | { 59 | const Token& token = *it; 60 | if (isWhitespace(token)) 61 | continue; 62 | expect_true(isKeyword(token)); 63 | 64 | } 65 | } 66 | 67 | test_that("TokenCursor operations work as expected") { 68 | std::string code = "if (foo) { print(bar) } else {}"; 69 | const std::vector& tokens = sourcetools::tokenize(code); 70 | TokenCursor cursor(tokens); 71 | expect_true(cursor.currentToken().contentsEqual("if")); 72 | cursor.moveToNextSignificantToken(); 73 | expect_true(cursor.currentToken().contentsEqual("(")); 74 | expect_true(cursor.fwdToMatchingBracket()); 75 | expect_true(cursor.currentToken().contentsEqual(")")); 76 | } 77 | 78 | test_that("Move to position works as expected") { 79 | std::string code = "if (foo) { print(1) }"; 80 | const std::vector& tokens = sourcetools::tokenize(code); 81 | TokenCursor cursor(tokens); 82 | 83 | // move to 'if' 
84 | expect_true(cursor.moveToPosition(0, 0)); 85 | expect_true(cursor.isType(tokens::KEYWORD_IF)); 86 | 87 | // move to whitespace before print 88 | expect_true(cursor.moveToPosition(0, 10)); 89 | expect_true(cursor.currentToken().contentsEqual(" ")); 90 | 91 | // move to 'print' 92 | expect_true(cursor.moveToPosition(0, 11)); 93 | expect_true(cursor.currentToken().contentsEqual("print")); 94 | 95 | // move to 'print' but target in middle 96 | expect_true(cursor.moveToPosition(0, 12)); 97 | expect_true(cursor.currentToken().contentsEqual("print")); 98 | 99 | expect_true(cursor.moveToPosition(0, 13)); 100 | expect_true(cursor.currentToken().contentsEqual("print")); 101 | 102 | expect_true(cursor.moveToPosition(0, 14)); 103 | expect_true(cursor.currentToken().contentsEqual("print")); 104 | 105 | expect_true(cursor.moveToPosition(0, 15)); 106 | expect_true(cursor.currentToken().contentsEqual("print")); 107 | 108 | // move to '(' 109 | expect_true(cursor.moveToPosition(0, 16)); 110 | expect_true(cursor.currentToken().contentsEqual("(")); 111 | } 112 | 113 | test_that("find operations work") 114 | { 115 | std::string code = "(if (foo) { print(1) })"; 116 | const std::vector& tokens = sourcetools::tokenize(code); 117 | TokenCursor cursor(tokens); 118 | 119 | OpenBracketLocator locator; 120 | expect_true(cursor.moveToPosition(0, 13)); 121 | expect_true(cursor.currentToken().contentsEqual("print")); 122 | expect_true(cursor.findBwd(locator)); 123 | expect_true(cursor.currentToken().contentsEqual("{")); 124 | expect_true(cursor.fwdToMatchingBracket()); 125 | expect_true(cursor.currentToken().contentsEqual("}")); 126 | expect_true(cursor.findBwd(locator)); 127 | expect_true(cursor.currentToken().contentsEqual("(")); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test-multibyte.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 
#include 6 | 7 | namespace { 8 | 9 | std::string charType(wchar_t ch) 10 | { 11 | std::string result; 12 | if (std::iswcntrl(ch)) 13 | result += "cntrl,"; 14 | if (std::iswprint(ch)) 15 | result += "print,"; 16 | if (std::iswspace(ch)) 17 | result += "space,"; 18 | #ifdef SOURCETOOLS_COMPILER_CXX11 19 | if (std::iswblank(ch)) 20 | result += "blank,"; 21 | #endif 22 | if (std::iswgraph(ch)) 23 | result += "graph,"; 24 | if (std::iswpunct(ch)) 25 | result += "punct,"; 26 | if (std::iswalnum(ch)) 27 | result += "alnum,"; 28 | if (std::iswalpha(ch)) 29 | result += "alpha,"; 30 | if (std::iswupper(ch)) 31 | result += "upper,"; 32 | if (std::iswlower(ch)) 33 | result += "lower,"; 34 | if (std::iswdigit(ch)) 35 | result += "digit,"; 36 | if (std::iswxdigit(ch)) 37 | result += "xdigit,"; 38 | 39 | if (!result.empty()) 40 | result = result.substr(0, result.size() - 1); 41 | 42 | return result; 43 | } 44 | 45 | } // anonymous namespace 46 | 47 | extern "C" SEXP sourcetools_print_multibyte(SEXP dataSEXP) 48 | { 49 | const char* data = CHAR(STRING_ELT(dataSEXP, 0)); 50 | sourcetools::index_type size = Rf_length(STRING_ELT(dataSEXP, 0)); 51 | 52 | wchar_t ch; 53 | const char* it = data; 54 | while (true) 55 | { 56 | int length = std::mbtowc(&ch, it, MB_CUR_MAX); 57 | if (length == 0) 58 | break; 59 | 60 | if (length == -1) 61 | { 62 | Rf_warning("Invalid multibyte character at index %li\n", (long) (it - data)); 63 | ++it; 64 | continue; 65 | } 66 | 67 | std::string type = charType(ch); 68 | Rprintf("%5i: [%s,%i] '%lc'\n", (int) ch, type.c_str(), length, ch); 69 | 70 | it += length; 71 | } 72 | 73 | return R_NilValue; 74 | } 75 | 76 | extern "C" SEXP sourcetools_print_utf8(SEXP dataSEXP) 77 | { 78 | using namespace sourcetools; 79 | 80 | const char* data = CHAR(STRING_ELT(dataSEXP, 0)); 81 | utf8::iterator it(data); 82 | 83 | wchar_t ch = *it++; 84 | while (true) 85 | { 86 | wchar_t ch = *it++; 87 | if (ch == 0 || ch == -1) 88 | break; 89 | Rprintf("[%i]: %lc\n", (int) ch, ch); 
90 | } 91 | 92 | return R_NilValue; 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/test-r.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | 6 | class StringRecorder : public r::CallRecurser::Operation 7 | { 8 | public: 9 | 10 | virtual void apply(SEXP dataSEXP) 11 | { 12 | if (TYPEOF(dataSEXP) == STRSXP) 13 | strings_.insert(CHAR(STRING_ELT(dataSEXP, 0))); 14 | } 15 | 16 | const std::set& strings() const 17 | { 18 | return strings_; 19 | } 20 | 21 | private: 22 | std::set strings_; 23 | }; 24 | 25 | context("CallRecurser") 26 | { 27 | test_that("The R call recurser works") 28 | { 29 | SEXP fnSEXP = Rf_findFun(Rf_install("all.equal"), R_BaseNamespace); 30 | SEXP bodySEXP = r::util::functionBody(fnSEXP); 31 | 32 | scoped_ptr recorder(new StringRecorder); 33 | 34 | r::CallRecurser recurser(bodySEXP); 35 | recurser.add(recorder); 36 | recurser.run(); 37 | 38 | const std::set& discoveries = recorder->strings(); 39 | 40 | expect_true(discoveries.size() == 1); 41 | expect_true(discoveries.count("all.equal")); 42 | } 43 | 44 | test_that("Functions which perform non-standard evaluation are detected") 45 | { 46 | SEXP fnSEXP; 47 | fnSEXP = Rf_findFun(Rf_install("library"), R_BaseNamespace); 48 | expect_true(r::nse::performsNonStandardEvaluation(fnSEXP)); 49 | 50 | fnSEXP = Rf_findFun(Rf_install(".gtn"), R_BaseNamespace); 51 | expect_false(r::nse::performsNonStandardEvaluation(fnSEXP)); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test-runner.cpp: -------------------------------------------------------------------------------- 1 | #define TESTTHAT_TEST_RUNNER 2 | #include 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | if 
(require("testthat", quietly = TRUE)) {
  library(sourcetools)
  test_check("sourcetools")
}
-------------------------------------------------------------------------------- /tests/testthat/helper-aaa.R: --------------------------------------------------------------------------------
# Evaluate `expr` with GC torture enabled, restoring the previous state
# afterwards, and return the result. Used to shake out protection bugs in
# the C++ parser.
with_gctorture <- function(expr) {
  gctorture(TRUE)
  result <- expr
  gctorture(FALSE)
  result
}

# Compare the tokens produced for a string (or an already-tokenized data
# frame) against a character vector of expected token values, reporting a
# readable message on mismatch.
compare_tokens <- function(tokens, expected) {

  if (is.character(tokens))
    tokens <- tokenize_string(tokens)

  expect_true(
    nrow(tokens) == length(expected),
    "different number of tokens"
  )

  # FIX: the failure message previously had 'expected' and 'got' swapped,
  # printing the actual token as the expectation and vice versa.
  # Also: seq_len() instead of 1:nrow() to be safe when nrow(tokens) == 0.
  for (i in seq_len(nrow(tokens))) {
    expect_true(
      tokens$value[[i]] == expected[[i]],
      paste0("expected token '", expected[[i]], "'; got '", tokens$value[[i]], "'")
    )
  }

}

# Parse `R` with base R and `S` with sourcetools (under GC torture) and
# check that the two parse results agree structurally.
check_parse <- function(R, S = R) {
  lhs <- base::parse(text = R, keep.source = FALSE)
  rhs <- with_gctorture(parse_string(S))
  check_parse_impl(lhs, rhs)
}

# Recursive structural comparison of two parse results; stops with a
# descriptive error on the first type, length, or identity mismatch.
check_parse_impl <- function(lhs, rhs) {

  lhsType <- typeof(lhs)
  rhsType <- typeof(rhs)

  onError <- function(format, ...) {
    message <- c(
      sprintf(format, ...),
      sprintf("R: '%s'", deparse(lhs)),
      sprintf("S: '%s'", deparse(rhs))
    )
    stop(paste(message, collapse = "\n"), call.
= FALSE) 45 | } 46 | 47 | if (lhsType != rhsType) 48 | onError("TypeError: '%s' != '%s'", lhsType, rhsType) 49 | 50 | if (length(lhs) != length(rhs)) 51 | onError("LengthError: %s != %s", length(lhs), length(rhs)) 52 | 53 | if (is.call(lhs) || is.expression(lhs)) { 54 | lapply(seq_along(lhs), function(i) { 55 | check_parse_impl(lhs[[i]], rhs[[i]]) 56 | }) 57 | } 58 | 59 | if (!identical(lhs, rhs)) 60 | onError("IdenticalError: '%s' != '%s'", lhs, rhs) 61 | 62 | TRUE 63 | } 64 | 65 | expect_parse <- function(R, S = R) { 66 | testthat::expect_true(check_parse(R, S)) 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tests/testthat/helper-utf8.R: -------------------------------------------------------------------------------- 1 | octal <- "\012" 2 | hex <- "\xE2\x99\xA5" 3 | utf8 <- "\u2665" 4 | -------------------------------------------------------------------------------- /tests/testthat/test-catch.R: -------------------------------------------------------------------------------- 1 | context("Catch") 2 | 3 | test_that("C++ tests pass", { 4 | expect_cpp_tests_pass("sourcetools") 5 | }) 6 | -------------------------------------------------------------------------------- /tests/testthat/test-diagnostics.R: -------------------------------------------------------------------------------- 1 | context("Diagnostics") 2 | 3 | expect_no_diagnostics <- function(string) { 4 | diagnostics <- diagnose_string(string) 5 | expect_true(length(diagnostics) == 0) 6 | if (interactive()) print(diagnostics) 7 | } 8 | 9 | expect_diagnostics <- function(string) { 10 | diagnostics <- diagnose_string(string) 11 | expect_true(length(diagnostics) > 0) 12 | if (interactive()) print(diagnostics) 13 | } 14 | 15 | test_that("missing symbols reported appropriately", { 16 | expect_diagnostics("foo <- function(apple) { print(Apple) }") 17 | }) 18 | 19 | test_that("unused computations are reported", { 20 | expect_diagnostics("foo <- function(apple) { 
apple < 1; print(TRUE) }") 21 | }) 22 | 23 | test_that("use of '=' in if statement is reported", { 24 | expect_diagnostics("if (foo = 1) { print(1) }") 25 | }) 26 | 27 | test_that("use of '&', '|' in 'if' is reported", { 28 | expect_diagnostics("if (1 & 2) print(1)") 29 | expect_diagnostics("if (1 | 2) print(1)") 30 | }) 31 | 32 | test_that("x == NULL is reported", { 33 | expect_diagnostics("status <- print(1) == NULL; print(status)") 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test-parser.R: -------------------------------------------------------------------------------- 1 | context("Parser") 2 | 3 | test_that("precedence of '?' vs '=' correctly handled", { 4 | 5 | skip_if(getRversion() < "4.0.3") 6 | 7 | expect_parse("foo ? bar = baz") 8 | expect_parse("foo ? bar <- baz") 9 | 10 | }) 11 | 12 | test_that("parser handles simple control flow", { 13 | 14 | expect_parse("if (foo) bar + baz") 15 | expect_parse("while (1) 1 + 2") 16 | expect_parse("repeat 1 + 2") 17 | expect_parse("if (foo) bar else baz") 18 | expect_parse("if (foo) bar else if (baz) bat") 19 | expect_parse("for (i in 1:10) 1 + 10") 20 | 21 | }) 22 | 23 | test_that("parser handles compound expressions", { 24 | 25 | expect_parse("if (foo) while (bar) 1") 26 | expect_parse("if (foo) (1 + 2)") 27 | expect_parse("{1; 2; 3}") 28 | expect_parse("{1 + 2\n3 + 4\n5 + 6}") 29 | 30 | }) 31 | 32 | test_that("parser handles function calls", { 33 | expect_parse("foo <- bar(baz)[[1]]$bat") 34 | expect_parse("foo <- bar() + bam() * bat()") 35 | }) 36 | 37 | test_that("parser handles precedence", { 38 | expect_parse("a$b[[1]]$c") 39 | expect_parse("object <- unclass(object)[i]") 40 | }) 41 | 42 | test_that("parser handles numbers of various forms", { 43 | expect_parse(".15") 44 | expect_parse("15.") 45 | expect_parse("1.5") 46 | # expect_parse("1.5L") #TODO: R warns and parses as numeric 47 | expect_parse("15L") 48 | expect_parse("10E5") 49 | 
expect_parse("10E5L") 50 | }) 51 | 52 | test_that("parser handles function calls with no args", { 53 | # Did you know? 54 | # 55 | # > length(base::parse(text = "a[]")[[1]]) # [1] 3 56 | # > length(base::parse(text = "a[[]]")[[1]]) # [1] 3 57 | # 58 | # R inserts an empty 'R_MissingArg' argument 59 | # into the third spot. This is sensible, albeit 60 | # a bit surprising when you first see it. 61 | expect_parse("a()") 62 | expect_parse("a[]") 63 | expect_parse("a[[]]") 64 | }) 65 | 66 | test_that("parser recovers from missing commas", { 67 | expect_warning(expect_parse("a(1, 2, 3)", "a(1 2 3)")) 68 | expect_warning(expect_parse("function(a, b, c) 1", "function(a b c) 1")) 69 | }) 70 | 71 | test_that("parser handles missing arguments", { 72 | expect_parse("a(,)") 73 | expect_parse("a[,]") 74 | expect_parse("a[[,]]") 75 | 76 | expect_parse("a(1,)") 77 | expect_parse("a[1,]") 78 | expect_parse("a[[1,]]") 79 | 80 | expect_parse("a(,1)") 81 | expect_parse("a[,1]") 82 | expect_parse("a[[,1]]") 83 | 84 | expect_parse("a(x =, b =)") 85 | expect_parse("quote(expr =)") 86 | expect_parse("a(x = ,)") 87 | }) 88 | 89 | test_that("parser handles chained function calls", { 90 | expect_parse("a(b)(c)(d)(e)") 91 | expect_parse("a[b][c][d][e]") 92 | expect_parse("a[[b]][[c]][[d]][[e]]") 93 | }) 94 | 95 | test_that("parser handles newlines as statement delimiter", { 96 | expect_parse("a <- b\n+1") 97 | expect_parse("a <- 1\n(b)") 98 | expect_parse("a <- foo(1)\n(b)") 99 | expect_parse("(a <- foo(1)\n(b))") 100 | }) 101 | 102 | test_that("parser handles semi-colons as statement delimiter", { 103 | expect_parse("a <- 1; b <- 2; c <- 3") 104 | expect_parse("{a <- 1;}") 105 | expect_parse("{a <- 1;;; b}") 106 | }) 107 | 108 | test_that("parser handles various escapes in strings", { 109 | expect_parse("'a = \\u{A0}'") 110 | expect_parse("a <- ifelse(a, '\\u{A0}', '\\u{A1}')") 111 | }) 112 | 113 | test_that("parser normalizes string names in function calls", { 114 | 
expect_parse('"["(unclass(object), i)') 115 | expect_parse('"lol"(1, 2)') 116 | }) 117 | 118 | test_that("parser handles if-else", { 119 | 120 | expect_parse("if (foo) {\nbar\n} else if (baz) {\n}") 121 | 122 | }) 123 | 124 | test_that("parser handles various escapes in strings", { 125 | # TODO: when deparsing UTF-8 escapes, Windows just prints 126 | # the code point and so this test fails. E.g. 127 | # 128 | # > format("\u2665") 129 | # [1] "" 130 | skip_on_os("windows") 131 | 132 | contents <- read("helper-utf8.R") 133 | expect_parse(contents) 134 | }) 135 | 136 | test_that("parser handles multi-line strings", { 137 | 138 | expect_parse('"a\nb\nc" + 1') 139 | expect_parse('"a\nb\nc" * 1') 140 | 141 | }) 142 | -------------------------------------------------------------------------------- /tests/testthat/test-read.R: -------------------------------------------------------------------------------- 1 | context("Reader") 2 | 3 | files <- list.files( 4 | pattern = "[.]R$", 5 | full.names = TRUE, 6 | include.dirs = FALSE 7 | ) 8 | 9 | test_that("read_lines and readLines agree on output", { 10 | for (file in files) { 11 | r <- readLines(file, warn = FALSE, encoding = "UTF-8") 12 | s <- sourcetools::read_lines(file) 13 | expect_identical(r, s) 14 | } 15 | }) 16 | 17 | test_that("read and readChar agree on output", { 18 | for (file in files) { 19 | r <- readChar(file, file.info(file)$size, TRUE) 20 | Encoding(r) <- "UTF-8" 21 | s <- sourcetools::read(file) 22 | expect_identical(r, s) 23 | } 24 | }) 25 | 26 | test_that("read_bytes and readBin agree on output", { 27 | for (file in files) { 28 | r <- readBin(file, "raw", file.info(file)$size) 29 | s <- sourcetools::read_bytes(file) 30 | expect_identical(r, s) 31 | } 32 | }) 33 | 34 | test_that("read_lines can handle '\\r' line endings", { 35 | 36 | file <- tempfile() 37 | on.exit(unlink(file), add = TRUE) 38 | 39 | text <- "this\ris\rsome\rtext" 40 | writeLines(text, con = file, useBytes = TRUE) 41 | 42 | r <- 
readLines(file) 43 | s <- read_lines(file) 44 | expect_identical(r, s) 45 | 46 | }) 47 | 48 | test_that("read_lines can handle '\\r\\n' line endings", { 49 | 50 | file <- tempfile() 51 | on.exit(unlink(file), add = TRUE) 52 | 53 | text <- "this\r\nis\r\nsome\r\ntext\r" 54 | writeBin(charToRaw(text), file) 55 | 56 | r <- readLines(file) 57 | s <- read_lines(file) 58 | expect_identical(r, s) 59 | 60 | }) 61 | 62 | test_that("read_lines can handle mixed line endings", { 63 | 64 | file <- tempfile() 65 | on.exit(unlink(file), add = TRUE) 66 | 67 | text <- "this\ris\nsome\r\ntext\r" 68 | writeBin(charToRaw(text), file) 69 | 70 | r <- readLines(file) 71 | s <- read_lines(file) 72 | expect_identical(r, s) 73 | 74 | }) 75 | -------------------------------------------------------------------------------- /tests/testthat/test-tokenize.R: -------------------------------------------------------------------------------- 1 | context("Tokenizer") 2 | 3 | test_that("Operators are tokenized correctly", { 4 | 5 | operators <- c( 6 | "::", ":::", "$", "@", "[", "[[", "^", "-", "+", ":", 7 | "*", "/", "+", "-", "<", ">", "<=", ">=", "==", "!=", 8 | "!", "&", "&&", "|", "|>", "||", "~", "->", "->>", "<-", "<<-", 9 | "=", "?", "**", "%%", "%for%" 10 | ) 11 | 12 | tokenized <- tokenize_string(paste(operators, collapse = " ")) 13 | 14 | for (operator in operators) { 15 | tokens <- tokenize_string(operator) 16 | expect_true(nrow(tokens) == 1, paste("expected a single token ('", operator, "')")) 17 | } 18 | }) 19 | 20 | test_that("Numbers are tokenized correctly", { 21 | 22 | numbers <- c("1", "1.0", "0.1", ".1", "0.1E1", "1L", "1.0L", "1.5L", 23 | "1E1", "1E-1", "1E-1L", ".100E-105L", "0.", "100.", 24 | "1e+09", "1e+90", "1e-90", "1e-00000000000000009") 25 | 26 | for (number in numbers) { 27 | tokens <- tokenize_string(number) 28 | expect_true(nrow(tokens) == 1, paste("expected a single token ('", number, "')", sep = "")) 29 | token <- as.list(tokens[1, ]) 30 | expect_true(token$type == 
"number", paste("expected a number ('", token$type, "')", sep = "")) 31 | } 32 | 33 | }) 34 | 35 | test_that("The tokenizer accepts UTF-8 symbols", { 36 | expect_true(nrow(tokenize_string("鬼門")) == 1) 37 | }) 38 | 39 | test_that("The tokenizer works correctly", { 40 | 41 | # TODO: Should newlines be absorbed as part of the comment string? 42 | tokens <- tokenize_string("# A Comment\n") 43 | expected <- "# A Comment\n" 44 | compare_tokens(tokens, expected) 45 | 46 | tokens <- tokenize_string("a <- 1 + 2\n") 47 | compare_tokens( 48 | tokens, 49 | c("a", " ", "<-", " ", "1", " ", "+", " ", "2", "\n") 50 | ) 51 | 52 | compare_tokens( 53 | tokenize_string("a<-1"), 54 | c("a", "<-", "1") 55 | ) 56 | 57 | # NOTE: '-' sign tokenized separately from number 58 | compare_tokens( 59 | tokenize_string("a< -1"), 60 | c("a", "<", " ", "-", "1") 61 | ) 62 | 63 | compare_tokens("1.0E5L", "1.0E5L") 64 | compare_tokens(".1", ".1") 65 | compare_tokens("'\\''", "'\\''") 66 | compare_tokens(".a", ".a") 67 | compare_tokens("...", "...") 68 | compare_tokens(":=", ":=") 69 | compare_tokens("x ** 2", c("x", " ", "**", " ", "2")) 70 | 71 | }) 72 | 73 | test_that("`[[` and `[` are tokenized correctly", { 74 | 75 | compare_tokens("x[[1]]", c("x", "[[", "1", "]]")) 76 | 77 | # not really valid R code, but the tokenizer should still 78 | # get it right 79 | compare_tokens("[[[]]]", c("[[", "[", "]", "]]")) 80 | 81 | compare_tokens( 82 | "x[[a[b[[c[1]]]]]]", 83 | c("x", "[[", "a", "[", "b", "[[", "c", "[", "1", 84 | "]", "]]", "]", "]]") 85 | ) 86 | 87 | }) 88 | 89 | test_that("Failures during number tokenization is detected", { 90 | tokens <- tokenize_string("1.5E---") 91 | expect_true(tokens$type[[1]] == "invalid") 92 | }) 93 | 94 | test_that("invalid number e.g. 
1E1.5 tokenized as single entity", { 95 | tokens <- tokenize_string("1E1.5") 96 | expect_true(nrow(tokens) == 1) 97 | expect_true(tokens$type[[1]] == "invalid") 98 | }) 99 | 100 | test_that("keywords are tokenized as keywords", { 101 | 102 | keywords <- c("if", "else", "repeat", "while", "function", 103 | "for", "in", "next", "break", 104 | "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA", 105 | "NA_integer_", "NA_real_", "NA_complex_", "NA_character_") 106 | 107 | tokens <- lapply(keywords, function(keyword) { 108 | tokenize_string(keyword)[1, ] 109 | }) 110 | 111 | types <- unlist(lapply(tokens, `[[`, "type")) 112 | expect_true(all(types == "keyword")) 113 | }) 114 | 115 | test_that("comments without a trailing newline are tokenized", { 116 | tokens <- tokenize_string("# abc") 117 | expect_identical(tokens$type, "comment") 118 | }) 119 | 120 | test_that("tokenization errors handled correctly", { 121 | # previously, these reported an error where a NUL 122 | # byte was accidentally included as part of the 123 | # token value 124 | tokenize_string("'abc") 125 | tokenize_string("\"abc") 126 | tokenize_string("%abc") 127 | expect_true(TRUE, "we didn't segfault") 128 | }) 129 | 130 | test_that("raw tokens are tokenized correctly", { 131 | 132 | prefixes <- c("r", "R") 133 | quotes <- c("'", '"') 134 | dashes <- c("", "-", "--", "---") 135 | lhs <- c("(", "{", "[") 136 | 137 | all <- expand.grid(prefixes, quotes, dashes, lhs, stringsAsFactors = FALSE) 138 | 139 | all$Var5 <- "" 140 | all$Var5[all$Var4 == "("] <- ")" 141 | all$Var5[all$Var4 == "{"] <- "}" 142 | all$Var5[all$Var4 == "["] <- "]" 143 | 144 | all$Var6 <- all$Var3 145 | all$Var7 <- all$Var2 146 | strings <- do.call(paste0, all) 147 | 148 | for (string in strings) { 149 | token <- tokenize_string(string) 150 | expect_true(nrow(token) == 1L) 151 | expect_true(token$type == "string") 152 | } 153 | 154 | }) 155 | -------------------------------------------------------------------------------- /tools/header-guards.R: 
-------------------------------------------------------------------------------- 1 | # convert a file path to a header guard name 2 | guarded_name <- function(path) { 3 | 4 | transformations <- list( 5 | dots = function(x) gsub(".", "_", x, fixed = TRUE), 6 | camel = function(x) gsub("(?