├── .Rbuildignore ├── .clang-format ├── .github ├── .gitignore └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS.md ├── R ├── catch.R ├── diagnostics.R ├── nse.R ├── register.R ├── sourcetools.R └── util.R ├── README.Rmd ├── README.html ├── README.md ├── TODO.md ├── benchmark ├── benchmark-parser.R ├── benchmark-read.R └── benchmark-tokenizer.R ├── configure.R ├── inst └── include │ ├── sourcetools.h │ └── sourcetools │ ├── collection │ ├── Position.h │ ├── Range.h │ └── collection.h │ ├── completion │ ├── CodeCompletion.h │ └── completion.h │ ├── core │ ├── config.h │ ├── core.h │ ├── macros.h │ └── util.h │ ├── cursor │ ├── TextCursor.h │ ├── TokenCursor.h │ └── cursor.h │ ├── diagnostics │ ├── Checkers.h │ ├── Diagnostic.h │ ├── DiagnosticsSet.h │ └── diagnostics.h │ ├── multibyte │ └── multibyte.h │ ├── parse │ ├── ParseError.h │ ├── ParseNode.h │ ├── ParseStatus.h │ ├── Parser.h │ ├── Precedence.h │ └── parse.h │ ├── platform │ └── platform.h │ ├── r │ ├── RCallRecurser.h │ ├── RConverter.h │ ├── RFunctions.h │ ├── RHeaders.h │ ├── RNonStandardEvaluation.h │ ├── RProtect.h │ ├── RUtils.h │ └── r.h │ ├── read │ ├── MemoryMappedReader.h │ ├── posix │ │ ├── FileConnection.h │ │ └── MemoryMappedConnection.h │ ├── read.h │ └── windows │ │ ├── FileConnection.h │ │ └── MemoryMappedConnection.h │ ├── tokenization │ ├── Registration.h │ ├── Token.h │ ├── Tokenizer.h │ └── tokenization.h │ ├── utf8 │ └── utf8.h │ └── validation │ ├── SyntaxValidator.h │ └── validation.h ├── man ├── read.Rd ├── register_routines.Rd ├── tokenize-methods.Rd └── validate_syntax.Rd ├── notes └── notes-tdop.R ├── sourcetools.Rproj ├── src ├── Makevars ├── Makevars.win ├── NSE.cpp ├── Parser.cpp ├── Reader.cpp ├── Tokenizer.cpp ├── ValidateSyntax.cpp ├── sourcetools-init.c ├── test-Parser.cpp ├── test-Tokenizer.cpp ├── test-multibyte.cpp ├── test-r.cpp └── test-runner.cpp ├── tests ├── testthat.R └── testthat │ ├── helper-aaa.R │ ├── 
helper-utf8.R │ ├── test-catch.R │ ├── test-diagnostics.R │ ├── test-parser.R │ ├── test-read.R │ └── test-tokenize.R └── tools └── header-guards.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rhistory$ 3 | ^\.Rproj\.user$ 4 | ^\.clang-format$ 5 | ^\.gitignore$ 6 | ^\.travis\.yml$ 7 | ^appveyor\.yml$ 8 | ^configure\.R$ 9 | ^README\.Rmd$ 10 | ^TODO\.md$ 11 | ^benchmark/ 12 | ^notes/ 13 | ^tools/ 14 | ^travis/ 15 | ^src/*\.s?o 16 | ^\.github$ 17 | -------------------------------------------------------------------------------- /.clang-format: -------------------------------------------------------------------------------- 1 | --- 2 | Language: Cpp 3 | # BasedOnStyle: LLVM 4 | AccessModifierOffset: -2 5 | AlignAfterOpenBracket: true 6 | AlignConsecutiveAssignments: false 7 | AlignEscapedNewlinesLeft: false 8 | AlignOperands: true 9 | AlignTrailingComments: true 10 | AllowAllParametersOfDeclarationOnNextLine: true 11 | AllowShortBlocksOnASingleLine: false 12 | AllowShortCaseLabelsOnASingleLine: false 13 | AllowShortFunctionsOnASingleLine: All 14 | AllowShortIfStatementsOnASingleLine: false 15 | AllowShortLoopsOnASingleLine: false 16 | AlwaysBreakAfterDefinitionReturnType: None 17 | AlwaysBreakBeforeMultilineStrings: false 18 | AlwaysBreakTemplateDeclarations: false 19 | BinPackArguments: true 20 | BinPackParameters: true 21 | BreakBeforeBinaryOperators: None 22 | BreakBeforeBraces: Mozilla 23 | BreakBeforeTernaryOperators: true 24 | BreakConstructorInitializersBeforeComma: false 25 | ColumnLimit: 80 26 | CommentPragmas: '^ IWYU pragma:' 27 | ConstructorInitializerAllOnOneLineOrOnePerLine: false 28 | ConstructorInitializerIndentWidth: 4 29 | ContinuationIndentWidth: 4 30 | Cpp11BracedListStyle: true 31 | DerivePointerAlignment: false 32 | DisableFormat: false 33 | ExperimentalAutoDetectBinPacking: false 34 | ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] 35 | IndentCaseLabels: false 36 | 
IndentWidth: 2 37 | IndentWrappedFunctionNames: false 38 | KeepEmptyLinesAtTheStartOfBlocks: true 39 | MacroBlockBegin: '' 40 | MacroBlockEnd: '' 41 | MaxEmptyLinesToKeep: 1 42 | NamespaceIndentation: None 43 | ObjCBlockIndentWidth: 2 44 | ObjCSpaceAfterProperty: false 45 | ObjCSpaceBeforeProtocolList: true 46 | PenaltyBreakBeforeFirstCallParameter: 19 47 | PenaltyBreakComment: 300 48 | PenaltyBreakFirstLessLess: 120 49 | PenaltyBreakString: 1000 50 | PenaltyExcessCharacter: 1000000 51 | PenaltyReturnTypeOnItsOwnLine: 60 52 | PointerAlignment: Left 53 | SpaceAfterCStyleCast: false 54 | SpaceBeforeAssignmentOperators: true 55 | SpaceBeforeParens: ControlStatements 56 | SpaceInEmptyParentheses: false 57 | SpacesBeforeTrailingComments: 2 58 | SpacesInAngles: false 59 | SpacesInContainerLiterals: true 60 | SpacesInCStyleCastParentheses: false 61 | SpacesInParentheses: false 62 | SpacesInSquareBrackets: false 63 | Standard: Cpp11 64 | TabWidth: 8 65 | UseTab: Never 66 | ... 67 | 68 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | 8 | name: R-CMD-check.yaml 9 | 10 | permissions: read-all 11 | 12 | jobs: 13 | R-CMD-check: 14 | runs-on: ubuntu-latest 15 | env: 16 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 17 | R_KEEP_PKG_SOURCE: yes 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::rcmdcheck 28 | needs: check 29 | 30 | - uses: r-lib/actions/check-r-package@v2 31 | with: 32 | upload-snapshots: true 33 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 34 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | src/*.o 4 | src/*.o-* 5 | src/*.so 6 | src/*.dll 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sourcetools 2 | Type: Package 3 | Title: Tools for Reading, Tokenizing and Parsing R Code 4 | Version: 0.1.7-9000 5 | Author: Kevin Ushey 6 | Maintainer: Kevin Ushey 7 | Description: Tools for Reading, Tokenizing and Parsing R Code. 
8 | License: MIT + file LICENSE 9 | LazyData: TRUE 10 | Depends: 11 | R (>= 3.0.2) 12 | Suggests: 13 | testthat 14 | LinkingTo: 15 | testthat (>= 1.0.2) 16 | RoxygenNote: 7.1.1 17 | BugReports: https://github.com/kevinushey/sourcetools/issues 18 | Encoding: UTF-8 19 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Copyright 2015-2017 Kevin Ushey 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
8 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(print,RTokens) 4 | export(read) 5 | export(read_bytes) 6 | export(read_lines) 7 | export(read_lines_bytes) 8 | export(tokenize) 9 | export(tokenize_file) 10 | export(tokenize_string) 11 | export(validate_syntax) 12 | useDynLib(sourcetools, .registration = TRUE) 13 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | ## sourcetools 0.2.0 (UNRELEASED) 3 | 4 | - Remove calls to `std::sprintf()`. 5 | 6 | - Support `=>` pipe-bind operator, to be introduced in R 4.1.0. 7 | 8 | - Support `|>` pipe operator, to be introduced in R 4.1.0. 9 | 10 | - Support raw string syntax, introduced in R 4.0.0. 11 | 12 | ## sourcetools 0.1.7 13 | 14 | - Ensure tests pass on platforms where `char` is unsigned. (#21) 15 | 16 | ## sourcetools 0.1.6 17 | 18 | - Register native routines. 19 | 20 | ## sourcetools 0.1.5 21 | 22 | - More work to ensure `sourcetools` can build on Solaris. 23 | 24 | ## sourcetools 0.1.4 25 | 26 | - More work to ensure `sourcetools` can build on Solaris. 27 | 28 | ## sourcetools 0.1.3 29 | 30 | - Relax C++11 requirement, to ensure that `sourcetools` can 31 | build on machines with older compilers (e.g. gcc 4.4). 32 | 33 | ## sourcetools 0.1.2 34 | 35 | - Disable failing tests on Solaris. 36 | 37 | ## sourcetools 0.1.1 38 | 39 | - Rename token type `ERR` to `INVALID` to fix build errors 40 | on Solaris. 41 | 42 | ## sourcetools 0.1.0 43 | 44 | ### Features 45 | 46 | The first release of `sourcetools` comes with a small set 47 | of features exposed to R: 48 | 49 | - `read(file)`: Read a file (as a string). 
Similar to 50 | `readChar()`, but faster (and maybe be optimized to 51 | use a memory mapped file reader in the future). 52 | 53 | - `tokenize_file(file)`: Tokenize an R script. 54 | 55 | - `tokenize_string(string)`: Tokenize a string of R code. 56 | -------------------------------------------------------------------------------- /R/catch.R: -------------------------------------------------------------------------------- 1 | (function() .Call(run_testthat_tests)) 2 | -------------------------------------------------------------------------------- /R/diagnostics.R: -------------------------------------------------------------------------------- 1 | diagnose_string <- function(string) { 2 | .Call(sourcetools_diagnose_string, as.character(string)) 3 | } 4 | 5 | diagnose_file <- function(file) { 6 | diagnose_string(read(file)) 7 | } 8 | -------------------------------------------------------------------------------- /R/nse.R: -------------------------------------------------------------------------------- 1 | 2 | performs_nse <- function(...) { 3 | .Call(sourcetools_performs_nse, list(...)) 4 | } 5 | -------------------------------------------------------------------------------- /R/register.R: -------------------------------------------------------------------------------- 1 | #' Register Native Routines 2 | #' 3 | #' Discover and register native routines in a package. 4 | #' Functions to be registered should be prefixed with the 5 | #' `// [[export()]]` attribute. 6 | #' 7 | #' @param package The path to an \R package. 8 | #' @param prefix The prefix to assign to the \R objects 9 | #' generated that map to each routine. 10 | #' @param dynamic.symbols Boolean; should dynamic symbol lookup 11 | #' be enabled? 
12 | #' 13 | register_routines <- function(package = ".", 14 | prefix = "C_", 15 | dynamic.symbols = FALSE) 16 | { 17 | # read DESCRIPTION file 18 | desc_path <- file.path(package, "DESCRIPTION") 19 | if (!file.exists(desc_path)) { 20 | fmt <- "no DESCRIPTION at path '%s'" 21 | stop(sprintf(fmt, desc_path)) 22 | } 23 | desc <- read.dcf(desc_path, all = TRUE) 24 | pkg_name <- desc$Package 25 | 26 | # find C, C++ files in package 27 | srcfiles <- list.files( 28 | package, 29 | pattern = "\\.(?:h|c|cc|cpp)$", 30 | full.names = TRUE, 31 | recursive = TRUE 32 | ) 33 | 34 | # discover routines in these files 35 | routines <- unlist( 36 | lapply(srcfiles, discover_routines), 37 | recursive = FALSE 38 | ) 39 | 40 | # generate prototypes based on routines 41 | prototypes <- generate_prototypes(routines) 42 | 43 | # separate routines based on declared export type 44 | call_routines <- external_routines <- list() 45 | lapply(routines, function(routine) { 46 | 47 | # extract registration text and discover the interface 48 | pieces <- strsplit(routine$registration, "\\[\\[|\\]\\]")[[1]] 49 | code <- utils::tail(pieces, 1) 50 | parsed <- tryCatch( 51 | parse(text = code)[[1]], 52 | error = function(e) { 53 | warning("failed to parse registration comment '", routine$registration, "'") 54 | } 55 | ) 56 | 57 | interface <- as.character(parsed[[2]]) 58 | if (interface == ".Call") { 59 | call_routines[[length(call_routines) + 1]] <<- routine 60 | } else if (interface == ".External") { 61 | external_routines[[length(external_routines) + 1]] <<- routine 62 | } else { 63 | warning("unrecognized / unsupported interface '", interface, "'") 64 | } 65 | 66 | }) 67 | 68 | # generate method definitions for each 69 | call_methods <- generate_call_methods(call_routines, prefix = prefix) 70 | external_methods <- generate_external_methods(external_routines, prefix = prefix) 71 | 72 | # generate initialization routine 73 | r_init <- generate_r_init(pkg_name = pkg_name, 74 | call_methods = 
call_methods, 75 | external_methods = external_methods, 76 | dynamic_symbols = dynamic.symbols) 77 | 78 | # generate script 79 | script <- c( 80 | "// This file was automatically generated.", 81 | "", 82 | "#include ", 83 | "#include ", 84 | "#include ", 85 | "", 86 | prototypes, 87 | "", 88 | call_methods, 89 | "", 90 | external_methods, 91 | "", 92 | r_init 93 | ) 94 | 95 | # write to init file 96 | init_path <- sub("^\\./", "", file.path(package, sprintf("src/%s-init.c", pkg_name))) 97 | writeLines(script, init_path, sep = "\n") 98 | message("* Wrote registration metadata to '", init_path, "'") 99 | 100 | # remind about .registration = TRUE 101 | check_namespace_symbol_registration(package) 102 | invisible(init_path) 103 | } 104 | 105 | discover_routines <- function(file) { 106 | contents <- readBin(file, what = raw(), n = file.info(file)$size) 107 | 108 | # find routines for registration 109 | re_registration <- "//[[:space:]*]\\[\\[export" 110 | if (length(contents) < re_registration) 111 | return(list()) 112 | 113 | matches <- grepRaw(re_registration, contents, all = TRUE) 114 | lapply(matches, function(match) { 115 | 116 | # find bounds for function prototype 117 | start <- grepRaw("\n", contents, offset = match) + 1 118 | end <- grepRaw("\\{|;", contents, offset = start) - 1 119 | 120 | # extract the routine type 121 | registration <- rawToChar(contents[match:(start - 2)]) 122 | 123 | # extract all 'SEXP .*' pieces of function 124 | prototype <- rawToChar(contents[start:end]) 125 | m <- gregexpr("SEXP[[:space:]+]([[:alnum:]_])+", prototype) 126 | names <- regmatches(prototype, m)[[1]] 127 | 128 | list( 129 | registration = registration, 130 | prototype = prototype, 131 | name = names[[1]], 132 | arguments = names[-1L] 133 | ) 134 | 135 | }) 136 | 137 | } 138 | 139 | check_namespace_symbol_registration <- function(package = ".") { 140 | 141 | # check for namespace file 142 | ns_path <- file.path(package, "NAMESPACE") 143 | if (!file.exists(ns_path)) 144 | 
return(invisible(FALSE)) 145 | 146 | # try parsing the namespace 147 | ns <- parse(ns_path) 148 | 149 | # try finding a call to 'useDynLib(pkg, .registration = TRUE)' 150 | for (entry in ns) { 151 | if (identical(entry[[1]], as.name("useDynLib"))) { 152 | nm <- names(entry) 153 | idx <- which(nm == ".registration") 154 | if (length(idx) != 1) 155 | break 156 | 157 | if (isTRUE(entry[[idx]])) 158 | return(invisible(TRUE)) 159 | } 160 | } 161 | 162 | invisible(FALSE) 163 | } 164 | 165 | generate_prototypes <- function(routines) { 166 | # TODO: we assume only SEXP interfaces here 167 | vapply(routines, function(routine) { 168 | arglist <- paste(rep("SEXP", length(routine$arguments)), collapse = ", ") 169 | sprintf("%s(%s);", routine$name, arglist) 170 | }, character(1)) 171 | } 172 | 173 | generate_call_methods <- function(routines, prefix = "C_") { 174 | 175 | # for each routine, generate a registration line 176 | fmt <- '{"%s", (DL_FUNC) &%s, %i},' 177 | lines <- vapply(routines, function(routine) { 178 | name <- utils::tail(strsplit(routine$name, "[[:space:]+]")[[1]], 1) 179 | prefixed_name <- paste0(prefix, name) 180 | n <- length(routine$arguments) 181 | sprintf(fmt, prefixed_name, name, n) 182 | }, character(1)) 183 | 184 | # indent, add commas, add null entry at end 185 | lines <- c(lines, "{NULL, NULL, 0}") 186 | 187 | c( 188 | "static R_CallMethodDef callMethods[] = {", 189 | paste0("\t", lines), 190 | "};" 191 | ) 192 | 193 | } 194 | 195 | generate_external_methods <- function(routines, prefix = "C_") { 196 | # TODO 197 | character() 198 | } 199 | 200 | generate_r_init <- function(pkg_name, 201 | call_methods, 202 | external_methods, 203 | dynamic_symbols) 204 | { 205 | r_register_routines <- sprintf( 206 | "\tR_registerRoutines(info, %s, %s, %s, %s);", 207 | "NULL", 208 | if (length(call_methods)) "callMethods" else "NULL", 209 | "NULL", 210 | if (length(external_methods)) "externalMethods" else "NULL" 211 | ) 212 | 213 | fmt <- paste( 214 | "void 
R_init_%s(DllInfo* info) {", 215 | r_register_routines, 216 | "\tR_useDynamicSymbols(info, %s);", 217 | "}", 218 | sep = "\n", collapse = "\n" 219 | ) 220 | 221 | sprintf(fmt, pkg_name, if (dynamic_symbols) "TRUE" else "FALSE") 222 | 223 | } 224 | -------------------------------------------------------------------------------- /R/sourcetools.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib sourcetools, .registration = TRUE 2 | NULL 3 | 4 | #' Read the Contents of a File 5 | #' 6 | #' Read the contents of a file into a string (or, in the case of 7 | #' \code{read_lines}, a vector of strings). 8 | #' 9 | #' @param path A file path. 10 | #' 11 | #' @name read 12 | #' @rdname read 13 | #' @export 14 | read <- function(path) { 15 | path <- normalizePath(path, mustWork = TRUE) 16 | .Call(sourcetools_read, path) 17 | } 18 | 19 | #' @name read 20 | #' @rdname read 21 | #' @export 22 | read_lines <- function(path) { 23 | path <- normalizePath(path, mustWork = TRUE) 24 | .Call(sourcetools_read_lines, path) 25 | } 26 | 27 | #' @name read 28 | #' @rdname read 29 | #' @export 30 | read_bytes <- function(path) { 31 | path <- normalizePath(path, mustWork = TRUE) 32 | .Call(sourcetools_read_bytes, path) 33 | } 34 | 35 | #' @name read 36 | #' @rdname read 37 | #' @export 38 | read_lines_bytes <- function(path) { 39 | path <- normalizePath(path, mustWork = TRUE) 40 | .Call(sourcetools_read_lines_bytes, path) 41 | } 42 | 43 | #' Tokenize R Code 44 | #' 45 | #' Tools for tokenizing \R code. 46 | #' 47 | #' @param file,path A file path. 48 | #' @param text,string \R code as a character vector of length one. 49 | #' 50 | #' @note Line numbers are determined by existence of the \code{\\n} 51 | #' line feed character, under the assumption that code being tokenized 52 | #' will use either \code{\\n} to indicate newlines (as on modern 53 | #' Unix systems), or \code{\\r\\n} as on Windows. 
54 | #' 55 | #' @return A \code{data.frame} with the following columns: 56 | #' 57 | #' \tabular{ll}{ 58 | #' \code{value} \tab The token's contents, as a string. \cr 59 | #' \code{row} \tab The row where the token is located. \cr 60 | #' \code{column} \tab The column where the token is located. \cr 61 | #' \code{type} \tab The token type, as a string. \cr 62 | #' } 63 | #' 64 | #' @rdname tokenize-methods 65 | #' @export 66 | #' @examples 67 | #' tokenize_string("x <- 1 + 2") 68 | tokenize_file <- function(path) { 69 | path <- normalizePath(path, mustWork = TRUE) 70 | .Call(sourcetools_tokenize_file, path) 71 | } 72 | 73 | #' @rdname tokenize-methods 74 | #' @export 75 | tokenize_string <- function(string) { 76 | .Call(sourcetools_tokenize_string, as.character(string)) 77 | } 78 | 79 | #' @rdname tokenize-methods 80 | #' @export 81 | tokenize <- function(file = "", text = NULL) { 82 | if (is.null(text)) 83 | text <- read(file) 84 | tokenize_string(text) 85 | } 86 | 87 | #' Find Syntax Errors 88 | #' 89 | #' Find syntax errors in a string of \R code. 90 | #' 91 | #' @param string A character vector (of length one). 92 | #' @export 93 | validate_syntax <- function(string) { 94 | .Call(sourcetools_validate_syntax, as.character(string)) 95 | } 96 | 97 | #' @export 98 | print.RTokens <- function(x, ...) { 99 | print.data.frame(x, ...) 
100 | } 101 | 102 | parse_string <- function(string) { 103 | .Call(sourcetools_parse_string, string) 104 | } 105 | 106 | parse_file <- function(file) { 107 | parse_string(read(file)) 108 | } 109 | -------------------------------------------------------------------------------- /R/util.R: -------------------------------------------------------------------------------- 1 | search_objects <- function() { 2 | lapply(seq_along(search()), function(i) { 3 | ls(pos = i, all.names = TRUE) 4 | }) 5 | } 6 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | ```{r setup, include=FALSE} 2 | library(sourcetools) 3 | library(microbenchmark) 4 | ``` 5 | 6 | 7 | [![R-CMD-check](https://github.com/kevinushey/sourcetools/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/kevinushey/sourcetools/actions/workflows/R-CMD-check.yaml) 8 | 9 | 10 | # sourcetools 11 | 12 | Tools for reading, tokenizing, and (eventually) parsing `R` code. 13 | 14 | ## Getting Started 15 | 16 | You can install `sourcetools` from CRAN with: 17 | 18 | ```{r, eval=FALSE} 19 | install.packages("sourcetools") 20 | ``` 21 | 22 | Or, you can install the development version from GitHub with: 23 | 24 | ```{r, eval=FALSE} 25 | devtools::install_github("kevinushey/sourcetools") 26 | ``` 27 | 28 | ## Reading 29 | 30 | `sourcetools` comes with a couple fast functions for reading 31 | files into `R`. 32 | 33 | Use `read()` and `read_lines()` to quickly read a file into 34 | `R` as character vectors. `read_lines()` handles both 35 | Windows style `\r\n` line endings, as well as Unix-style 36 | `\n` endings. Performance is on par with the readers 37 | provided by the 38 | [readr](https://cran.r-project.org/package=readr) package. 
39 | 40 | ```{r} 41 | text <- replicate(10000, { 42 | paste(sample(letters, 200, TRUE), collapse = "") 43 | }) 44 | file <- tempfile() 45 | cat(text, file = file, sep = "\n") 46 | mb <- microbenchmark::microbenchmark(times = 10, 47 | base::readLines(file), 48 | readr::read_lines(file), 49 | sourcetools::read_lines(file) 50 | ) 51 | sm <- summary(mb) 52 | print(sm[c("expr", "mean", "median")], digits = 3) 53 | unlink(file) 54 | ``` 55 | 56 | ## Tokenization 57 | 58 | `sourcetools` provides the `tokenize_string()` and 59 | `tokenize_file()` functions for generating a tokenized 60 | representation of R code. These produce 'raw' tokenized 61 | representations of the code, with each token's value as a 62 | string, and a recorded row, column, and type: 63 | 64 | ```{r} 65 | tokenize_string("if (x < 10) 20") 66 | ``` 67 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | [![R-CMD-check](https://github.com/kevinushey/sourcetools/workflows/R-CMD-check/badge.svg)](https://github.com/kevinushey/sourcetools/actions) 5 | 6 | 7 | # sourcetools 8 | 9 | Tools for reading, tokenizing, and (eventually) parsing `R` code. 10 | 11 | ## Getting Started 12 | 13 | You can install `sourcetools` from CRAN with: 14 | 15 | 16 | ```r 17 | install.packages("sourcetools") 18 | ``` 19 | 20 | Or, you can install the development version from GitHub with: 21 | 22 | 23 | ```r 24 | devtools::install_github("kevinushey/sourcetools") 25 | ``` 26 | 27 | ## Reading 28 | 29 | `sourcetools` comes with a couple fast functions for reading 30 | files into `R`. 31 | 32 | Use `read()` and `read_lines()` to quickly read a file into 33 | `R` as character vectors. `read_lines()` handles both 34 | Windows style `\r\n` line endings, as well as Unix-style 35 | `\n` endings. 
Performance is on par with the readers 36 | provided by the 37 | [readr](https://cran.r-project.org/package=readr) package. 38 | 39 | 40 | ```r 41 | text <- replicate(10000, { 42 | paste(sample(letters, 200, TRUE), collapse = "") 43 | }) 44 | file <- tempfile() 45 | cat(text, file = file, sep = "\n") 46 | mb <- microbenchmark::microbenchmark(times = 10, 47 | base::readLines(file), 48 | readr::read_lines(file), 49 | sourcetools::read_lines(file) 50 | ) 51 | sm <- summary(mb) 52 | print(sm[c("expr", "mean", "median")], digits = 3) 53 | ``` 54 | 55 | ``` 56 | ## expr mean median 57 | ## 1 base::readLines(file) 17.29 16.22 58 | ## 2 readr::read_lines(file) 30.70 8.11 59 | ## 3 sourcetools::read_lines(file) 6.67 6.43 60 | ``` 61 | 62 | ```r 63 | unlink(file) 64 | ``` 65 | 66 | ## Tokenization 67 | 68 | `sourcetools` provides the `tokenize_string()` and 69 | `tokenize_file()` functions for generating a tokenized 70 | representation of R code. These produce 'raw' tokenized 71 | representations of the code, with each token's value as a 72 | string, and a recorded row, column, and type: 73 | 74 | 75 | ```r 76 | tokenize_string("if (x < 10) 20") 77 | ``` 78 | 79 | ``` 80 | ## value row column type 81 | ## 1 if 1 1 keyword 82 | ## 2 1 3 whitespace 83 | ## 3 ( 1 4 bracket 84 | ## 4 x 1 5 symbol 85 | ## 5 1 6 whitespace 86 | ## 6 < 1 7 operator 87 | ## 7 1 8 whitespace 88 | ## 8 10 1 9 number 89 | ## 9 ) 1 11 bracket 90 | ## 10 1 12 whitespace 91 | ## 11 20 1 13 number 92 | ``` 93 | -------------------------------------------------------------------------------- /TODO.md: -------------------------------------------------------------------------------- 1 | Parser 2 | ====== 3 | 4 | - `::` and `:::` are only permitted within certain contexts; the parser is currently permissive about where these tokens are found. 5 | 6 | - Equality operators (`<`, `<=`, `>`, `>=`, `=`, `!=`) can only occur once within the same level of an expression. 
7 | 8 | - `->` and `->>` need to be translated into `<-` and `<<-` when generating the R parse tree. 9 | 10 | 11 | 12 | Syntax Validator 13 | ================ 14 | 15 | Remove it? It really just tries to check for parse errors but the parser itself is equipped to do that. 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /benchmark/benchmark-parser.R: -------------------------------------------------------------------------------- 1 | library(sourcetools) 2 | library(microbenchmark) 3 | 4 | files <- list.files("R", full.names = TRUE) 5 | for (file in files) { 6 | 7 | mb <- microbenchmark( 8 | R = base::parse(file, keep.source = FALSE), 9 | ST = sourcetools:::parse_file(file) 10 | ) 11 | 12 | print(mb) 13 | 14 | contents <- sourcetools:::read(file) 15 | 16 | mb <- microbenchmark( 17 | R = base::parse(text = contents, keep.source = FALSE), 18 | ST = sourcetools:::parse_string(contents) 19 | ) 20 | 21 | sourcetools:::check_parse(contents) 22 | 23 | } 24 | -------------------------------------------------------------------------------- /benchmark/benchmark-read.R: -------------------------------------------------------------------------------- 1 | library(sourcetools) 2 | library(microbenchmark) 3 | 4 | file <- tempfile() 5 | 6 | n <- 1024 7 | junk <- replicate(1E4, { 8 | paste(sample(letters, n, TRUE), collapse = "") 9 | }) 10 | writeLines(junk, con = file) 11 | 12 | stopifnot(identical( 13 | read(file), 14 | readChar(file, file.info(file)$size, TRUE) 15 | )) 16 | 17 | stopifnot(identical( 18 | readLines(file), 19 | read_lines(file) 20 | )) 21 | 22 | # read a file into a string 23 | mb <- microbenchmark( 24 | sourcetools::read(file), 25 | base::readChar(file, file.info(file)$size, TRUE), 26 | readr::read_file(file) 27 | ) 28 | print(mb) 29 | 30 | # read a file, splitting on newline characters 31 | mb <- microbenchmark( 32 | sourcetools::read_lines(file), 33 | base::readLines(file), 34 | readr::read_lines(file, 
progress = FALSE) 35 | ) 36 | print(mb) 37 | 38 | unlink(file) 39 | -------------------------------------------------------------------------------- /benchmark/benchmark-tokenizer.R: -------------------------------------------------------------------------------- 1 | library(microbenchmark) 2 | library(sourcetools) 3 | 4 | # Obviously not fair to compare R's parser to a tokenizer but it 5 | # helps establish a baseline for the tokenizer + how much 'wiggle 6 | # room' we have in our parser 7 | file <- "R/sourcetools.R" 8 | microbenchmark( 9 | tokenize_file(file), 10 | parse(file, keep.source = FALSE) 11 | ) 12 | 13 | contents <- read(file) 14 | mb <- microbenchmark( 15 | tokenize_string(contents), 16 | parse(text = contents, keep.source = FALSE) 17 | ) 18 | print(mb) 19 | -------------------------------------------------------------------------------- /configure.R: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env Rscript 2 | knitr::render_markdown(strict = FALSE) 3 | knitr::knit("README.Rmd", output = "README.md") 4 | tools:::package_native_routine_registration_skeleton(".", "src/sourcetools-init.c", character_only = FALSE) 5 | -------------------------------------------------------------------------------- /inst/include/sourcetools.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCE_TOOLS_H 2 | #define SOURCE_TOOLS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | #endif 17 | -------------------------------------------------------------------------------- /inst/include/sourcetools/collection/Position.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_COLLECTION_POSITION_H 2 | #define SOURCETOOLS_COLLECTION_POSITION_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 
| namespace collections { 11 | 12 | struct Position 13 | { 14 | Position() 15 | : row(0), column(0) 16 | { 17 | } 18 | 19 | Position(index_type row, index_type column) 20 | : row(row), column(column) 21 | { 22 | } 23 | 24 | friend std::ostream& operator<<(std::ostream& os, 25 | const Position& position) 26 | { 27 | os << position.row << ":" << position.column; 28 | return os; 29 | } 30 | 31 | friend bool operator <(const Position& lhs, const Position& rhs) 32 | { 33 | return 34 | lhs.row < rhs.row || 35 | (lhs.row == rhs.row && lhs.column < rhs.column); 36 | } 37 | 38 | friend bool operator <=(const Position& lhs, const Position& rhs) 39 | { 40 | return 41 | lhs.row < rhs.row || 42 | (lhs.row == rhs.row && lhs.column <= rhs.column); 43 | } 44 | 45 | friend bool operator ==(const Position& lhs, const Position& rhs) 46 | { 47 | return 48 | lhs.row == rhs.row && 49 | lhs.column == rhs.column; 50 | } 51 | 52 | friend bool operator >(const Position& lhs, const Position& rhs) 53 | { 54 | return 55 | lhs.row > rhs.row || 56 | (lhs.row == rhs.row && lhs.column > rhs.column); 57 | } 58 | 59 | friend bool operator >=(const Position& lhs, const Position& rhs) 60 | { 61 | return 62 | lhs.row > rhs.row || 63 | (lhs.row == rhs.row && lhs.column >= rhs.column); 64 | } 65 | 66 | friend Position operator +(const Position& lhs, index_type rhs) 67 | { 68 | return Position(lhs.row, lhs.column + rhs); 69 | } 70 | 71 | index_type row; 72 | index_type column; 73 | 74 | }; 75 | 76 | } // namespace collections 77 | } // namespace sourcetools 78 | 79 | #endif /* SOURCETOOLS_COLLECTION_POSITION_H */ 80 | -------------------------------------------------------------------------------- /inst/include/sourcetools/collection/Range.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_COLLECTION_RANGE_H 2 | #define SOURCETOOLS_COLLECTION_RANGE_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace collections { 9 | 10 | 
// A span between two Positions, stored as an inclusive [start, end] pair;
// ordering/overlap semantics are whatever the caller imposes.
class Range
{
public:
  Range(const Position& start, const Position& end)
    : start_(start), end_(end)
  {
  }

  // Stream as "[start-end]".
  friend std::ostream& operator <<(std::ostream& os, const Range& range)
  {
    os << "[" << range.start() << "-" << range.end() << "]";
    return os;
  }

  // NOTE(review): accessors return by value; the top-level 'const' on the
  // return type has no effect (returning 'const Position&' would avoid
  // the copy).
  const Position start() const { return start_; }
  const Position end() const { return end_; }

private:
  Position start_;
  Position end_;
};
} // namespace collections
} // namespace sourcetools

#endif /* SOURCETOOLS_COLLECTION_RANGE_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/collection/collection.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COLLECTION_COLLECTION_H
#define SOURCETOOLS_COLLECTION_COLLECTION_H

// Convenience header for the collection module (Position + Range).
// NOTE(review): #include targets stripped in this snapshot.
#include
#include

#endif /* SOURCETOOLS_COLLECTION_COLLECTION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/completion/CodeCompletion.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COMPLETION_CODE_COMPLETION_H
#define SOURCETOOLS_COMPLETION_CODE_COMPLETION_H

// NOTE(review): #include targets stripped in this snapshot.
#include
#include

#include

namespace sourcetools {
namespace completion {

// Kind of a completion candidate; currently only a placeholder value.
enum CompletionType
{
  CompletionTypeUnknown
};

// A single completion candidate: the text to insert plus its kind.
// (No accessors yet -- the completion engine below is a stub.)
class Completion
{
public:
  Completion(const std::string& value, CompletionType type)
    : value_(value), type_(type)
  {
  }

private:
  std::string value_;
  CompletionType type_;
};

// Compute completion candidates for 'code' (length 'n') at 'position'.
// Currently a stub that always returns an empty collection; the TODO
// below sketches the intended pipeline.
// NOTE(review): the template argument of std::vector was stripped in
// this snapshot (presumably std::vector<Completion>) -- confirm upstream.
std::vector completions(const char* code,
                        index_type n,
                        const collections::Position& position)
{
  std::vector completions;

  // TODO:
  //
  // 1) produce parse tree
  // 2) get node at position (note: token
immediately before position?)
  // 3) figure out completion context type
  //    ('$', '@', file, identifier, special context, etc)
  // 4) dispatch to appropriate completer for context
  // 5) return completions

  return completions;
}

} // namespace completion
} // namespace sourcetools

#endif /* SOURCETOOLS_COMPLETION_CODE_COMPLETION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/completion/completion.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_COMPLETION_COMPLETION_H
#define SOURCETOOLS_COMPLETION_COMPLETION_H

// Module header: currently just CodeCompletion.h.
// NOTE(review): #include target stripped in this snapshot.
#include

#endif /* SOURCETOOLS_COMPLETION_COMPLETION_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/core/config.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_CONFIG_H
#define SOURCETOOLS_CORE_CONFIG_H

namespace sourcetools {

// Integer type used for all offsets, sizes and indices throughout the
// library; overridable at compile time by defining
// SOURCETOOLS_CONFIG_INDEX_TYPE before inclusion. Defaults to 'int'.
#ifndef SOURCETOOLS_CONFIG_INDEX_TYPE
# define SOURCETOOLS_CONFIG_INDEX_TYPE int
#endif

typedef SOURCETOOLS_CONFIG_INDEX_TYPE index_type;

} // namespace sourcetools

#endif /* SOURCETOOLS_CORE_CONFIG_H */

--------------------------------------------------------------------------------
/inst/include/sourcetools/core/core.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_CORE_H
#define SOURCETOOLS_CORE_CORE_H

// Module header for the core utilities (config, macros, util).
// NOTE(review): #include targets stripped in this snapshot.
#include
#include
#include

#endif /* SOURCETOOLS_CORE_CORE_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/core/macros.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CORE_MACROS_H
#define SOURCETOOLS_CORE_MACROS_H
| 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | /* Utility */ 11 | #ifdef __GNUC__ 12 | # define LIKELY(x) __builtin_expect(!!(x), 1) 13 | # define UNLIKELY(x) __builtin_expect(!!(x), 0) 14 | #else 15 | # define LIKELY(x) x 16 | # define UNLIKELY(x) x 17 | #endif 18 | 19 | #define SOURCE_TOOLS_CHECK_MASK(__SELF__, __MASK__) \ 20 | ((__MASK__ & __SELF__) == __MASK__) 21 | 22 | #define SOURCE_TOOLS_LOWER_BITS(__VALUE__, __BITS__) \ 23 | (((1 << __BITS__) - 1) & __VALUE__) 24 | 25 | #define SOURCE_TOOLS_PASTE(__X__, __Y__) __X__ ## __Y__ 26 | #define SOURCE_TOOLS_STRINGIFY(__X__) #__X__ 27 | 28 | /* Logging */ 29 | namespace sourcetools { 30 | namespace debug { 31 | 32 | inline std::string shortFilePath(const std::string& filePath) 33 | { 34 | std::string::size_type index = filePath.find_last_of("/"); 35 | if (index != std::string::npos) 36 | return filePath.substr(index + 1); 37 | return filePath; 38 | } 39 | 40 | inline std::string debugPosition(const char* filePath, int line) 41 | { 42 | static const int N = 1024; 43 | char buffer[N]; 44 | std::string shortPath = shortFilePath(filePath); 45 | if (shortPath.size() > N / 2) 46 | shortPath = shortPath.substr(0, N / 2); 47 | std::snprintf(buffer, N, "[%s:%4i]", shortPath.c_str(), line); 48 | return buffer; 49 | } 50 | 51 | } // namespace debug 52 | } // namespace sourcetools 53 | 54 | // Flip on/off as necessary 55 | #ifdef SOURCETOOLS_ENABLE_DEBUG_LOGGING 56 | 57 | #include 58 | 59 | #define DEBUG(__X__) \ 60 | std::cerr << ::sourcetools::debug::debugPosition(__FILE__, __LINE__) \ 61 | << ": " << __X__ << ::std::endl; 62 | #define DEBUG_BLOCK(x) 63 | 64 | #else 65 | 66 | #define DEBUG(x) 67 | #define DEBUG_BLOCK(x) if (false) 68 | 69 | #endif 70 | 71 | #endif /* SOURCETOOLS_CORE_MACROS_H */ 72 | -------------------------------------------------------------------------------- /inst/include/sourcetools/core/util.h: -------------------------------------------------------------------------------- 1 | 
#ifndef SOURCETOOLS_CORE_UTIL_H
#define SOURCETOOLS_CORE_UTIL_H

// NOTE(review): #include targets stripped in this snapshot -- confirm upstream.
#include
#include
#include
#include

#include

namespace sourcetools {
namespace detail {

// Inherit (privately) to delete copy construction/assignment, C++98-style:
// the copy operations are declared private and never defined.
class noncopyable
{
protected:
  noncopyable() {}
  ~noncopyable() {}

private:
  noncopyable(const noncopyable&);
  noncopyable& operator=(const noncopyable&);
};

} // namespace detail
typedef detail::noncopyable noncopyable;

// Minimal single-owner smart pointer (deletes with 'delete').
// NOTE(review): the template parameter list (<typename T>) was stripped in
// this snapshot on both class templates below -- confirm upstream.
template
class scoped_ptr : noncopyable
{
public:
  explicit scoped_ptr(T* pData) : pData_(pData) {}
  T& operator*() const { return *pData_; }
  T* operator->() const { return pData_; }
  operator T*() const { return pData_; }
  ~scoped_ptr() { delete pData_; }
private:
  T* pData_;
};

// Array variant of scoped_ptr (deletes with 'delete[]').
template
class scoped_array : noncopyable
{
public:
  explicit scoped_array(T* pData) : pData_(pData) {}
  T& operator*() const { return *pData_; }
  T* operator->() const { return pData_; }
  operator T*() const { return pData_; }
  ~scoped_array() { delete[] pData_; }
private:
  T* pData_;
};

namespace utils {

// Character predicates used by the tokenizer. These are deliberately
// hand-rolled (rather than <cctype>) so behavior is locale-independent.
inline bool isWhitespace(char ch)
{
  return
    ch == ' ' ||
    ch == '\f' ||
    ch == '\r' ||
    ch == '\n' ||
    ch == '\t' ||
    ch == '\v';
}

// Count leading whitespace bytes of 'data' into *pBytes; returns true when
// at least one whitespace byte was consumed.
// NOTE(review): relies on 'data' being NUL-terminated (isWhitespace('\0')
// is false, which stops the scan). Template parameter list stripped in
// this snapshot.
template
inline bool countWhitespaceBytes(const char* data,
                                 T* pBytes)
{
  T bytes = 0;
  while (isWhitespace(*data)) {
    ++data;
    ++bytes;
  }

  *pBytes = bytes;
  return bytes != 0;
}

inline bool isDigit(char ch)
{
  return
    (ch >= '0' && ch <= '9');
}

inline bool isAlphabetic(char ch)
{
  return
    (ch >= 'a' && ch <= 'z') ||
    (ch >= 'A' && ch <= 'Z');
}

inline bool isAlphaNumeric(char ch)
{
  return
    (ch >= 'a' && ch <= 'z') ||
    (ch >= 'A' && ch <= 'Z') ||
    (ch >= '0' && ch <= '9');
}

inline bool isHexDigit(char ch)
{
  return
    (ch >= '0' && ch <= '9') ||
    (ch >= 'a' && ch <= 'f') ||
    (ch >= 'A' && ch <= 'F');
}

// R symbols may start with a letter, '.', or any non-ASCII byte
// (static_cast<signed char>(ch) < 0 catches multibyte UTF-8 lead/continuation
// bytes when 'char' is signed -- presumably the intent here).
inline bool isValidForStartOfRSymbol(char ch)
{
  return
    isAlphabetic(ch) ||
    ch == '.' ||
    static_cast(ch) < 0;
}

// Subsequent R symbol characters additionally allow digits and '_'.
inline bool isValidForRSymbol(char ch)
{
  return
    isAlphaNumeric(ch) ||
    ch == '.' ||
    ch == '_' ||
    static_cast(ch) < 0;
}

// Render control characters as printable escapes for diagnostics output.
inline std::string escape(char ch)
{
  switch (ch) {
  case '\r':
    return "\\r";
  case '\n':
    return "\\n";
  case '\t':
    return "\\t";
  default:
    return std::string(1, ch);
  }
}

// size() as index_type, avoiding signed/unsigned comparison warnings at
// call sites. Template parameter list stripped in this snapshot.
template
index_type size(const T& object)
{
  return static_cast(object.size());
}

} // namespace utils
} // namespace sourcetools

#endif /* SOURCETOOLS_CORE_UTIL_H */
--------------------------------------------------------------------------------
/inst/include/sourcetools/cursor/TextCursor.h:
--------------------------------------------------------------------------------
#ifndef SOURCETOOLS_CURSOR_TEXT_CURSOR_H
#define SOURCETOOLS_CURSOR_TEXT_CURSOR_H

// NOTE(review): #include targets stripped in this snapshot.
#include
#include

namespace sourcetools {
namespace cursors {

// Cursor over a raw character buffer that tracks its (row, column)
// position as it advances.
class TextCursor
{
public:

  TextCursor(const char* text, index_type n)
    : text_(text),
      n_(n),
      offset_(0),
      position_(0, 0)
  {
  }

  // Look ahead without moving; returns '\0' past the end of the buffer.
  char peek(index_type offset = 0) const
  {
    index_type index = offset_ + offset;
    if (UNLIKELY(index >= n_))
      return '\0';
    return text_[index];
  }

  // Advance 'times' characters, updating row/column ('\n' starts a new row).
  void advance(index_type times = 1)
  {
    for (index_type i = 0; i < times; ++i) {
      if (peek() == '\n') {
        ++position_.row;
        position_.column = 0;
      } else {
        ++position_.column;
      }
++offset_; 40 | } 41 | } 42 | 43 | operator const char*() const { return text_ + offset_; } 44 | 45 | index_type offset() const { return offset_; } 46 | 47 | const collections::Position& position() const { return position_; } 48 | index_type row() const { return position_.row; } 49 | index_type column() const { return position_.column; } 50 | 51 | const char* begin() const { return text_; } 52 | const char* end() const { return text_ + n_; } 53 | 54 | private: 55 | const char* text_; 56 | index_type n_; 57 | index_type offset_; 58 | collections::Position position_; 59 | }; 60 | 61 | } // namespace cursors 62 | } // namespace sourcetools 63 | 64 | #endif /* SOURCETOOLS_CURSOR_TEXT_CURSOR_H */ 65 | -------------------------------------------------------------------------------- /inst/include/sourcetools/cursor/TokenCursor.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_CURSOR_TOKEN_CURSOR_H 2 | #define SOURCETOOLS_CURSOR_TOKEN_CURSOR_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | 14 | namespace sourcetools { 15 | namespace cursors { 16 | 17 | class TokenCursor { 18 | 19 | private: 20 | typedef collections::Position Position; 21 | typedef tokens::Token Token; 22 | 23 | public: 24 | 25 | TokenCursor(const std::vector& tokens) 26 | : tokens_(tokens), 27 | offset_(0), 28 | n_(tokens.size()), 29 | noSuchToken_(tokens::END) 30 | {} 31 | 32 | bool moveToNextToken() 33 | { 34 | if (UNLIKELY(offset_ >= n_ - 1)) 35 | return false; 36 | 37 | ++offset_; 38 | return true; 39 | } 40 | 41 | bool moveToNextSignificantToken() 42 | { 43 | if (!moveToNextToken()) 44 | return false; 45 | 46 | if (!fwdOverWhitespaceAndComments()) 47 | return false; 48 | 49 | return true; 50 | } 51 | 52 | bool moveToPreviousToken() 53 | { 54 | if (UNLIKELY(offset_ == 0)) 55 | return false; 56 | 57 | --offset_; 58 | return true; 59 | } 60 | 61 | bool moveToPreviousSignificantToken() 
62 | { 63 | if (!moveToPreviousToken()) 64 | return false; 65 | 66 | if (!bwdOverWhitespaceAndComments()) 67 | return false; 68 | 69 | return true; 70 | } 71 | 72 | const Token& peekFwd(index_type offset = 1) const 73 | { 74 | index_type index = offset_ + offset; 75 | if (UNLIKELY(index >= n_)) 76 | return noSuchToken_; 77 | 78 | return tokens_[index]; 79 | } 80 | 81 | const Token& peekBwd(index_type offset = 1) const 82 | { 83 | if (UNLIKELY(offset > offset_)) 84 | return noSuchToken_; 85 | 86 | index_type index = offset_ - offset; 87 | return tokens_[index]; 88 | } 89 | 90 | const Token& currentToken() const 91 | { 92 | if (UNLIKELY(offset_ >= n_)) 93 | return noSuchToken_; 94 | return tokens_[offset_]; 95 | } 96 | 97 | operator const Token&() const { return currentToken(); } 98 | 99 | bool fwdOverWhitespace() 100 | { 101 | while (isType(tokens::WHITESPACE)) 102 | if (!moveToNextToken()) 103 | return false; 104 | return true; 105 | } 106 | 107 | bool bwdOverWhitespace() 108 | { 109 | while (isType(tokens::WHITESPACE)) 110 | if (!moveToPreviousToken()) 111 | return false; 112 | return true; 113 | } 114 | 115 | bool fwdOverComments() 116 | { 117 | while (isType(tokens::COMMENT)) 118 | if (!moveToNextToken()) 119 | return false; 120 | return true; 121 | } 122 | 123 | bool bwdOverComments() 124 | { 125 | while (isType(tokens::COMMENT)) 126 | if (!moveToPreviousToken()) 127 | return false; 128 | return true; 129 | } 130 | 131 | bool fwdOverWhitespaceAndComments() 132 | { 133 | while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE)) 134 | if (!moveToNextToken()) 135 | return false; 136 | return true; 137 | } 138 | 139 | bool bwdOverWhitespaceAndComments() 140 | { 141 | while (isType(tokens::COMMENT) || isType(tokens::WHITESPACE)) 142 | if (!moveToPreviousToken()) 143 | return false; 144 | return true; 145 | } 146 | 147 | const Token& nextSignificantToken(index_type times = 1) const 148 | { 149 | TokenCursor clone(*this); 150 | for (index_type i = 0; i < times; 
++i) 151 | clone.moveToNextSignificantToken(); 152 | return clone; 153 | } 154 | 155 | const Token& previousSignificantToken(index_type times = 1) const 156 | { 157 | TokenCursor clone(*this); 158 | for (index_type i = 0; i < times; ++i) 159 | clone.moveToPreviousSignificantToken(); 160 | return clone; 161 | } 162 | 163 | bool moveToPosition(index_type row, index_type column) 164 | { 165 | return moveToPosition(Position(row, column)); 166 | } 167 | 168 | bool moveToPosition(const Position& target) 169 | { 170 | if (UNLIKELY(n_ == 0)) 171 | return false; 172 | 173 | if (UNLIKELY(tokens_[n_ - 1].position() <= target)) 174 | { 175 | offset_ = n_ - 1; 176 | return true; 177 | } 178 | 179 | index_type start = 0; 180 | index_type end = n_; 181 | 182 | index_type offset = 0; 183 | while (true) 184 | { 185 | offset = (start + end) / 2; 186 | const Position& current = tokens_[offset].position(); 187 | 188 | if (current == target || start == end) 189 | break; 190 | else if (current < target) 191 | start = offset + 1; 192 | else 193 | end = offset - 1; 194 | } 195 | 196 | offset_ = offset; 197 | return true; 198 | } 199 | 200 | template 201 | bool findFwd(F f) 202 | { 203 | do { 204 | if (f(this)) 205 | return true; 206 | } while (moveToNextToken()); 207 | 208 | return false; 209 | } 210 | 211 | template 212 | bool findBwd(F f) 213 | { 214 | do { 215 | if (f(this)) 216 | return true; 217 | } while (moveToPreviousToken()); 218 | 219 | return false; 220 | } 221 | 222 | bool findFwd(const char* contents) 223 | { 224 | return findFwd(std::string(contents, std::strlen(contents))); 225 | } 226 | 227 | bool findFwd(const std::string& contents) 228 | { 229 | do { 230 | if (currentToken().contentsEqual(contents)) 231 | return true; 232 | } while (moveToNextToken()); 233 | 234 | return false; 235 | } 236 | 237 | bool findBwd(const char* contents) 238 | { 239 | return findBwd(std::string(contents, std::strlen(contents))); 240 | } 241 | 242 | bool findBwd(const std::string& contents) 243 
| { 244 | do { 245 | if (currentToken().contentsEqual(contents)) 246 | return true; 247 | } while (moveToPreviousToken()); 248 | 249 | return false; 250 | } 251 | 252 | bool fwdToMatchingBracket() 253 | { 254 | using namespace tokens; 255 | if (!isLeftBracket(currentToken())) 256 | return false; 257 | 258 | TokenType lhs = currentToken().type(); 259 | TokenType rhs = complement(lhs); 260 | index_type balance = 1; 261 | 262 | while (moveToNextSignificantToken()) 263 | { 264 | TokenType type = currentToken().type(); 265 | balance += type == lhs; 266 | balance -= type == rhs; 267 | if (balance == 0) return true; 268 | } 269 | 270 | return false; 271 | } 272 | 273 | bool bwdToMatchingBracket() 274 | { 275 | using namespace tokens; 276 | if (!isRightBracket(currentToken())) 277 | return false; 278 | 279 | TokenType lhs = currentToken().type(); 280 | TokenType rhs = complement(lhs); 281 | index_type balance = 1; 282 | 283 | while (moveToPreviousSignificantToken()) 284 | { 285 | TokenType type = currentToken().type(); 286 | balance += type == lhs; 287 | balance -= type == rhs; 288 | if (balance == 0) return true; 289 | } 290 | 291 | return false; 292 | } 293 | 294 | friend std::ostream& operator<<(std::ostream& os, const TokenCursor& cursor) 295 | { 296 | return os << toString(cursor.currentToken()); 297 | } 298 | 299 | tokens::TokenType type() const { return currentToken().type(); } 300 | bool isType(tokens::TokenType type) const { return currentToken().isType(type); } 301 | collections::Position position() const { return currentToken().position(); } 302 | index_type offset() const { return offset_; } 303 | index_type row() const { return currentToken().row(); } 304 | index_type column() const { return currentToken().column(); } 305 | 306 | 307 | private: 308 | 309 | const std::vector& tokens_; 310 | index_type offset_; 311 | index_type n_; 312 | Token noSuchToken_; 313 | 314 | }; 315 | 316 | } // namespace cursors 317 | 318 | inline std::string toString(const 
cursors::TokenCursor& cursor) 319 | { 320 | return toString(cursor.currentToken()); 321 | } 322 | 323 | } // namespace sourcetools 324 | 325 | #endif /* SOURCETOOLS_CURSOR_TOKEN_CURSOR_H */ 326 | -------------------------------------------------------------------------------- /inst/include/sourcetools/cursor/cursor.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_CURSOR_CURSOR_H 2 | #define SOURCETOOLS_CURSOR_CURSOR_H 3 | 4 | #include 5 | #include 6 | 7 | #endif /* SOURCETOOLS_CURSOR_CURSOR_H */ 8 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/Checkers.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_CHECKERS_H 2 | #define SOURCETOOLS_DIAGNOSTICS_CHECKERS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace diagnostics { 14 | namespace checkers { 15 | 16 | class CheckerBase 17 | { 18 | public: 19 | typedef tokens::Token Token; 20 | typedef tokens::TokenType TokenType; 21 | typedef parser::ParseNode ParseNode; 22 | 23 | virtual void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) = 0; 24 | virtual ~CheckerBase() {} 25 | }; 26 | 27 | /** 28 | * Warn about code of the form: 29 | * 30 | * x == NULL 31 | * 32 | * The user likely intended to check if a value was NULL, 33 | * and in such a case should use `is.null()` instead. 
34 | */ 35 | class ComparisonWithNullChecker : public CheckerBase 36 | { 37 | public: 38 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 39 | { 40 | const Token& token = pNode->token(); 41 | bool isEquals = 42 | token.isType(tokens::OPERATOR_EQUAL) || 43 | token.isType(tokens::OPERATOR_NOT_EQUAL); 44 | 45 | if (!isEquals) 46 | return; 47 | 48 | if (pNode->children().size() != 2) 49 | return; 50 | 51 | ParseNode* pLhs = pNode->children()[0]; 52 | ParseNode* pRhs = pNode->children()[1]; 53 | 54 | if (pLhs->token().isType(tokens::KEYWORD_NULL) || 55 | pRhs->token().isType(tokens::KEYWORD_NULL)) 56 | { 57 | pDiagnostics->addWarning( 58 | "Use 'is.null()' to check if an object is NULL", 59 | pNode->range()); 60 | } 61 | } 62 | }; 63 | 64 | /** 65 | * Warn about code of the form: 66 | * 67 | * if (x = 1) { ... } 68 | * 69 | * The user likely intended to write 'if (x == 1)'. 70 | */ 71 | class AssignmentInIfChecker : public CheckerBase 72 | { 73 | public: 74 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 75 | { 76 | if (!pNode->token().isType(tokens::KEYWORD_IF)) 77 | return; 78 | 79 | if (pNode->children().size() < 1) 80 | return; 81 | 82 | ParseNode* pCondition = pNode->children()[0]; 83 | if (!pCondition->token().isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 84 | return; 85 | 86 | pDiagnostics->addWarning( 87 | "Using '=' for assignment in 'if' condition", 88 | pCondition->range()); 89 | 90 | } 91 | }; 92 | 93 | /** 94 | * Warn about vectorized '&' or '|' used in 95 | * 'if' statements. The scalar forms, '&&' and '||', 96 | * are likely preferred. 
97 | */ 98 | class ScalarOpsInIfChecker : public CheckerBase 99 | { 100 | public: 101 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 102 | { 103 | if (!pNode->token().isType(tokens::KEYWORD_IF)) 104 | return; 105 | 106 | if (pNode->children().size() < 1) 107 | return; 108 | 109 | ParseNode* pCondition = pNode->children()[0]; 110 | const Token& token = pCondition->token(); 111 | if (token.isType(tokens::OPERATOR_AND_VECTOR)) 112 | { 113 | pDiagnostics->addInfo( 114 | "Prefer '&&' to '&' in 'if' statement condition", 115 | pCondition->range()); 116 | } 117 | else if (token.isType(tokens::OPERATOR_OR_VECTOR)) 118 | { 119 | pDiagnostics->addInfo( 120 | "Prefer '||' to '|' in 'if' statement condition", 121 | pCondition->range()); 122 | } 123 | } 124 | }; 125 | 126 | /** 127 | * Warn about unused computations, e.g. 128 | * 129 | * foo <- function(x) { 130 | * x < 1 131 | * print(x) 132 | * } 133 | * 134 | * For example, in the above code, it's possible that the user 135 | * intended to assign 1 to x, or use that result elsewhere. 136 | * 137 | * Don't warn if the expression shows up as the last statement 138 | * within a parent function's body. 
139 | */ 140 | class UnusedResultChecker : public CheckerBase 141 | { 142 | public: 143 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 144 | { 145 | if (pNode->parent() == NULL) 146 | return; 147 | 148 | const Token& parentToken = pNode->parent()->token(); 149 | bool isTopLevelContext = 150 | parentToken.isType(tokens::ROOT) || 151 | parentToken.isType(tokens::LBRACE); 152 | 153 | if (!isTopLevelContext) 154 | return; 155 | 156 | if (parentToken.isType(tokens::LBRACE)) 157 | { 158 | const std::vector& siblings = pNode->parent()->children(); 159 | if (pNode == siblings[siblings.size() - 1]) 160 | return; 161 | } 162 | 163 | const Token& token = pNode->token(); 164 | if (!tokens::isOperator(token)) 165 | return; 166 | 167 | if (tokens::isAssignmentOperator(token)) 168 | return; 169 | 170 | 171 | pDiagnostics->addInfo( 172 | "result of computation is not used", 173 | pNode->range()); 174 | } 175 | }; 176 | 177 | class NoSymbolInScopeChecker : public CheckerBase 178 | { 179 | public: 180 | 181 | NoSymbolInScopeChecker() 182 | { 183 | stack_.push_back(Context(0)); 184 | objects_ = r::objectsOnSearchPath(); 185 | } 186 | 187 | void apply(const ParseNode* pNode, Diagnostics* pDiagnostics, index_type depth) 188 | { 189 | using namespace tokens; 190 | const Token& token = pNode->token(); 191 | 192 | // If we've left the last active scope, pop. 193 | if (depth < current().depth()) 194 | pop(); 195 | 196 | // Assignments update the current scope. 197 | if (token.isType(OPERATOR_ASSIGN_LEFT) || 198 | token.isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 199 | { 200 | const ParseNode* pChild = pNode->children()[0]; 201 | const Token& symbol = pChild->token(); 202 | if (symbol.isType(SYMBOL) || symbol.isType(STRING)) 203 | add(symbol); 204 | } 205 | 206 | // Check if a symbol has a definition in scope. 
207 | if (token.isType(SYMBOL)) 208 | check(token, pDiagnostics); 209 | 210 | // If we encounter a function definition, create a new scope 211 | // and make the function argument names present in that scope. 212 | if (token.isType(KEYWORD_FUNCTION)) 213 | push(pNode, depth); 214 | } 215 | 216 | private: 217 | 218 | class Context 219 | { 220 | public: 221 | explicit Context(index_type depth) 222 | : depth_(depth) 223 | { 224 | } 225 | 226 | void add(const Token& token) 227 | { 228 | values_.insert(token.contents()); 229 | } 230 | 231 | bool contains(const std::string& contents) const 232 | { 233 | return values_.count(contents); 234 | } 235 | 236 | index_type depth() const 237 | { 238 | return depth_; 239 | } 240 | 241 | private: 242 | std::set values_; 243 | index_type depth_; 244 | }; 245 | 246 | Context& current() 247 | { 248 | return stack_[stack_.size() - 1]; 249 | } 250 | 251 | void push(const ParseNode* pNode, index_type depth) 252 | { 253 | stack_.push_back(Context(depth)); 254 | 255 | ParseNode* pFormals = pNode->children()[0]; 256 | const std::vector& children = pFormals->children(); 257 | for (std::vector::const_iterator it = children.begin(); 258 | it != children.end(); 259 | ++it) 260 | { 261 | const Token& token = (*it)->token(); 262 | if (token.isType(tokens::SYMBOL)) 263 | add(token); 264 | else if (token.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 265 | { 266 | const Token& lhs = (*it)->children()[0]->token(); 267 | if (lhs.isType(tokens::SYMBOL)) 268 | add(lhs); 269 | } 270 | } 271 | } 272 | 273 | void pop() 274 | { 275 | stack_.pop_back(); 276 | } 277 | 278 | void add(const Token& token) 279 | { 280 | current().add(token); 281 | } 282 | 283 | void check(const Token& token, Diagnostics* pDiagnostics) 284 | { 285 | if (!token.isType(tokens::SYMBOL)) 286 | return; 287 | 288 | std::string contents = token.contents(); 289 | for (std::vector::const_iterator it = stack_.begin(); 290 | it != stack_.end(); 291 | ++it) 292 | { 293 | if 
(it->contains(contents)) 294 | { 295 | return; 296 | } 297 | } 298 | 299 | if (objects_.count(token.contents())) 300 | return; 301 | 302 | collections::Range range(token.position(), token.position() + token.size()); 303 | pDiagnostics->addWarning( 304 | "use of undefined symbol '" + token.contents() + "'", 305 | range); 306 | } 307 | 308 | std::vector stack_; 309 | std::set objects_; 310 | 311 | }; 312 | 313 | } // namespace checkers 314 | } // namespace diagnostics 315 | } // namespace sourcetools 316 | 317 | #endif /* SOURCETOOLS_DIAGNOSTICS_CHECKERS_H */ 318 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/Diagnostic.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace diagnostics { 12 | 13 | enum DiagnosticType 14 | { 15 | DIAGNOSTIC_ERROR, 16 | DIAGNOSTIC_WARNING, 17 | DIAGNOSTIC_INFO, 18 | DIAGNOSTIC_STYLE 19 | }; 20 | 21 | class Diagnostic 22 | { 23 | public: 24 | Diagnostic(DiagnosticType type, 25 | const std::string& message, 26 | const collections::Range& range) 27 | : type_(type), message_(message), range_(range) 28 | { 29 | } 30 | 31 | const std::string message() const { return message_; } 32 | DiagnosticType type() const { return type_; } 33 | collections::Range range() const { return range_; } 34 | collections::Position start() const { return range_.start(); } 35 | collections::Position end() const { return range_.end(); } 36 | 37 | private: 38 | DiagnosticType type_; 39 | std::string message_; 40 | collections::Range range_; 41 | }; 42 | 43 | class Diagnostics 44 | { 45 | typedef collections::Range Range; 46 | 47 | public: 48 | 49 | void add(DiagnosticType type, const std::string& message, const Range& range) 50 | { 51 | 
diagnostics_.push_back(Diagnostic(type, message, range)); 52 | } 53 | 54 | void addError(const std::string& message, const Range& range) 55 | { 56 | add(DIAGNOSTIC_ERROR, message, range); 57 | } 58 | 59 | void addWarning(const std::string& message, const Range& range) 60 | { 61 | add(DIAGNOSTIC_WARNING, message, range); 62 | } 63 | 64 | void addInfo(const std::string& message, const Range& range) 65 | { 66 | add(DIAGNOSTIC_INFO, message, range); 67 | } 68 | 69 | operator const std::vector&() const { return diagnostics_; } 70 | 71 | private: 72 | std::vector diagnostics_; 73 | }; 74 | 75 | } // namespace diagnostics 76 | 77 | namespace r { 78 | 79 | inline SEXP create(diagnostics::DiagnosticType type) 80 | { 81 | using namespace diagnostics; 82 | 83 | switch (type) 84 | { 85 | case DIAGNOSTIC_ERROR: return Rf_mkString("error"); 86 | case DIAGNOSTIC_WARNING: return Rf_mkString("warning"); 87 | case DIAGNOSTIC_INFO: return Rf_mkString("info"); 88 | case DIAGNOSTIC_STYLE: return Rf_mkString("style"); 89 | } 90 | 91 | // happy compiler 92 | return Rf_mkString("error"); 93 | } 94 | 95 | inline SEXP create(const diagnostics::Diagnostic& diagnostic) 96 | { 97 | using namespace diagnostics; 98 | 99 | ListBuilder builder; 100 | 101 | builder.add("type", create(diagnostic.type())); 102 | builder.add("file", Rf_mkString("")); 103 | builder.add("line", Rf_ScalarInteger(diagnostic.start().row)); 104 | builder.add("column", Rf_ScalarInteger(diagnostic.start().column)); 105 | builder.add("message", r::createString(diagnostic.message())); 106 | 107 | return builder; 108 | } 109 | 110 | inline SEXP create(const std::vector& diagnostics) 111 | { 112 | using namespace diagnostics; 113 | 114 | Protect protect; 115 | index_type n = diagnostics.size(); 116 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n)); 117 | for (index_type i = 0; i < n; ++i) 118 | SET_VECTOR_ELT(resultSEXP, i, create(diagnostics[i])); 119 | return resultSEXP; 120 | } 121 | 122 | } // namespace r 123 | 124 | } 
// namespace sourcetools 125 | 126 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTIC_H */ 127 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/DiagnosticsSet.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace diagnostics { 9 | 10 | class DiagnosticsSet 11 | { 12 | typedef std::vector Checkers; 13 | typedef checkers::CheckerBase CheckerBase; 14 | typedef parser::ParseNode ParseNode; 15 | 16 | public: 17 | 18 | void add(CheckerBase* pChecker) 19 | { 20 | checkers_.push_back(pChecker); 21 | } 22 | 23 | const std::vector& run(const ParseNode* pNode) 24 | { 25 | runImpl(pNode); 26 | return diagnostics_; 27 | } 28 | 29 | void report() 30 | { 31 | const std::vector& diagnostics = diagnostics_; 32 | for (index_type i = 0; i < utils::size(diagnostics); ++i) 33 | { 34 | Diagnostic diagnostic = diagnostics[i]; 35 | std::cerr << diagnostic.range() << ": " 36 | << diagnostic.message() 37 | << std::endl; 38 | } 39 | } 40 | 41 | ~DiagnosticsSet() 42 | { 43 | for (Checkers::const_iterator it = checkers_.begin(); 44 | it != checkers_.end(); 45 | ++it) 46 | { 47 | delete *it; 48 | } 49 | } 50 | 51 | private: 52 | void runImpl(const ParseNode* pNode, index_type depth = 0) 53 | { 54 | for (Checkers::iterator it = checkers_.begin(); 55 | it != checkers_.end(); 56 | ++it) 57 | { 58 | (*it)->apply(pNode, &diagnostics_, depth); 59 | } 60 | 61 | for (std::vector::const_iterator it = pNode->children().begin(); 62 | it != pNode->children().end(); 63 | ++it) 64 | { 65 | runImpl(*it, depth + 1); 66 | } 67 | } 68 | 69 | 70 | private: 71 | Checkers checkers_; 72 | Diagnostics diagnostics_; 73 | }; 74 | 75 | inline DiagnosticsSet* createDefaultDiagnosticsSet() 76 | { 77 | DiagnosticsSet* pSet = new DiagnosticsSet(); 
78 | pSet->add(new checkers::AssignmentInIfChecker); 79 | pSet->add(new checkers::ComparisonWithNullChecker); 80 | pSet->add(new checkers::ScalarOpsInIfChecker); 81 | pSet->add(new checkers::UnusedResultChecker); 82 | pSet->add(new checkers::NoSymbolInScopeChecker); 83 | return pSet; 84 | } 85 | 86 | } // namespace diagnostics 87 | } // namespace sourcetools 88 | 89 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_SET_H */ 90 | -------------------------------------------------------------------------------- /inst/include/sourcetools/diagnostics/diagnostics.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H 2 | #define SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_DIAGNOSTICS_DIAGNOSTICS_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/multibyte/multibyte.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_MULTIBYTE_MULTIBYTE_H 2 | #define SOURCETOOLS_MULTIBYTE_MULTIBYTE_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace multibyte { 9 | 10 | template 11 | inline bool countWhitespaceBytes(const char* data, 12 | T* pBytes) 13 | { 14 | wchar_t ch; 15 | T bytes = 0; 16 | const char* it = data; 17 | 18 | while (true) { 19 | 20 | int status = std::mbtowc(&ch, it, MB_CUR_MAX); 21 | if (status == 0) { 22 | break; 23 | } else if (status == -1) { 24 | break; 25 | } 26 | 27 | if (!std::iswspace(ch)) 28 | break; 29 | 30 | bytes += status; 31 | it += status; 32 | } 33 | 34 | *pBytes = bytes; 35 | return bytes != 0; 36 | } 37 | 38 | } // namespace multibyte 39 | } // namespace sourcetools 40 | 41 | #endif /* SOURCETOOLS_MULTIBYTE_MULTIBYTE_H */ 42 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseError.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_ERROR_H 2 | #define SOURCETOOLS_PARSE_PARSE_ERROR_H 3 | 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace parser { 9 | 10 | class ParseError 11 | { 12 | typedef collections::Position Position; 13 | typedef tokens::Token Token; 14 | 15 | Position start_; 16 | Position end_; 17 | std::string message_; 18 | 19 | public: 20 | 21 | ParseError(const tokens::Token& token, 22 | const std::string& message) 23 | : start_(token.position()), 24 | end_(token.position()), 25 | message_(message) 26 | { 27 | end_.column += token.end() - token.begin(); 28 | } 29 | 30 | ParseError(const Position& start, 31 | const Position& end, 32 | const std::string& message) 33 | : start_(start), 34 | end_(end), 35 | message_(message) 36 | { 37 | } 38 | 39 | explicit ParseError(const std::string& message) 40 | : start_(0, 0), 41 | end_(0, 0), 42 | message_(message) 43 | { 44 | } 45 | 46 | const Position& start() const { return start_; } 47 | const Position& end() const { return end_; } 48 | const std::string& message() const { return message_; } 49 | }; 50 | 51 | } // namespace parser 52 | } // namespace sourcetools 53 | 54 | #endif /* SOURCETOOLS_PARSE_PARSE_ERROR_H */ 55 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseNode.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_NODE_H 2 | #define SOURCETOOLS_PARSE_PARSE_NODE_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace parser { 12 | 13 | class ParseNode 14 | { 15 | public: 16 | typedef collections::Position Position; 17 | typedef collections::Range Range; 18 | typedef tokens::Token Token; 19 | typedef tokens::TokenType TokenType; 20 | 21 | private: 22 | Token token_; 23 | ParseNode* parent_; 24 | std::vector 
children_; 25 | 26 | Token begin_; 27 | Token end_; 28 | 29 | public: 30 | 31 | explicit ParseNode(const Token& token) 32 | : token_(token), parent_(NULL), 33 | begin_(token), end_(token) 34 | { 35 | } 36 | 37 | static ParseNode* create(const Token& token) 38 | { 39 | return new ParseNode(token); 40 | } 41 | 42 | static ParseNode* create(const TokenType& type) 43 | { 44 | static std::map tokens; 45 | if (!tokens.count(type)) 46 | tokens[type] = Token(type); 47 | 48 | const Token& token = tokens[type]; 49 | return new ParseNode(token); 50 | } 51 | 52 | ~ParseNode() 53 | { 54 | for (std::vector::const_iterator it = children_.begin(); 55 | it != children_.end(); 56 | ++it) 57 | { 58 | delete *it; 59 | } 60 | } 61 | 62 | void remove(const ParseNode* pNode) 63 | { 64 | children_.erase( 65 | std::remove(children_.begin(), children_.end(), pNode), 66 | children_.end()); 67 | } 68 | 69 | void add(ParseNode* pNode) 70 | { 71 | if (pNode->parent_ != NULL) 72 | pNode->parent_->remove(pNode); 73 | pNode->parent_ = this; 74 | 75 | const Token& begin = pNode->begin(); 76 | const Token& end = pNode->end(); 77 | if (begin.offset() != -1 && end.offset() != -1) 78 | { 79 | for (ParseNode* pParent = this; pParent != NULL; pParent = pParent->parent_) 80 | { 81 | if (begin.begin() < pParent->begin().begin()) 82 | pParent->setBegin(begin); 83 | if (end.end() > pParent->end().end()) 84 | pParent->setEnd(end); 85 | } 86 | } 87 | 88 | children_.push_back(pNode); 89 | } 90 | 91 | const Token& begin() const 92 | { 93 | return begin_; 94 | } 95 | 96 | void setBegin(const Token& begin) 97 | { 98 | for (ParseNode* pNode = this; pNode != NULL; pNode = pNode->parent_) 99 | if (begin.begin() < pNode->begin().begin()) 100 | pNode->begin_ = begin; 101 | } 102 | 103 | const Token& end() const 104 | { 105 | return end_; 106 | } 107 | 108 | void setEnd(const Token& end) 109 | { 110 | end_ = end; 111 | for (ParseNode* pNode = this; pNode != NULL; pNode = pNode->parent_) 112 | if (end.end() > 
pNode->end().end()) 113 | pNode->end_ = end; 114 | } 115 | 116 | void bounds(const char** begin, const char** end) 117 | { 118 | *begin = begin_.begin(); 119 | *end = end_.end(); 120 | } 121 | 122 | Range range() const 123 | { 124 | return Range(begin_.position(), end_.position() + end_.size()); 125 | } 126 | 127 | const Token& token() const { return token_; } 128 | const ParseNode* parent() const { return parent_; } 129 | const std::vector& children() const { return children_; } 130 | }; 131 | 132 | } // namespace parser 133 | } // namespace sourcetools 134 | 135 | #endif /* SOURCETOOLS_PARSE_PARSE_NODE_H */ 136 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/ParseStatus.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_STATUS_H 2 | #define SOURCETOOLS_PARSE_PARSE_STATUS_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace parser { 10 | 11 | class ParseNode; 12 | 13 | class ParseStatus 14 | { 15 | typedef collections::Position Position; 16 | 17 | public: 18 | ParseStatus() {} 19 | 20 | void recordNodeLocation(const Position& position, 21 | ParseNode* pNode) 22 | { 23 | map_[position] = pNode; 24 | } 25 | 26 | ParseNode* getNodeAtPosition(const Position& position) 27 | { 28 | return map_[position]; 29 | } 30 | 31 | void addError(const ParseError& error) 32 | { 33 | errors_.push_back(error); 34 | } 35 | 36 | const std::vector& getErrors() const 37 | { 38 | return errors_; 39 | } 40 | 41 | private: 42 | std::map map_; 43 | std::vector errors_; 44 | }; 45 | } // namespace parser 46 | } // namespace sourcetools 47 | 48 | #endif /* SOURCETOOLS_PARSE_PARSE_STATUS_H */ 49 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/Parser.h: -------------------------------------------------------------------------------- 1 | #ifndef 
SOURCETOOLS_PARSE_PARSER_H 2 | #define SOURCETOOLS_PARSE_PARSER_H 3 | #define SOURCE_TOOLS_PARSE_PARSER_H 4 | 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | // Defines that will go away once the parser is more tested / game ready 17 | // #define SOURCE_TOOLS_DEBUG_PARSER_TRACE 18 | // #define SOURCE_TOOLS_DEBUG_PARSER_PRINT_TOKEN_INFO 19 | // #define SOURCE_TOOLS_DEBUG_PARSER_STACK_OVERFLOW 20 | 21 | #ifdef SOURCE_TOOLS_DEBUG_PARSER_TRACE 22 | # define SOURCE_TOOLS_DEBUG_PARSER_LOG(__X__) std::cerr << __X__ << std::endl 23 | #else 24 | # define SOURCE_TOOLS_DEBUG_PARSER_LOG(__X__) 25 | #endif 26 | 27 | #ifdef SOURCE_TOOLS_DEBUG_PARSER_PRINT_TOKEN_INFO 28 | 29 | # define SOURCE_TOOLS_DEBUG_TOKEN(__TOKEN__) \ 30 | do \ 31 | { \ 32 | std::cout << __TOKEN__ << std::endl; \ 33 | } while (0) 34 | 35 | #else 36 | 37 | # define SOURCE_TOOLS_DEBUG_TOKEN(__TOKEN__) \ 38 | do \ 39 | { \ 40 | } while (0) \ 41 | 42 | #endif 43 | 44 | namespace sourcetools { 45 | namespace parser { 46 | 47 | class Parser 48 | { 49 | typedef tokenizer::Tokenizer Tokenizer; 50 | typedef tokens::Token Token; 51 | typedef tokens::TokenType TokenType; 52 | typedef collections::Position Position; 53 | 54 | enum ParseState 55 | { 56 | PARSE_STATE_TOP_LEVEL, 57 | PARSE_STATE_BRACE, 58 | PARSE_STATE_PAREN 59 | }; 60 | 61 | Tokenizer tokenizer_; 62 | Token token_; 63 | Token previous_; 64 | ParseState state_; 65 | ParseStatus* pStatus_; 66 | 67 | public: 68 | explicit Parser(const std::string& code) 69 | : tokenizer_(code.c_str(), code.size()), 70 | state_(PARSE_STATE_TOP_LEVEL) 71 | { 72 | advance(); 73 | } 74 | 75 | explicit Parser(const char* code, index_type n) 76 | : tokenizer_(code, n), 77 | state_(PARSE_STATE_TOP_LEVEL) 78 | { 79 | advance(); 80 | } 81 | 82 | private: 83 | 84 | // Error-related ---- 85 | 86 | void unexpectedEndOfInput() 87 | { 88 | ParseError error("unexpected end of input"); 89 | 
pStatus_->addError(error); 90 | } 91 | 92 | std::string unexpectedTokenString(const Token& token) 93 | { 94 | return std::string() + 95 | "unexpected token '" + token.contents() + "'"; 96 | } 97 | 98 | std::string unexpectedTokenString(const Token& token, 99 | TokenType expectedType) 100 | { 101 | return unexpectedTokenString(token) + 102 | "; expected type '" + toString(expectedType) + "'"; 103 | } 104 | 105 | void unexpectedToken(const Token& token) 106 | { 107 | unexpectedToken(token, unexpectedTokenString(token)); 108 | } 109 | 110 | void unexpectedToken(const Token& token, 111 | TokenType type) 112 | { 113 | unexpectedToken(token, unexpectedTokenString(token, type)); 114 | } 115 | 116 | void unexpectedToken(const Token& token, 117 | const std::string& message) 118 | { 119 | ParseError error(token, message); 120 | pStatus_->addError(error); 121 | } 122 | 123 | bool checkUnexpectedEnd(const Token& token) 124 | { 125 | if (UNLIKELY(token.isType(tokens::END))) 126 | { 127 | ParseError error(token, "unexpected end of input"); 128 | pStatus_->addError(error); 129 | return true; 130 | } 131 | 132 | return false; 133 | } 134 | 135 | // Parser sub-routines ---- 136 | 137 | ParseNode* parseFunctionArgumentListOne() 138 | { 139 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionArgument()"); 140 | using namespace tokens; 141 | 142 | check(SYMBOL); 143 | 144 | Token lookahead = peek(1); 145 | if (lookahead.isType(COMMA) || lookahead.isType(RPAREN)) 146 | return ParseNode::create(consume()); 147 | else if (lookahead.isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 148 | return parseExpression(); 149 | 150 | if (isOperator(lookahead)) 151 | unexpectedToken(lookahead, "expected '=', ',' or ')' following argument name"); 152 | 153 | return parseExpression(); 154 | } 155 | 156 | ParseNode* parseFunctionArgumentList() 157 | { 158 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionArgumentList()"); 159 | using namespace tokens; 160 | 161 | ParseNode* pNode = createNode(EMPTY); 162 | if 
(token_.isType(RPAREN)) 163 | return pNode; 164 | 165 | while (true) 166 | { 167 | if (checkUnexpectedEnd(current())) 168 | break; 169 | 170 | pNode->add(parseFunctionArgumentListOne()); 171 | if (current().isType(RPAREN)) 172 | return pNode; 173 | else if (current().isType(COMMA)) 174 | { 175 | advance(); 176 | continue; 177 | } 178 | 179 | // TODO: how should we recover here? For now, we 180 | // assume that there should have been a comma and 181 | // continue parsing. 182 | unexpectedToken(current(), "expected ',' or ')'"); 183 | continue; 184 | } 185 | 186 | return pNode; 187 | } 188 | 189 | ParseNode* parseFunctionDefinition() 190 | { 191 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionDefinition()"); 192 | using namespace tokens; 193 | ParseNode* pNode = createNode(current()); 194 | checkAndAdvance(KEYWORD_FUNCTION); 195 | checkAndAdvance(LPAREN, false); 196 | ParseState state = state_; 197 | state_ = PARSE_STATE_PAREN; 198 | pNode->add(parseFunctionArgumentList()); 199 | state_ = state; 200 | checkAndAdvance(RPAREN, false); 201 | pNode->add(parseNonEmptyExpression()); 202 | return pNode; 203 | } 204 | 205 | ParseNode* parseFor() 206 | { 207 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFor()"); 208 | using namespace tokens; 209 | ParseNode* pNode = createNode(current()); 210 | checkAndAdvance(KEYWORD_FOR); 211 | checkAndAdvance(LPAREN, false); 212 | ParseState state = state_; 213 | state_ = PARSE_STATE_PAREN; 214 | check(SYMBOL); 215 | pNode->add(createNode(consume())); 216 | checkAndAdvance(KEYWORD_IN, false); 217 | pNode->add(parseNonEmptyExpression()); 218 | state_ = state; 219 | checkAndAdvance(RPAREN, false); 220 | pNode->add(parseNonEmptyExpression()); 221 | return pNode; 222 | } 223 | 224 | ParseNode* parseIf() 225 | { 226 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseIf()"); 227 | using namespace tokens; 228 | ParseNode* pNode = createNode(current()); 229 | checkAndAdvance(KEYWORD_IF); 230 | checkAndAdvance(LPAREN, false); 231 | ParseState state = state_; 232 | state_ 
= PARSE_STATE_PAREN; 233 | pNode->add(parseNonEmptyExpression()); 234 | state_ = state; 235 | checkAndAdvance(RPAREN, false); 236 | pNode->add(parseNonEmptyExpression()); 237 | if (current().isType(KEYWORD_ELSE)) 238 | { 239 | advance(); 240 | pNode->add(parseNonEmptyExpression()); 241 | } 242 | return pNode; 243 | } 244 | 245 | ParseNode* parseWhile() 246 | { 247 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseWhile()"); 248 | using namespace tokens; 249 | ParseNode* pNode = createNode(current()); 250 | checkAndAdvance(KEYWORD_WHILE); 251 | checkAndAdvance(LPAREN, false); 252 | ParseState state = state_; 253 | state_ = PARSE_STATE_PAREN; 254 | pNode->add(parseNonEmptyExpression()); 255 | state_ = state; 256 | checkAndAdvance(RPAREN, false); 257 | pNode->add(parseNonEmptyExpression()); 258 | return pNode; 259 | } 260 | 261 | ParseNode* parseRepeat() 262 | { 263 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseRepeat()"); 264 | using namespace tokens; 265 | ParseNode* pNode = createNode(current()); 266 | checkAndAdvance(KEYWORD_REPEAT); 267 | pNode->add(parseNonEmptyExpression()); 268 | return pNode; 269 | } 270 | 271 | ParseNode* parseControlFlowKeyword() 272 | { 273 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseControlFlowKeyword('" << token_.contents() << "')"); 274 | using namespace tokens; 275 | 276 | const Token& token = current(); 277 | if (token.isType(KEYWORD_FUNCTION)) 278 | return parseFunctionDefinition(); 279 | else if (token.isType(KEYWORD_IF)) 280 | return parseIf(); 281 | else if (token.isType(KEYWORD_WHILE)) 282 | return parseWhile(); 283 | else if (token.isType(KEYWORD_FOR)) 284 | return parseFor(); 285 | else if (token.isType(KEYWORD_REPEAT)) 286 | return parseRepeat(); 287 | 288 | unexpectedToken(consume(), "expected control-flow keyword"); 289 | return createNode(INVALID); 290 | } 291 | 292 | ParseNode* parseBracedExpression() 293 | { 294 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseBracedExpression()"); 295 | using namespace tokens; 296 | ParseNode* pNode = createNode(current()); 
297 | 298 | checkAndAdvance(LBRACE); 299 | ParseState state = state_; 300 | state_ = PARSE_STATE_BRACE; 301 | skipSemicolons(); 302 | if (current().isType(RBRACE)) 303 | { 304 | pNode->add(createNode(EMPTY)); 305 | } 306 | else 307 | { 308 | while (!current().isType(RBRACE)) 309 | { 310 | if (checkUnexpectedEnd(current())) 311 | break; 312 | pNode->add(parseNonEmptyExpression()); 313 | skipSemicolons(); 314 | } 315 | } 316 | state_ = state; 317 | pNode->setEnd(current()); 318 | checkAndAdvance(RBRACE); 319 | 320 | return pNode; 321 | } 322 | 323 | ParseNode* parseParentheticalExpression() 324 | { 325 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseParentheticalExpression()"); 326 | using namespace tokens; 327 | ParseNode* pNode = createNode(current()); 328 | checkAndAdvance(LPAREN); 329 | ParseState state = state_; 330 | state_ = PARSE_STATE_PAREN; 331 | if (current().isType(RPAREN)) 332 | unexpectedToken(current()); 333 | else 334 | pNode->add(parseNonEmptyExpression()); 335 | state_ = state; 336 | pNode->setEnd(current()); 337 | checkAndAdvance(RPAREN); 338 | return pNode; 339 | } 340 | 341 | ParseNode* parseUnaryOperator() 342 | { 343 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseUnaryOperator()"); 344 | ParseNode* pNode = createNode(current()); 345 | pNode->add(parseNonEmptyExpression(precedence::unary(consume()))); 346 | return pNode; 347 | } 348 | 349 | ParseNode* parseExpressionStart() 350 | { 351 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpressionStart('" << current().contents() << "')"); 352 | SOURCE_TOOLS_DEBUG_PARSER_LOG("Type: " << toString(current().type())); 353 | using namespace tokens; 354 | 355 | skipSemicolons(); 356 | const Token& token = current(); 357 | 358 | if (isControlFlowKeyword(token)) 359 | return parseControlFlowKeyword(); 360 | else if (token.isType(LBRACE)) 361 | return parseBracedExpression(); 362 | else if (token.isType(LPAREN)) 363 | return parseParentheticalExpression(); 364 | else if (isUnaryOperator(token)) 365 | return parseUnaryOperator(); 366 | 
else if (isSymbolic(token) || isKeyword(token)) 367 | return createNode(consume()); 368 | else if (token.isType(END)) 369 | return NULL; 370 | 371 | unexpectedToken(consume()); 372 | return createNode(INVALID); 373 | } 374 | 375 | ParseNode* parseFunctionCallOne(TokenType rhsType) 376 | { 377 | using namespace tokens; 378 | 379 | const Token& token = current(); 380 | if (token.isType(COMMA) || token.isType(rhsType)) 381 | return createNode(Token(MISSING)); 382 | 383 | if (peek(1).isType(OPERATOR_ASSIGN_LEFT_EQUALS)) 384 | { 385 | ParseNode* pLhs = createNode(consume()); 386 | ParseNode* pNode = createNode(consume()); 387 | pNode->add(pLhs); 388 | 389 | if (current().isType(COMMA) || current().isType(rhsType)) 390 | pNode->add(createNode(MISSING)); 391 | else 392 | pNode->add(parseNonEmptyExpression()); 393 | 394 | return pNode; 395 | } 396 | 397 | return parseNonEmptyExpression(); 398 | } 399 | 400 | // Parse a function call, e.g. 401 | // 402 | // ::= 403 | // 404 | // can be one of '(', '[' or '[[', 405 | // are (potentially named) comma-separated values 406 | // is the complement of the above. 407 | // 408 | // Parsing a function call is surprisingly tricky, due to the 409 | // nature of allowing a mixture of unnamed, named, and missing 410 | // arguments. 411 | ParseNode* parseFunctionCall(ParseNode* pLhs) 412 | { 413 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseFunctionCall('" << current().contents() << "')"); 414 | using namespace tokens; 415 | TokenType lhsType = current().type(); 416 | TokenType rhsType = complement(lhsType); 417 | 418 | ParseNode* pNode = createNode(current()); 419 | pNode->add(pLhs); 420 | 421 | checkAndAdvance(lhsType); 422 | 423 | ParseState state = state_; 424 | state_ = PARSE_STATE_PAREN; 425 | 426 | if (current().isType(rhsType)) 427 | { 428 | pNode->add(lhsType == LPAREN ? 
429 | createNode(Token(EMPTY)) : 430 | createNode(Token(MISSING))); 431 | } 432 | else 433 | { 434 | while (true) 435 | { 436 | if (checkUnexpectedEnd(current())) 437 | break; 438 | 439 | pNode->add(parseFunctionCallOne(rhsType)); 440 | 441 | const Token& token = current(); 442 | if (token.isType(COMMA)) 443 | { 444 | consume(); 445 | continue; 446 | } 447 | else if (token.isType(rhsType)) 448 | { 449 | break; 450 | } 451 | 452 | std::string message = std::string() + 453 | "expected ',' or '" + toString(rhsType) + "'"; 454 | unexpectedToken(current(), message); 455 | } 456 | } 457 | 458 | checkAndAdvance(rhsType); 459 | 460 | state_ = state; 461 | 462 | if (isCallOperator(current()) && canParseExpressionContinuation()) 463 | return parseFunctionCall(pNode); 464 | return pNode; 465 | } 466 | 467 | ParseNode* parseExpressionContinuation(ParseNode* pNode) 468 | { 469 | using namespace tokens; 470 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpressionContinuation('" << current().contents() << "')"); 471 | SOURCE_TOOLS_DEBUG_PARSER_LOG("Type: " << toString(current().type())); 472 | 473 | Token token = current(); 474 | if (isCallOperator(token)) 475 | return parseFunctionCall(pNode); 476 | else if (token.isType(END)) 477 | return createNode(token); 478 | 479 | ParseNode* pNew = createNode(token); 480 | pNew->add(pNode); 481 | 482 | advance(); 483 | int precedence = 484 | precedence::binary(token) - 485 | precedence::isRightAssociative(token); 486 | pNew->add(parseNonEmptyExpression(precedence)); 487 | 488 | return pNew; 489 | } 490 | 491 | bool canParseExpressionContinuation(int precedence = 0) 492 | { 493 | if (precedence >= precedence::binary(current())) 494 | return false; 495 | 496 | if (state_ == PARSE_STATE_PAREN) 497 | return true; 498 | 499 | index_type lhs = previous().row(); 500 | index_type rhs = current().row(); 501 | if (previous().isType(tokens::STRING)) 502 | { 503 | lhs += std::count(previous().begin(), previous().end(), '\n'); 504 | } 505 | 506 | return lhs == 
rhs; 507 | 508 | } 509 | 510 | ParseNode* parseExpression(int precedence = 0) 511 | { 512 | SOURCE_TOOLS_DEBUG_PARSER_LOG("parseExpression(" << precedence << ")"); 513 | using namespace tokens; 514 | ParseNode* pNode = parseExpressionStart(); 515 | while (canParseExpressionContinuation(precedence)) 516 | pNode = parseExpressionContinuation(pNode); 517 | return pNode; 518 | } 519 | 520 | ParseNode* parseNonEmptyExpression(int precedence = 0) 521 | { 522 | if (checkUnexpectedEnd(current())) 523 | return ParseNode::create(tokens::MISSING); 524 | return parseExpression(precedence); 525 | } 526 | 527 | // Tokenization ---- 528 | 529 | const Token& current() const { return token_; } 530 | const Token& previous() const { return previous_; } 531 | 532 | Token consume() 533 | { 534 | Token token = current(); 535 | advance(); 536 | return token; 537 | } 538 | 539 | bool advance() 540 | { 541 | previous_ = token_; 542 | using namespace tokens; 543 | 544 | bool success = tokenizer_.tokenize(&token_); 545 | while (success && (isComment(token_) || isWhitespace(token_))) 546 | success = tokenizer_.tokenize(&token_); 547 | return success; 548 | } 549 | 550 | bool check(TokenType type) 551 | { 552 | const Token& token = current(); 553 | bool success = token.isType(type); 554 | if (!success) 555 | unexpectedToken(token, type); 556 | return success; 557 | } 558 | 559 | bool checkAndAdvance(TokenType type, bool advanceOnError = true) 560 | { 561 | bool result = check(type); 562 | if (result || advanceOnError) advance(); 563 | return result; 564 | } 565 | 566 | Token peek(index_type lookahead = 0, 567 | bool skipWhitespace = true, 568 | bool skipComments = true) 569 | { 570 | index_type offset = lookahead; 571 | 572 | while (true) 573 | { 574 | Token result = tokenizer_.peek(offset); 575 | if ((skipWhitespace && result.isType(tokens::WHITESPACE)) || 576 | (skipComments && result.isType(tokens::COMMENT))) 577 | { 578 | ++offset; 579 | continue; 580 | } 581 | 582 | if (lookahead == 0) 
583 | return result; 584 | 585 | --lookahead; 586 | } 587 | 588 | } 589 | 590 | // Utils ---- 591 | 592 | ParseNode* createNode(TokenType type) 593 | { 594 | return ParseNode::create(type); 595 | } 596 | 597 | ParseNode* createNode(const Token& token) 598 | { 599 | ParseNode* pNode = ParseNode::create(token); 600 | pStatus_->recordNodeLocation(token.position(), pNode); 601 | return pNode; 602 | } 603 | 604 | void skipSemicolons() 605 | { 606 | while (current().isType(tokens::SEMI)) 607 | { 608 | if (state_ == PARSE_STATE_PAREN) 609 | unexpectedToken(consume()); 610 | else 611 | advance(); 612 | } 613 | } 614 | 615 | public: 616 | 617 | ParseNode* parse(ParseStatus* pStatus) 618 | { 619 | pStatus_ = pStatus; 620 | ParseNode* root = createNode(tokens::ROOT); 621 | 622 | while (true) 623 | { 624 | ParseNode* pNode = parseExpression(); 625 | if (!pNode) 626 | break; 627 | 628 | root->add(pNode); 629 | } 630 | 631 | return root; 632 | } 633 | 634 | }; 635 | 636 | } // namespace parser 637 | 638 | void log(parser::ParseNode* pNode, int depth = 0); 639 | 640 | } // namespace sourcetools 641 | 642 | #endif /* SOURCETOOLS_PARSE_PARSER_H */ 643 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/Precedence.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PRECEDENCE_H 2 | #define SOURCETOOLS_PARSE_PRECEDENCE_H 3 | 4 | #include 5 | 6 | namespace sourcetools { 7 | namespace parser { 8 | namespace precedence { 9 | 10 | inline int binary(const tokens::Token& token) 11 | { 12 | using namespace tokens; 13 | switch (token.type()) 14 | { 15 | case OPERATOR_HELP: 16 | return 10; 17 | case OPERATOR_ASSIGN_LEFT_COLON: 18 | return 20; 19 | case OPERATOR_ASSIGN_LEFT: 20 | case OPERATOR_ASSIGN_LEFT_EQUALS: 21 | case OPERATOR_ASSIGN_LEFT_PARENT: 22 | return 30; 23 | case OPERATOR_ASSIGN_RIGHT: 24 | case OPERATOR_ASSIGN_RIGHT_PARENT: 25 | return 40; 26 | case 
OPERATOR_FORMULA: 27 | return 50; 28 | case OPERATOR_PIPE: 29 | case OPERATOR_PIPE_BIND: 30 | return 55; 31 | case OPERATOR_OR_SCALAR: 32 | case OPERATOR_OR_VECTOR: 33 | return 60; 34 | case OPERATOR_AND_SCALAR: 35 | case OPERATOR_AND_VECTOR: 36 | return 70; 37 | case OPERATOR_NEGATION: 38 | return 80; 39 | case OPERATOR_LESS: 40 | case OPERATOR_LESS_OR_EQUAL: 41 | case OPERATOR_GREATER: 42 | case OPERATOR_GREATER_OR_EQUAL: 43 | case OPERATOR_EQUAL: 44 | case OPERATOR_NOT_EQUAL: 45 | return 90; 46 | case OPERATOR_PLUS: 47 | case OPERATOR_MINUS: 48 | return 100; 49 | case OPERATOR_MULTIPLY: 50 | case OPERATOR_DIVIDE: 51 | return 110; 52 | case OPERATOR_USER: 53 | return 120; 54 | case OPERATOR_SEQUENCE: 55 | return 130; 56 | case OPERATOR_EXPONENTATION_STARS: 57 | case OPERATOR_HAT: 58 | return 150; 59 | case LPAREN: 60 | case LBRACKET: 61 | case LDBRACKET: 62 | return 170; 63 | case OPERATOR_DOLLAR: 64 | case OPERATOR_AT: 65 | return 180; 66 | case OPERATOR_NAMESPACE_EXPORTS: 67 | case OPERATOR_NAMESPACE_ALL: 68 | return 190; 69 | 70 | default: 71 | return 0; 72 | } 73 | } 74 | 75 | inline int unary(const tokens::Token& token) 76 | { 77 | using namespace tokens; 78 | switch (token.type()) 79 | { 80 | case OPERATOR_HELP: 81 | return 10; 82 | case OPERATOR_FORMULA: 83 | return 50; 84 | case OPERATOR_NEGATION: 85 | return 80; 86 | case OPERATOR_PLUS: 87 | case OPERATOR_MINUS: 88 | return 140; 89 | default: 90 | return 0; 91 | } 92 | } 93 | 94 | inline bool isRightAssociative(const tokens::Token& token) 95 | { 96 | using namespace tokens; 97 | switch (token.type()) 98 | { 99 | case OPERATOR_ASSIGN_LEFT: 100 | case OPERATOR_ASSIGN_LEFT_PARENT: 101 | case OPERATOR_ASSIGN_LEFT_EQUALS: 102 | case OPERATOR_EXPONENTATION_STARS: 103 | case OPERATOR_HAT: 104 | case LPAREN: 105 | case LBRACKET: 106 | case LDBRACKET: 107 | return true; 108 | default: 109 | return false; 110 | } 111 | } 112 | 113 | } // namespace precedence 114 | } // namespace parser 115 | } // namespace 
sourcetools 116 | 117 | #endif /* SOURCETOOLS_PARSE_PRECEDENCE_H */ 118 | -------------------------------------------------------------------------------- /inst/include/sourcetools/parse/parse.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PARSE_PARSE_H 2 | #define SOURCETOOLS_PARSE_PARSE_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | 10 | #endif /* SOURCETOOLS_PARSE_PARSE_H */ 11 | -------------------------------------------------------------------------------- /inst/include/sourcetools/platform/platform.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_PLATFORM_PLATFORM_H 2 | #define SOURCETOOLS_PLATFORM_PLATFORM_H 3 | 4 | #ifdef _WIN32 5 | # define SOURCETOOLS_PLATFORM_WINDOWS 6 | #endif 7 | 8 | #ifdef __APPLE__ 9 | # define SOURCETOOLS_PLATFORM_MACOS 10 | #endif 11 | 12 | #ifdef __linux__ 13 | # define SOURCETOOLS_PLATFORM_LINUX 14 | #endif 15 | 16 | #if defined(__sun) && defined(__SVR4) 17 | # define SOURCETOOLS_PLATFORM_SOLARIS 18 | #endif 19 | 20 | #if __cplusplus >= 201103L 21 | # define SOURCETOOLS_COMPILER_CXX11 22 | #endif 23 | 24 | #endif /* SOURCETOOLS_PLATFORM_PLATFORM_H */ 25 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RCallRecurser.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_CALL_RECURSER_H 2 | #define SOURCETOOLS_R_R_CALL_RECURSER_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | 11 | 12 | namespace sourcetools { 13 | namespace r { 14 | 15 | class CallRecurser : noncopyable 16 | { 17 | public: 18 | 19 | class Operation 20 | { 21 | public: 22 | virtual void apply(SEXP dataSEXP) = 0; 23 | virtual ~Operation() {} 24 | }; 25 | 26 | explicit CallRecurser(SEXP dataSEXP) 27 | { 28 | if (Rf_isPrimitive(dataSEXP)) 29 | dataSEXP_ = R_NilValue; 30 | else if 
(Rf_isFunction(dataSEXP)) 31 | dataSEXP_ = r::util::functionBody(dataSEXP); 32 | else if (TYPEOF(dataSEXP) == LANGSXP) 33 | dataSEXP_ = dataSEXP; 34 | else 35 | dataSEXP_ = R_NilValue; 36 | } 37 | 38 | void add(Operation* pOperation) 39 | { 40 | operations_.push_back(pOperation); 41 | } 42 | 43 | void run() 44 | { 45 | runImpl(dataSEXP_); 46 | } 47 | 48 | void runImpl(SEXP dataSEXP) 49 | { 50 | for (std::vector::iterator it = operations_.begin(); 51 | it != operations_.end(); 52 | ++it) 53 | { 54 | (*it)->apply(dataSEXP); 55 | } 56 | 57 | if (TYPEOF(dataSEXP) == LANGSXP) 58 | { 59 | while (dataSEXP != R_NilValue) 60 | { 61 | runImpl(CAR(dataSEXP)); 62 | dataSEXP = CDR(dataSEXP); 63 | } 64 | } 65 | } 66 | 67 | private: 68 | SEXP dataSEXP_; 69 | std::vector operations_; 70 | }; 71 | 72 | } // namespace r 73 | } // namespace sourcetools 74 | 75 | #endif /* SOURCETOOLS_R_R_CALL_RECURSER_H */ 76 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RConverter.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_CONVERTER_H 2 | #define SOURCETOOLS_R_R_CONVERTER_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace r { 12 | 13 | inline SEXP createChar(const std::string& data) 14 | { 15 | return Rf_mkCharLenCE(data.c_str(), data.size(), CE_UTF8); 16 | } 17 | 18 | inline SEXP createString(const std::string& data) 19 | { 20 | Protect protect; 21 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1)); 22 | SET_STRING_ELT(resultSEXP, 0, createChar(data)); 23 | return resultSEXP; 24 | } 25 | 26 | inline SEXP create(const std::vector& vector) 27 | { 28 | Protect protect; 29 | index_type n = vector.size(); 30 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n)); 31 | for (index_type i = 0; i < n; ++i) 32 | SET_STRING_ELT(resultSEXP, i, createChar(vector[i])); 33 | return resultSEXP; 34 | } 35 | 36 | } // 
namespace r 37 | } // namespace sourcetools 38 | 39 | #endif /* SOURCETOOLS_R_R_CONVERTER_H */ 40 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RFunctions.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_FUNCTIONS_H 2 | #define SOURCETOOLS_R_R_FUNCTIONS_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 | namespace r { 11 | 12 | inline SEXP eval(const std::string& fn, SEXP envSEXP = NULL) 13 | { 14 | Protect protect; 15 | if (envSEXP == NULL) 16 | { 17 | SEXP strSEXP = protect(Rf_mkString("sourcetools")); 18 | envSEXP = R_FindNamespace(strSEXP); 19 | } 20 | 21 | SEXP callSEXP = protect(Rf_lang1(Rf_install(fn.c_str()))); 22 | SEXP resultSEXP = protect(Rf_eval(callSEXP, envSEXP)); 23 | return resultSEXP; 24 | } 25 | 26 | inline std::set objectsOnSearchPath() 27 | { 28 | std::set results; 29 | Protect protect; 30 | 31 | SEXP objectsSEXP; 32 | protect(objectsSEXP = eval("search_objects")); 33 | 34 | for (R_xlen_t i = 0; i < Rf_length(objectsSEXP); ++i) 35 | { 36 | SEXP strSEXP = VECTOR_ELT(objectsSEXP, i); 37 | for (R_xlen_t j = 0; j < Rf_length(strSEXP); ++j) 38 | { 39 | SEXP charSEXP = STRING_ELT(strSEXP, j); 40 | std::string element(CHAR(charSEXP), Rf_length(charSEXP)); 41 | results.insert(element); 42 | } 43 | } 44 | 45 | return results; 46 | } 47 | 48 | namespace util { 49 | 50 | inline void setNames(SEXP dataSEXP, const char** names, index_type n) 51 | { 52 | RObjectFactory factory; 53 | SEXP namesSEXP = factory.create(STRSXP, n); 54 | for (index_type i = 0; i < n; ++i) 55 | SET_STRING_ELT(namesSEXP, i, Rf_mkChar(names[i])); 56 | Rf_setAttrib(dataSEXP, R_NamesSymbol, namesSEXP); 57 | } 58 | 59 | inline void listToDataFrame(SEXP listSEXP, int n) 60 | { 61 | r::Protect protect; 62 | SEXP classSEXP = protect(Rf_mkString("data.frame")); 63 | Rf_setAttrib(listSEXP, R_ClassSymbol, classSEXP); 64 | 65 | SEXP 
rownamesSEXP = protect(Rf_allocVector(INTSXP, 2)); 66 | INTEGER(rownamesSEXP)[0] = NA_INTEGER; 67 | INTEGER(rownamesSEXP)[1] = -n; 68 | Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP); 69 | } 70 | 71 | inline SEXP functionBody(SEXP fnSEXP) 72 | { 73 | SEXP bodyFunctionSEXP = Rf_findFun(Rf_install("body"), R_BaseNamespace); 74 | 75 | r::Protect protect; 76 | SEXP callSEXP = protect(Rf_lang2(bodyFunctionSEXP, fnSEXP)); 77 | return Rf_eval(callSEXP, R_BaseNamespace); 78 | } 79 | 80 | } // namespace util 81 | 82 | } // namespace r 83 | } // namespace sourcetools 84 | 85 | #endif /* SOURCETOOLS_R_R_FUNCTIONS_H */ 86 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RHeaders.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_HEADERS_H 2 | #define SOURCETOOLS_R_R_HEADERS_H 3 | 4 | #define R_NO_REMAP 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_R_R_HEADERS_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RNonStandardEvaluation.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H 2 | #define SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace r { 12 | namespace nse { 13 | 14 | namespace detail { 15 | 16 | inline std::set makeNsePrimitives() 17 | { 18 | std::set instance; 19 | 20 | instance.insert("quote"); 21 | instance.insert("substitute"); 22 | instance.insert("eval"); 23 | instance.insert("evalq"); 24 | instance.insert("lazy_dots"); 25 | 26 | return instance; 27 | } 28 | 29 | inline std::set& nsePrimitives() 30 | { 31 | static std::set instance = makeNsePrimitives(); 32 | return instance; 33 | } 34 | 35 | class PerformsNonStandardEvaluationOperation 36 | : public 
r::CallRecurser::Operation 37 | { 38 | public: 39 | 40 | PerformsNonStandardEvaluationOperation() 41 | : status_(false) 42 | { 43 | } 44 | 45 | virtual void apply(SEXP dataSEXP) 46 | { 47 | if (status_ || TYPEOF(dataSEXP) != LANGSXP) 48 | return; 49 | 50 | if ((status_ = checkCall(dataSEXP))) 51 | return; 52 | 53 | SEXP fnSEXP = CAR(dataSEXP); 54 | if (TYPEOF(fnSEXP) == SYMSXP) 55 | status_ = nsePrimitives().count(CHAR(PRINTNAME(fnSEXP))); 56 | else if (TYPEOF(fnSEXP) == STRSXP) 57 | status_ = nsePrimitives().count(CHAR(STRING_ELT(fnSEXP, 0))); 58 | 59 | } 60 | 61 | bool status() const { return status_; } 62 | 63 | private: 64 | 65 | bool checkCall(SEXP callSEXP) 66 | { 67 | index_type n = Rf_length(callSEXP); 68 | if (n == 0) 69 | return false; 70 | 71 | SEXP fnSEXP = CAR(callSEXP); 72 | if (fnSEXP == Rf_install("::") || fnSEXP == Rf_install(":::")) 73 | { 74 | SEXP lhsSEXP = CADR(callSEXP); 75 | SEXP rhsSEXP = CADDR(callSEXP); 76 | 77 | if (lhsSEXP == Rf_install("lazyeval") && rhsSEXP == Rf_install("lazy_dots")) 78 | return true; 79 | } 80 | 81 | return false; 82 | } 83 | 84 | private: 85 | bool status_; 86 | }; 87 | 88 | } // namespace detail 89 | 90 | class Database 91 | { 92 | public: 93 | bool check(SEXP dataSEXP) 94 | { 95 | if (contains(dataSEXP)) 96 | return get(dataSEXP); 97 | 98 | typedef detail::PerformsNonStandardEvaluationOperation Operation; 99 | scoped_ptr operation(new Operation); 100 | 101 | r::CallRecurser recurser(dataSEXP); 102 | recurser.add(operation); 103 | recurser.run(); 104 | 105 | set(dataSEXP, operation->status()); 106 | return operation->status(); 107 | } 108 | 109 | private: 110 | 111 | bool contains(SEXP dataSEXP) 112 | { 113 | return map_.count(address(dataSEXP)); 114 | } 115 | 116 | bool get(SEXP dataSEXP) 117 | { 118 | return map_[address(dataSEXP)]; 119 | } 120 | 121 | void set(SEXP dataSEXP, bool value) 122 | { 123 | map_[address(dataSEXP)] = value; 124 | } 125 | 126 | std::size_t address(SEXP dataSEXP) 127 | { 128 | return 
reinterpret_cast(dataSEXP); 129 | } 130 | 131 | std::map map_; 132 | }; 133 | 134 | inline Database& database() 135 | { 136 | static Database instance; 137 | return instance; 138 | } 139 | 140 | inline bool performsNonStandardEvaluation(SEXP fnSEXP) 141 | { 142 | return database().check(fnSEXP); 143 | } 144 | 145 | } // namespace nse 146 | } // namespace r 147 | } // namespace sourcetools 148 | 149 | #endif /* SOURCETOOLS_R_R_NON_STANDARD_EVALUATION_H */ 150 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RProtect.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_RPROTECT_H 2 | #define SOURCETOOLS_R_RPROTECT_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace r { 10 | 11 | class Protect : noncopyable 12 | { 13 | public: 14 | Protect(): n_(0) {} 15 | ~Protect() { UNPROTECT(n_); } 16 | 17 | SEXP operator()(SEXP objectSEXP) 18 | { 19 | ++n_; 20 | return PROTECT(objectSEXP); 21 | } 22 | 23 | private: 24 | int n_; 25 | }; 26 | 27 | } // end namespace r 28 | } // end namespace sourcetools 29 | 30 | #endif /* SOURCETOOLS_R_RPROTECT_H */ 31 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/RUtils.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_UTILS_H 2 | #define SOURCETOOLS_R_R_UTILS_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace r { 14 | 15 | class RObjectFactory : noncopyable 16 | { 17 | public: 18 | 19 | RObjectFactory() 20 | : n_(0) 21 | { 22 | } 23 | 24 | template 25 | SEXP create(SEXPTYPE type, const std::vector& vector, F f) 26 | { 27 | ++n_; 28 | index_type n = vector.size(); 29 | SEXP resultSEXP = PROTECT(Rf_allocVector(type, n)); 30 | for (index_type i = 0; i < n; ++i) 31 | f(resultSEXP, i, vector[i]); 32 | 
return resultSEXP; 33 | } 34 | 35 | SEXP create(SEXPTYPE type, index_type n) 36 | { 37 | ++n_; 38 | return PROTECT(Rf_allocVector(type, n)); 39 | } 40 | 41 | ~RObjectFactory() 42 | { 43 | UNPROTECT(n_); 44 | } 45 | 46 | private: 47 | index_type n_; 48 | }; 49 | 50 | class ListBuilder : noncopyable 51 | { 52 | public: 53 | 54 | void add(const std::string& name, SEXP value) 55 | { 56 | names_.push_back(name); 57 | data_.push_back(protect_(value)); 58 | } 59 | 60 | operator SEXP() const 61 | { 62 | index_type n = data_.size(); 63 | 64 | SEXP resultSEXP = protect_(Rf_allocVector(VECSXP, n)); 65 | SEXP namesSEXP = protect_(Rf_allocVector(STRSXP, n)); 66 | 67 | for (index_type i = 0; i < n; ++i) 68 | { 69 | SET_VECTOR_ELT(resultSEXP, i, data_[i]); 70 | SET_STRING_ELT(namesSEXP, i, createChar(names_[i])); 71 | } 72 | 73 | Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP); 74 | return resultSEXP; 75 | } 76 | 77 | private: 78 | std::vector names_; 79 | std::vector data_; 80 | mutable Protect protect_; 81 | }; 82 | 83 | } // namespace r 84 | } // namespace sourcetools 85 | 86 | #endif /* SOURCETOOLS_R_R_UTILS_H */ 87 | -------------------------------------------------------------------------------- /inst/include/sourcetools/r/r.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_R_R_H 2 | #define SOURCETOOLS_R_R_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | 12 | #endif /* SOURCETOOLS_R_R_H */ 13 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/MemoryMappedReader.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_MEMORY_MAPPED_READER_H 2 | #define SOURCETOOLS_READ_MEMORY_MAPPED_READER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | #include 11 | #include 12 | 13 | #ifndef _WIN32 14 | # include 15 | # include 16 | 
#else 17 | # include 18 | # include 19 | #endif 20 | 21 | namespace sourcetools { 22 | namespace detail { 23 | 24 | class MemoryMappedReader 25 | { 26 | public: 27 | 28 | class VectorReader 29 | { 30 | public: 31 | 32 | explicit VectorReader(std::vector* pData) 33 | : pData_(pData) 34 | { 35 | } 36 | 37 | template 38 | void operator()(const T& lhs, const T& rhs) 39 | { 40 | pData_->push_back(std::string(lhs, rhs)); 41 | } 42 | 43 | private: 44 | std::vector* pData_; 45 | }; 46 | 47 | static bool read(const char* path, std::string* pContent) 48 | { 49 | // Open file connection 50 | FileConnection conn(path); 51 | if (!conn.open()) 52 | return false; 53 | 54 | // Get size of file 55 | index_type size; 56 | if (!conn.size(&size)) 57 | return false; 58 | 59 | // Early return for empty files 60 | if (UNLIKELY(size == 0)) 61 | return true; 62 | 63 | // mmap the file 64 | MemoryMappedConnection map(conn, size); 65 | if (!map.open()) 66 | return false; 67 | 68 | pContent->assign(map, size); 69 | return true; 70 | } 71 | 72 | template 73 | static bool read_lines(const char* path, F f) 74 | { 75 | FileConnection conn(path); 76 | if (!conn.open()) 77 | return false; 78 | 79 | // Get size of file 80 | index_type size; 81 | if (!conn.size(&size)) 82 | return false; 83 | 84 | // Early return for empty files 85 | if (UNLIKELY(size == 0)) 86 | return true; 87 | 88 | // mmap the file 89 | MemoryMappedConnection map(conn, size); 90 | if (!map.open()) 91 | return false; 92 | 93 | // special case: just a '\n' 94 | bool endsWithNewline = 95 | map[size - 1] == '\n' || 96 | map[size - 1] == '\r'; 97 | 98 | if (size == 1 && endsWithNewline) 99 | return true; 100 | 101 | // Search for newlines 102 | const char* lower = map; 103 | const char* end = map + size; 104 | 105 | for (const char* it = lower; it != end; it++) 106 | { 107 | // check for newline 108 | char ch = *it; 109 | bool isNewline = ch == '\r' || ch == '\n'; 110 | if (!isNewline) 111 | continue; 112 | 113 | // found a newline; 
call functor 114 | f(lower, it); 115 | 116 | // update iterator, handling '\r\n' specially 117 | if (it[0] == '\r' && 118 | it[1] == '\n') 119 | { 120 | it += 1; 121 | } 122 | 123 | // update lower iterator 124 | lower = it + 1; 125 | 126 | } 127 | 128 | // If this file ended with a newline, we're done 129 | if (endsWithNewline) 130 | return true; 131 | 132 | // Otherwise, consume one more string, then we're done 133 | f(lower, end); 134 | return true; 135 | } 136 | 137 | static bool read_lines(const char* path, std::vector* pContent) 138 | { 139 | VectorReader reader(pContent); 140 | return read_lines(path, reader); 141 | } 142 | 143 | }; 144 | 145 | } // namespace detail 146 | } // namespace sourcetools 147 | 148 | #endif /* SOURCETOOLS_READ_MEMORY_MAPPED_READER_H */ 149 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/posix/FileConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 2 | #define SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace detail { 12 | 13 | class FileConnection 14 | { 15 | public: 16 | 17 | typedef int FileDescriptor; 18 | 19 | FileConnection(const char* path, int flags = O_RDONLY) 20 | { 21 | fd_ = ::open(path, flags); 22 | } 23 | 24 | ~FileConnection() 25 | { 26 | if (open()) 27 | ::close(fd_); 28 | } 29 | 30 | bool open() 31 | { 32 | return fd_ != -1; 33 | } 34 | 35 | bool size(index_type* pSize) 36 | { 37 | struct stat info; 38 | if (::fstat(fd_, &info) == -1) 39 | return false; 40 | 41 | *pSize = info.st_size; 42 | return true; 43 | } 44 | 45 | operator FileDescriptor() const 46 | { 47 | return fd_; 48 | } 49 | 50 | private: 51 | FileDescriptor fd_; 52 | }; 53 | 54 | 55 | } // namespace detail 56 | } // namespace sourcetools 57 | 58 | #endif /* SOURCETOOLS_READ_POSIX_FILE_CONNECTION_H 
*/ 59 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/posix/MemoryMappedConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H 2 | #define SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | 10 | namespace sourcetools { 11 | namespace detail { 12 | 13 | class MemoryMappedConnection 14 | { 15 | public: 16 | 17 | MemoryMappedConnection(int fd, index_type size) 18 | : size_(size) 19 | { 20 | #ifdef MAP_POPULATE 21 | map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED | MAP_POPULATE, fd, 0); 22 | #else 23 | map_ = (char*) ::mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); 24 | #endif 25 | 26 | #if defined(POSIX_MADV_SEQUENTIAL) && defined(POSIX_MADV_WILLNEED) 27 | ::posix_madvise((void*) map_, size, POSIX_MADV_SEQUENTIAL | POSIX_MADV_WILLNEED); 28 | #endif 29 | } 30 | 31 | ~MemoryMappedConnection() 32 | { 33 | if (map_ != MAP_FAILED) 34 | ::munmap(map_, size_); 35 | } 36 | 37 | bool open() 38 | { 39 | return map_ != MAP_FAILED; 40 | } 41 | 42 | operator char*() const 43 | { 44 | return map_; 45 | } 46 | 47 | private: 48 | char* map_; 49 | index_type size_; 50 | }; 51 | 52 | } // namespace detail 53 | } // namespace sourcetools 54 | 55 | #endif /* SOURCETOOLS_READ_POSIX_MEMORY_MAPPED_CONNECTION_H */ 56 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/read.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_READ_H 2 | #define SOURCETOOLS_READ_READ_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | 9 | namespace sourcetools { 10 | 11 | inline bool read(const std::string& absolutePath, std::string* pContent) 12 | { 13 | return detail::MemoryMappedReader::read(absolutePath.c_str(), pContent); 14 | } 15 | 16 | inline bool 
read_lines(const std::string& absolutePath, 17 | std::vector* pLines) 18 | { 19 | return detail::MemoryMappedReader::read_lines(absolutePath.c_str(), pLines); 20 | } 21 | 22 | } // namespace sourcetools 23 | 24 | #endif /* SOURCETOOLS_READ_READ_H */ 25 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/windows/FileConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H 2 | #define SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H 3 | 4 | #undef Realloc 5 | #undef Free 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace detail { 10 | 11 | class FileConnection 12 | { 13 | public: 14 | typedef HANDLE FileDescriptor; 15 | 16 | FileConnection(const char* path, int flags = GENERIC_READ) 17 | { 18 | handle_ = ::CreateFile(path, flags, FILE_SHARE_READ, NULL, OPEN_EXISTING, 0, NULL); 19 | } 20 | 21 | ~FileConnection() 22 | { 23 | if (open()) 24 | ::CloseHandle(handle_); 25 | } 26 | 27 | bool open() 28 | { 29 | return handle_ != INVALID_HANDLE_VALUE; 30 | } 31 | 32 | bool size(index_type* pSize) 33 | { 34 | *pSize = ::GetFileSize(handle_, NULL); 35 | return true; 36 | } 37 | 38 | operator FileDescriptor() const 39 | { 40 | return handle_; 41 | } 42 | 43 | private: 44 | FileDescriptor handle_; 45 | }; 46 | 47 | } // namespace detail 48 | } // namespace sourcetools 49 | 50 | #endif /* SOURCETOOLS_READ_WINDOWS_FILE_CONNECTION_H */ 51 | -------------------------------------------------------------------------------- /inst/include/sourcetools/read/windows/MemoryMappedConnection.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H 2 | #define SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H 3 | 4 | #undef Realloc 5 | #undef Free 6 | 7 | #include 8 | 9 | #include 10 | 11 | namespace sourcetools { 12 | namespace detail { 13 | 14 | 
class MemoryMappedConnection 15 | { 16 | public: 17 | 18 | MemoryMappedConnection(HANDLE handle, index_type size) 19 | : map_(NULL), size_(size) 20 | { 21 | handle_ = ::CreateFileMapping(handle, NULL, PAGE_READONLY, 0, 0, NULL); 22 | if (handle_ == NULL) 23 | return; 24 | 25 | map_ = (char*) ::MapViewOfFile(handle_, FILE_MAP_READ, 0, 0, size); 26 | } 27 | 28 | ~MemoryMappedConnection() 29 | { 30 | if (handle_ != INVALID_HANDLE_VALUE) 31 | ::CloseHandle(handle_); 32 | 33 | if (map_ != NULL) 34 | ::UnmapViewOfFile(map_); 35 | } 36 | 37 | bool open() 38 | { 39 | return map_ != NULL; 40 | } 41 | 42 | operator char*() const 43 | { 44 | return map_; 45 | } 46 | 47 | private: 48 | char* map_; 49 | index_type size_; 50 | HANDLE handle_; 51 | }; 52 | 53 | } // namespace detail 54 | } // namespace sourcetools 55 | 56 | #endif /* SOURCETOOLS_READ_WINDOWS_MEMORY_MAPPED_CONNECTION_H */ 57 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Registration.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_REGISTRATION_H 2 | #define SOURCETOOLS_TOKENIZATION_REGISTRATION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace tokens { 10 | 11 | typedef unsigned int TokenType; 12 | 13 | // Simple, non-nestable types. 
14 | #define SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(__NAME__, __TYPE__) \ 15 | static const TokenType __NAME__ = __TYPE__ 16 | 17 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(INVALID, (1 << 31)); 18 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(END, (1 << 30)); 19 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(EMPTY, (1 << 29)); 20 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(MISSING, (1 << 28)); 21 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(ROOT, (1 << 27)); 22 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SEMI, (1 << 26)); 23 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMA, (1 << 25)); 24 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(SYMBOL, (1 << 24)); 25 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(COMMENT, (1 << 23)); 26 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(WHITESPACE, (1 << 22)); 27 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(STRING, (1 << 21)); 28 | SOURCE_TOOLS_REGISTER_SIMPLE_TYPE(NUMBER, (1 << 20)); 29 | 30 | /* Brackets */ 31 | #define SOURCE_TOOLS_BRACKET_BIT (1 << 19) 32 | #define SOURCE_TOOLS_BRACKET_RIGHT_BIT (1 << 5) 33 | #define SOURCE_TOOLS_BRACKET_LEFT_BIT (1 << 4) 34 | #define SOURCE_TOOLS_BRACKET_MASK SOURCE_TOOLS_BRACKET_BIT 35 | #define SOURCE_TOOLS_BRACKET_LEFT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT) 36 | #define SOURCE_TOOLS_BRACKET_RIGHT_MASK (SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT) 37 | 38 | #define SOURCE_TOOLS_REGISTER_BRACKET(__NAME__, __SIDE__, __INDEX__) \ 39 | static const TokenType __NAME__ = \ 40 | SOURCE_TOOLS_BRACKET_BIT | __SIDE__ | __INDEX__ 41 | 42 | SOURCE_TOOLS_REGISTER_BRACKET(LPAREN, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 0)); 43 | SOURCE_TOOLS_REGISTER_BRACKET(LBRACE, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 1)); 44 | SOURCE_TOOLS_REGISTER_BRACKET(LBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 2)); 45 | SOURCE_TOOLS_REGISTER_BRACKET(LDBRACKET, SOURCE_TOOLS_BRACKET_LEFT_BIT, (1 << 3)); 46 | 47 | SOURCE_TOOLS_REGISTER_BRACKET(RPAREN, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 0)); 48 | SOURCE_TOOLS_REGISTER_BRACKET(RBRACE, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 1)); 49 | 
SOURCE_TOOLS_REGISTER_BRACKET(RBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 2)); 50 | SOURCE_TOOLS_REGISTER_BRACKET(RDBRACKET, SOURCE_TOOLS_BRACKET_RIGHT_BIT, (1 << 3)); 51 | 52 | /* Operators */ 53 | #define SOURCE_TOOLS_OPERATOR_BIT (1 << 18) 54 | #define SOURCE_TOOLS_OPERATOR_UNARY_BIT (1 << 6) 55 | #define SOURCE_TOOLS_OPERATOR_MASK (SOURCE_TOOLS_OPERATOR_BIT) 56 | #define SOURCE_TOOLS_OPERATOR_UNARY_MASK (SOURCE_TOOLS_OPERATOR_MASK | SOURCE_TOOLS_OPERATOR_UNARY_BIT) 57 | 58 | #define SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, __MASKS__) \ 59 | \ 60 | static const TokenType OPERATOR_ ## __NAME__ = \ 61 | SOURCE_TOOLS_OPERATOR_BIT | __MASKS__; \ 62 | \ 63 | static const char* const \ 64 | OPERATOR_ ## __NAME__ ## _STRING = __STRING__ 65 | 66 | #define SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(__NAME__, __STRING__, __INDEX__) \ 67 | SOURCE_TOOLS_REGISTER_OPERATOR(__NAME__, __STRING__, SOURCE_TOOLS_OPERATOR_UNARY_BIT | __INDEX__) 68 | 69 | // See ?"Syntax" for details on R's operators. 70 | // Note: All operators registered work in a binary context, but only 71 | // some will work as unary operators. (Occurring to the left of the token). 72 | // 73 | // In other words, -1 is parsed as `-`(1). 74 | // 75 | // Note that although brackets are operators we tokenize them separately, 76 | // since we need to later check for their paired complement. 
77 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(PLUS, "+", 0); 78 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(MINUS, "-", 1); 79 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(HELP, "?", 2); 80 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(NEGATION, "!", 3); 81 | SOURCE_TOOLS_REGISTER_UNARY_OPERATOR(FORMULA, "~", 4); 82 | 83 | SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_EXPORTS, "::", 5); 84 | SOURCE_TOOLS_REGISTER_OPERATOR(NAMESPACE_ALL, ":::", 6); 85 | SOURCE_TOOLS_REGISTER_OPERATOR(DOLLAR, "$", 7); 86 | SOURCE_TOOLS_REGISTER_OPERATOR(AT, "@", 8); 87 | SOURCE_TOOLS_REGISTER_OPERATOR(HAT, "^", 9); 88 | SOURCE_TOOLS_REGISTER_OPERATOR(EXPONENTATION_STARS, "**", 10); 89 | SOURCE_TOOLS_REGISTER_OPERATOR(SEQUENCE, ":", 11); 90 | SOURCE_TOOLS_REGISTER_OPERATOR(MULTIPLY, "*", 12); 91 | SOURCE_TOOLS_REGISTER_OPERATOR(DIVIDE, "/", 13); 92 | SOURCE_TOOLS_REGISTER_OPERATOR(LESS, "<", 14); 93 | SOURCE_TOOLS_REGISTER_OPERATOR(LESS_OR_EQUAL, "<=", 15); 94 | SOURCE_TOOLS_REGISTER_OPERATOR(GREATER, ">", 16); 95 | SOURCE_TOOLS_REGISTER_OPERATOR(GREATER_OR_EQUAL, ">=", 17); 96 | SOURCE_TOOLS_REGISTER_OPERATOR(EQUAL, "==", 18); 97 | SOURCE_TOOLS_REGISTER_OPERATOR(NOT_EQUAL, "!=", 19); 98 | SOURCE_TOOLS_REGISTER_OPERATOR(AND_VECTOR, "&", 20); 99 | SOURCE_TOOLS_REGISTER_OPERATOR(AND_SCALAR, "&&", 21); 100 | SOURCE_TOOLS_REGISTER_OPERATOR(OR_VECTOR, "|", 22); 101 | SOURCE_TOOLS_REGISTER_OPERATOR(OR_SCALAR, "||", 23); 102 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT, "<-", 24); 103 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_PARENT, "<<-", 25); 104 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT, "->", 26); 105 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_RIGHT_PARENT, "->>", 27); 106 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_EQUALS, "=", 28); 107 | SOURCE_TOOLS_REGISTER_OPERATOR(ASSIGN_LEFT_COLON, ":=", 29); 108 | SOURCE_TOOLS_REGISTER_OPERATOR(USER, "%%", 30); 109 | SOURCE_TOOLS_REGISTER_OPERATOR(PIPE, "|>", 31); 110 | SOURCE_TOOLS_REGISTER_OPERATOR(PIPE_BIND, ">=", 32); 111 | 112 | /* Keywords and symbols */ 113 | 
#define SOURCE_TOOLS_KEYWORD_BIT (1 << 17) 114 | #define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT (1 << 7) 115 | #define SOURCE_TOOLS_KEYWORD_MASK SOURCE_TOOLS_KEYWORD_BIT 116 | #define SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK (SOURCE_TOOLS_KEYWORD_MASK | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_BIT) 117 | 118 | #define SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__) \ 119 | static const TokenType KEYWORD_ ## __NAME__ = \ 120 | __MASKS__ | SOURCE_TOOLS_KEYWORD_MASK 121 | 122 | #define SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(__NAME__, __MASKS__) \ 123 | SOURCE_TOOLS_REGISTER_KEYWORD(__NAME__, __MASKS__ | SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK) 124 | 125 | // See '?Reserved' for a list of reversed R symbols. 126 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(IF, 1); 127 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FOR, 2); 128 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(WHILE, 3); 129 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(REPEAT, 4); 130 | SOURCE_TOOLS_REGISTER_CONTROL_FLOW_KEYWORD(FUNCTION, 5); 131 | 132 | SOURCE_TOOLS_REGISTER_KEYWORD(ELSE, 6); 133 | SOURCE_TOOLS_REGISTER_KEYWORD(IN, 7); 134 | SOURCE_TOOLS_REGISTER_KEYWORD(NEXT, 8); 135 | SOURCE_TOOLS_REGISTER_KEYWORD(BREAK, 9); 136 | SOURCE_TOOLS_REGISTER_KEYWORD(TRUE, 10); 137 | SOURCE_TOOLS_REGISTER_KEYWORD(FALSE, 11); 138 | SOURCE_TOOLS_REGISTER_KEYWORD(NULL, 12); 139 | SOURCE_TOOLS_REGISTER_KEYWORD(Inf, 13); 140 | SOURCE_TOOLS_REGISTER_KEYWORD(NaN, 14); 141 | SOURCE_TOOLS_REGISTER_KEYWORD(NA, 15); 142 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_integer_, 16); 143 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_real_, 17); 144 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_complex_, 18); 145 | SOURCE_TOOLS_REGISTER_KEYWORD(NA_character_, 19); 146 | 147 | inline TokenType symbolType(const char* string, index_type n) 148 | { 149 | // TODO: Is this insanity really an optimization or am I just silly? 
150 | if (n < 2 || n > 13) { 151 | return SYMBOL; 152 | } else if (n == 2) { 153 | if (!std::memcmp(string, "in", n)) return KEYWORD_IN; 154 | if (!std::memcmp(string, "if", n)) return KEYWORD_IF; 155 | if (!std::memcmp(string, "NA", n)) return KEYWORD_NA; 156 | } else if (n == 3) { 157 | if (!std::memcmp(string, "for", n)) return KEYWORD_FOR; 158 | if (!std::memcmp(string, "Inf", n)) return KEYWORD_Inf; 159 | if (!std::memcmp(string, "NaN", n)) return KEYWORD_NaN; 160 | } else if (n == 4) { 161 | if (!std::memcmp(string, "else", n)) return KEYWORD_ELSE; 162 | if (!std::memcmp(string, "next", n)) return KEYWORD_NEXT; 163 | if (!std::memcmp(string, "TRUE", n)) return KEYWORD_TRUE; 164 | if (!std::memcmp(string, "NULL", n)) return KEYWORD_NULL; 165 | } else if (n == 5) { 166 | if (!std::memcmp(string, "while", n)) return KEYWORD_WHILE; 167 | if (!std::memcmp(string, "break", n)) return KEYWORD_BREAK; 168 | if (!std::memcmp(string, "FALSE", n)) return KEYWORD_FALSE; 169 | } else if (n == 6) { 170 | if (!std::memcmp(string, "repeat", n)) return KEYWORD_REPEAT; 171 | } else if (n == 8) { 172 | if (!std::memcmp(string, "function", n)) return KEYWORD_FUNCTION; 173 | if (!std::memcmp(string, "NA_real_", n)) return KEYWORD_NA_real_; 174 | } else if (n == 11) { 175 | if (!std::memcmp(string, "NA_integer_", n)) return KEYWORD_NA_integer_; 176 | if (!std::memcmp(string, "NA_complex_", n)) return KEYWORD_NA_complex_; 177 | } else if (n == 13) { 178 | if (!std::memcmp(string, "NA_character_", n)) return KEYWORD_NA_character_; 179 | } 180 | 181 | return SYMBOL; 182 | } 183 | 184 | inline TokenType symbolType(const std::string& symbol) 185 | { 186 | return symbolType(symbol.data(), symbol.size()); 187 | } 188 | 189 | } // namespace tokens 190 | } // namespace sourcetools 191 | 192 | #endif /* SOURCETOOLS_TOKENIZATION_REGISTRATION_H */ 193 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Token.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKEN_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKEN_H 3 | 4 | #include 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | #include 12 | #include 13 | #include 14 | #include 15 | 16 | namespace sourcetools { 17 | namespace tokens { 18 | 19 | class Token 20 | { 21 | private: 22 | typedef cursors::TextCursor TextCursor; 23 | typedef collections::Position Position; 24 | 25 | public: 26 | 27 | Token() 28 | : begin_(NULL), 29 | end_(NULL), 30 | offset_(-1), 31 | position_(-1, -1), 32 | type_(INVALID) 33 | { 34 | } 35 | 36 | explicit Token(TokenType type) 37 | : begin_(NULL), 38 | end_(NULL), 39 | offset_(-1), 40 | position_(-1, -1), 41 | type_(type) 42 | { 43 | } 44 | 45 | Token(const Position& position) 46 | : begin_(NULL), 47 | end_(NULL), 48 | offset_(-1), 49 | position_(position), 50 | type_(INVALID) 51 | { 52 | } 53 | 54 | Token(const TextCursor& cursor, TokenType type, index_type length) 55 | : begin_(cursor.begin() + cursor.offset()), 56 | end_(cursor.begin() + cursor.offset() + length), 57 | offset_(cursor.offset()), 58 | position_(cursor.position()), 59 | type_(type) 60 | { 61 | } 62 | 63 | const char* begin() const { return begin_; } 64 | const char* end() const { return end_; } 65 | index_type offset() const { return offset_; } 66 | index_type size() const { return end_ - begin_; } 67 | 68 | std::string contents() const 69 | { 70 | return std::string(begin_, end_); 71 | } 72 | 73 | bool contentsEqual(const char* string) 74 | { 75 | return std::strcmp(begin_, string); 76 | } 77 | 78 | bool contentsEqual(const std::string& string) const 79 | { 80 | if (utils::size(string) != size()) 81 | return false; 82 | 83 | return std::memcmp(begin_, string.c_str(), size()) == 0; 84 | } 85 | 86 | const Position& position() const { return position_; } 87 | index_type row() const { return position_.row; } 88 | index_type column() const { return 
position_.column; } 89 | 90 | TokenType type() const { return type_; } 91 | bool isType(TokenType type) const { return type_ == type; } 92 | 93 | private: 94 | const char* begin_; 95 | const char* end_; 96 | index_type offset_; 97 | 98 | Position position_; 99 | TokenType type_; 100 | }; 101 | 102 | inline bool isBracket(const Token& token) 103 | { 104 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_MASK); 105 | } 106 | 107 | inline bool isLeftBracket(const Token& token) 108 | { 109 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_LEFT_MASK); 110 | } 111 | 112 | inline bool isRightBracket(const Token& token) 113 | { 114 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_BRACKET_RIGHT_MASK); 115 | } 116 | 117 | inline bool isComplement(TokenType lhs, TokenType rhs) 118 | { 119 | static const TokenType mask = 120 | SOURCE_TOOLS_BRACKET_BIT | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT; 121 | 122 | if (SOURCE_TOOLS_CHECK_MASK((lhs | rhs), mask)) 123 | return SOURCE_TOOLS_LOWER_BITS(lhs, 4) == SOURCE_TOOLS_LOWER_BITS(rhs, 4); 124 | 125 | return false; 126 | } 127 | 128 | inline TokenType complement(TokenType type) 129 | { 130 | static const TokenType mask = 131 | SOURCE_TOOLS_BRACKET_LEFT_BIT | SOURCE_TOOLS_BRACKET_RIGHT_BIT; 132 | 133 | return type ^ mask; 134 | } 135 | 136 | inline bool isKeyword(const Token& token) 137 | { 138 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_MASK); 139 | } 140 | 141 | inline bool isControlFlowKeyword(const Token& token) 142 | { 143 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_KEYWORD_CONTROL_FLOW_MASK); 144 | } 145 | 146 | inline bool isOperator(const Token& token) 147 | { 148 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_MASK); 149 | } 150 | 151 | inline bool isUnaryOperator(const Token& token) 152 | { 153 | return SOURCE_TOOLS_CHECK_MASK(token.type(), SOURCE_TOOLS_OPERATOR_UNARY_MASK); 154 | } 155 | 156 | 
inline bool isNonUnaryOperator(const Token& token) 157 | { 158 | return isOperator(token) && !isUnaryOperator(token); 159 | } 160 | 161 | inline bool isComparisonOperator(const Token& token) 162 | { 163 | switch (token.type()) 164 | { 165 | case OPERATOR_AND_SCALAR: 166 | case OPERATOR_AND_VECTOR: 167 | case OPERATOR_OR_SCALAR: 168 | case OPERATOR_OR_VECTOR: 169 | case OPERATOR_EQUAL: 170 | case OPERATOR_NOT_EQUAL: 171 | case OPERATOR_LESS: 172 | case OPERATOR_LESS_OR_EQUAL: 173 | case OPERATOR_GREATER: 174 | case OPERATOR_GREATER_OR_EQUAL: 175 | return true; 176 | default: 177 | return false; 178 | } 179 | } 180 | 181 | inline bool isWhitespace(const Token& token) 182 | { 183 | return token.type() == WHITESPACE; 184 | } 185 | 186 | inline bool isComment(const Token& token) 187 | { 188 | return token.type() == COMMENT; 189 | } 190 | 191 | inline bool isSymbol(const Token& token) 192 | { 193 | return token.type() == SYMBOL; 194 | } 195 | 196 | inline bool isEnd(const Token& token) 197 | { 198 | return token.type() == END; 199 | } 200 | 201 | inline bool isString(const Token& token) 202 | { 203 | return token.type() == STRING; 204 | } 205 | 206 | inline bool isSymbolic(const Token& token) 207 | { 208 | static const TokenType mask = SYMBOL | NUMBER | STRING; 209 | return (token.type() & mask) != 0; 210 | } 211 | 212 | inline bool isNumeric(const Token& token) 213 | { 214 | return (token.type() & NUMBER) != 0; 215 | } 216 | 217 | inline bool isCallOperator(const Token& token) 218 | { 219 | return token.type() == LPAREN || 220 | token.type() == LBRACKET || 221 | token.type() == LDBRACKET; 222 | } 223 | 224 | inline bool isAssignmentOperator(const Token& token) 225 | { 226 | switch (token.type()) 227 | { 228 | case OPERATOR_ASSIGN_LEFT: 229 | case OPERATOR_ASSIGN_LEFT_COLON: 230 | case OPERATOR_ASSIGN_LEFT_EQUALS: 231 | case OPERATOR_ASSIGN_LEFT_PARENT: 232 | case OPERATOR_ASSIGN_RIGHT: 233 | case OPERATOR_ASSIGN_RIGHT_PARENT: 234 | return true; 235 | default: 236 | 
return false; 237 | } 238 | } 239 | 240 | namespace detail { 241 | 242 | inline bool isHexDigit(char c) 243 | { 244 | if (c >= '0' && c <= '9') 245 | return true; 246 | else if (c >= 'a' && c <= 'f') 247 | return true; 248 | else if (c >= 'A' && c <= 'F') 249 | return true; 250 | return false; 251 | } 252 | 253 | inline int hexValue(char c) 254 | { 255 | if (c >= '0' && c <= '9') 256 | return c - '0'; 257 | else if (c >= 'a' && c <= 'f') 258 | return c - 'a' + 10; 259 | else if (c >= 'A' && c <= 'F') 260 | return c - 'A' + 10; 261 | 262 | return 0; 263 | } 264 | 265 | // Parses an octal escape sequence, e.g. '\012'. 266 | inline bool parseOctal(const char*& it, char*& output) 267 | { 268 | // Check for opening escape 269 | if (*it != '\\') 270 | return false; 271 | 272 | // Check for number following 273 | char lookahead = *(it + 1); 274 | if (lookahead < '0' || lookahead > '7') 275 | return false; 276 | ++it; 277 | 278 | // Begin parsing. Consume up to three numbers. 279 | unsigned char result = 0; 280 | const char* end = it + 3; 281 | for (; it != end; ++it) 282 | { 283 | char ch = *it; 284 | if ('0' <= ch && ch <= '7') 285 | result = 8 * result + ch - '0'; 286 | else 287 | break; 288 | } 289 | 290 | // Assign result, and return. 291 | *output++ = result; 292 | return true; 293 | } 294 | 295 | // Parse a hex escape sequence, e.g. '\xFF'. 296 | inline bool parseHex(const char*& it, char*& output) 297 | { 298 | // Check for opening escape. 299 | if (*it != '\\') 300 | return false; 301 | 302 | if (*(it + 1) != 'x') 303 | return false; 304 | 305 | if (!isHexDigit(*(it + 2))) 306 | return false; 307 | 308 | // Begin parsing. 309 | it += 2; 310 | unsigned char value = 0; 311 | const char* end = it + 2; 312 | for (; it != end; ++it) 313 | { 314 | int result = hexValue(*it); 315 | if (result == 0) 316 | break; 317 | value = 16 * value + result; 318 | } 319 | 320 | *output++ = value; 321 | return true; 322 | } 323 | 324 | // Parse a unicode escape sequence. 
325 | inline bool parseUnicode(const char*& it, char*& output) 326 | { 327 | if (*it != '\\') 328 | return false; 329 | 330 | char lookahead = *(it + 1); 331 | int size; 332 | if (lookahead == 'u') 333 | size = 4; 334 | else if (lookahead == 'U') 335 | size = 8; 336 | else 337 | return false; 338 | 339 | // Clone the input iterator (only set it on success) 340 | const char* clone = it; 341 | clone += 2; 342 | 343 | // Check for e.g. '\u{...}' 344 | // ^ 345 | bool delimited = *clone == '{'; 346 | clone += delimited; 347 | 348 | // Check for a hex digit. 349 | if (!isHexDigit(*clone)) 350 | return false; 351 | 352 | // Begin parsing hex digits 353 | wchar_t value = 0; 354 | const char* end = clone + size; 355 | for (; clone != end; ++clone) 356 | { 357 | if (!isHexDigit(*clone)) 358 | break; 359 | 360 | int hex = hexValue(*clone); 361 | value = 16 * value + hex; 362 | } 363 | 364 | // Eat a closing '}' if we had a starting '{'. 365 | if (delimited) 366 | { 367 | if (*clone != '}') 368 | return false; 369 | ++clone; 370 | } 371 | 372 | std::mbstate_t state; 373 | std::memset(&state, 0, sizeof(state)); 374 | index_type bytes = std::wcrtomb(output, value, &state); 375 | if (bytes == static_cast(-1)) 376 | return false; 377 | 378 | // Update iterator state 379 | it = clone; 380 | output += bytes; 381 | return true; 382 | } 383 | 384 | } // namespace detail 385 | 386 | inline std::string stringValue(const char* begin, const char* end) 387 | { 388 | if (begin == end) 389 | return std::string(); 390 | 391 | index_type n = end - begin; 392 | scoped_array buffer(new char[n + 1]); 393 | 394 | const char* it = begin; 395 | char* output = buffer; 396 | 397 | while (it < end) 398 | { 399 | if (*it == '\\') 400 | { 401 | if (detail::parseOctal(it, output) || 402 | detail::parseHex(it, output) || 403 | detail::parseUnicode(it, output)) 404 | { 405 | continue; 406 | } 407 | 408 | // Handle the rest 409 | ++it; 410 | switch (*it) 411 | { 412 | case 'a': *output++ = '\a'; break; 413 
| case 'b': *output++ = '\b'; break; 414 | case 'f': *output++ = '\f'; break; 415 | case 'n': *output++ = '\n'; break; 416 | case 'r': *output++ = '\r'; break; 417 | case 't': *output++ = '\t'; break; 418 | case 'v': *output++ = '\v'; break; 419 | case '\\': *output++ = '\\'; break; 420 | default: *output++ = *it; break; 421 | } 422 | ++it; 423 | } 424 | else 425 | { 426 | *output++ = *it++; 427 | } 428 | } 429 | 430 | // Ensure null termination, just in case 431 | *output++ = '\0'; 432 | 433 | // Construct the result string and return 434 | std::string result(buffer, output - buffer); 435 | return result; 436 | } 437 | 438 | inline std::string stringValue(const Token& token) 439 | { 440 | switch (token.type()) 441 | { 442 | case STRING: 443 | return stringValue(token.begin() + 1, token.end() - 1); 444 | case SYMBOL: 445 | if (*token.begin() == '`') 446 | return stringValue(token.begin() + 1, token.end() - 1); 447 | default: 448 | return stringValue(token.begin(), token.end()); 449 | } 450 | } 451 | 452 | } // namespace tokens 453 | 454 | inline std::string toString(tokens::TokenType type) 455 | { 456 | using namespace tokens; 457 | 458 | if (type == INVALID) return "invalid"; 459 | else if (type == END) return "end"; 460 | else if (type == EMPTY) return "empty"; 461 | else if (type == MISSING) return "missing"; 462 | else if (type == SEMI) return "semi"; 463 | else if (type == COMMA) return "comma"; 464 | else if (type == SYMBOL) return "symbol"; 465 | else if (type == COMMENT) return "comment"; 466 | else if (type == WHITESPACE) return "whitespace"; 467 | else if (type == STRING) return "string"; 468 | else if (type == NUMBER) return "number"; 469 | 470 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_BRACKET_MASK)) 471 | return "bracket"; 472 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_KEYWORD_MASK)) 473 | return "keyword"; 474 | else if (SOURCE_TOOLS_CHECK_MASK(type, SOURCE_TOOLS_OPERATOR_MASK)) 475 | return "operator"; 476 | 477 | return 
"unknown"; 478 | } 479 | 480 | inline std::string toString(const tokens::Token& token) 481 | { 482 | std::string contents; 483 | if (token.isType(tokens::END)) 484 | contents = ""; 485 | else if (token.isType(tokens::EMPTY)) 486 | contents = ""; 487 | else if (token.isType(tokens::MISSING)) 488 | contents = ""; 489 | else 490 | contents = token.contents(); 491 | 492 | static const int N = 1024; 493 | if (contents.size() > N / 2) 494 | contents = contents.substr(0, N / 2); 495 | char buff[N]; 496 | std::snprintf(buff, 497 | N, 498 | "[%4ld:%4ld]: %s", 499 | static_cast(token.row()), 500 | static_cast(token.column()), 501 | contents.c_str()); 502 | return buff; 503 | } 504 | 505 | inline std::ostream& operator<<(std::ostream& os, const tokens::Token& token) 506 | { 507 | return os << toString(token); 508 | } 509 | 510 | inline std::ostream& operator<<(std::ostream& os, const std::vector& tokens) 511 | { 512 | for (std::vector::const_iterator it = tokens.begin(); 513 | it != tokens.end(); 514 | ++it) 515 | { 516 | os << *it << std::endl; 517 | } 518 | 519 | return os; 520 | } 521 | 522 | } // namespace sourcetools 523 | 524 | #endif /* SOURCETOOLS_TOKENIZATION_TOKEN_H */ 525 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/Tokenizer.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKENIZER_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKENIZER_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #include 9 | #include 10 | #include 11 | 12 | namespace sourcetools { 13 | namespace tokenizer { 14 | 15 | class Tokenizer 16 | { 17 | private: 18 | typedef tokens::Token Token; 19 | typedef cursors::TextCursor TextCursor; 20 | typedef tokens::TokenType TokenType; 21 | 22 | private: 23 | 24 | // Tokenization ---- 25 | 26 | void consumeToken(TokenType type, 27 | index_type length, 28 | Token* pToken) 29 | { 30 | *pToken = Token(cursor_, 
type, length); 31 | cursor_.advance(length); 32 | } 33 | 34 | template 35 | void consumeUntil(char ch, 36 | TokenType type, 37 | Token* pToken) 38 | { 39 | TextCursor lookahead = cursor_; 40 | 41 | bool success = false; 42 | index_type distance = 0; 43 | 44 | while (lookahead != lookahead.end()) { 45 | lookahead.advance(); 46 | ++distance; 47 | 48 | if (SkipEscaped && lookahead.peek() == '\\') { 49 | lookahead.advance(); 50 | ++distance; 51 | continue; 52 | } 53 | 54 | if (lookahead.peek() == ch) { 55 | success = true; 56 | break; 57 | } 58 | } 59 | 60 | if (success) { 61 | consumeToken(type, distance + 1, pToken); 62 | } else { 63 | consumeToken( 64 | InvalidOnError ? tokens::INVALID : type, 65 | distance, 66 | pToken 67 | ); 68 | } 69 | } 70 | 71 | void consumeUserOperator(Token* pToken) 72 | { 73 | consumeUntil('%', tokens::OPERATOR_USER, pToken); 74 | } 75 | 76 | void consumeComment(Token* pToken) 77 | { 78 | consumeUntil('\n', tokens::COMMENT, pToken); 79 | } 80 | 81 | void consumeQuotedSymbol(Token* pToken) 82 | { 83 | consumeUntil('`', tokens::SYMBOL, pToken); 84 | } 85 | 86 | void consumeQString(Token* pToken) 87 | { 88 | consumeUntil('\'', tokens::STRING, pToken); 89 | } 90 | 91 | void consumeQQString(Token* pToken) 92 | { 93 | consumeUntil('"', tokens::STRING, pToken); 94 | } 95 | 96 | void consumeRawString(Token* pToken) 97 | { 98 | // clone cursor 99 | TextCursor cursor = cursor_; 100 | 101 | // save current position 102 | index_type start = cursor.offset(); 103 | 104 | // consume a leading 'r' or 'R' 105 | char ch = cursor.peek(); 106 | bool ok = ch == 'r' || ch == 'R'; 107 | if (!ok) 108 | return consumeToken(tokens::INVALID, 1, pToken); 109 | cursor.advance(); 110 | 111 | // consume a quote, saving what we saw 112 | char quote; 113 | switch (cursor.peek()) 114 | { 115 | case '"': 116 | quote = '"'; 117 | break; 118 | case '\'': 119 | quote = '\''; 120 | break; 121 | default: 122 | return consumeToken(tokens::INVALID, 2, pToken); 123 | } 124 | 
cursor.advance(); 125 | 126 | // consume dashes, counting the number of dashes seen 127 | int dashes = 0; 128 | while (cursor.peek() == '-') 129 | { 130 | dashes += 1; 131 | cursor.advance(); 132 | } 133 | 134 | // consume the delimiter, saving what we saw 135 | char lhs; 136 | switch (cursor.peek()) 137 | { 138 | case '(': 139 | case '{': 140 | case '[': 141 | lhs = cursor.peek(); 142 | break; 143 | default: 144 | return consumeToken(tokens::INVALID, 145 | cursor.offset() - start + 1, 146 | pToken); 147 | } 148 | cursor.advance(); 149 | 150 | // compute complement for delimiter 151 | char rhs; 152 | switch (lhs) 153 | { 154 | case '(': rhs = ')'; break; 155 | case '{': rhs = '}'; break; 156 | case '[': rhs = ']'; break; 157 | default: 158 | return consumeToken(tokens::INVALID, 159 | cursor.offset() - start + 1, 160 | pToken); 161 | } 162 | 163 | // start consuming things until we find the closing delimiter 164 | for (; cursor.peek() != '\0'; cursor.advance()) 165 | { 166 | // check for right delimiter 167 | if (cursor.peek() != rhs) 168 | goto AGAIN; 169 | cursor.advance(); 170 | 171 | // consume dashes 172 | for (int i = 0; i < dashes; i++) 173 | { 174 | if (cursor.peek() != '-') 175 | goto AGAIN; 176 | cursor.advance(); 177 | } 178 | 179 | // check for matching quote 180 | if (cursor.peek() != quote) 181 | goto AGAIN; 182 | cursor.advance(); 183 | 184 | // if we got this far, we successfully matched the raw string 185 | return consumeToken( 186 | tokens::STRING, 187 | cursor.offset() - start, 188 | pToken 189 | ); 190 | 191 | // if we got here, we need to restart the loop 192 | AGAIN: ; 193 | } 194 | 195 | // if we got here, we failed to match 196 | return consumeToken( 197 | tokens::INVALID, 198 | cursor.offset() - start, 199 | pToken 200 | ); 201 | 202 | } 203 | 204 | bool isStartOfRawString(const TextCursor& cursor) 205 | { 206 | char ch = '\0'; 207 | 208 | // check for leading 'r' or 'R' 209 | ch = cursor.peek(0); 210 | bool ok = ch == 'r' || ch == 'R'; 211 
| if (!ok) 212 | return false; 213 | 214 | // check for quote 215 | ch = cursor.peek(1); 216 | return ch == '\'' || ch == '"'; 217 | } 218 | 219 | // NOTE: Don't tokenize '-' or '+' as part of number; instead 220 | // it's parsed as a unary operator. 221 | bool isStartOfNumber() 222 | { 223 | char ch = cursor_.peek(); 224 | if (utils::isDigit(ch)) 225 | return true; 226 | if (ch == '.') 227 | return utils::isDigit(cursor_.peek(1)); 228 | return false; 229 | } 230 | 231 | bool isStartOfSymbol() 232 | { 233 | return utils::isValidForStartOfRSymbol(cursor_.peek()); 234 | } 235 | 236 | bool consumeHexadecimalNumber(Token* pToken) 237 | { 238 | index_type distance = 0; 239 | 240 | // Detect the leading '0'. 241 | if (cursor_.peek(distance) != '0') 242 | return false; 243 | ++distance; 244 | 245 | // Detect a 'x' or 'X'. 246 | if (!(cursor_.peek(distance) == 'x' || cursor_.peek(distance) == 'X')) 247 | return false; 248 | ++distance; 249 | 250 | // Check and consume all alphanumeric characters. 251 | // The number is valid if the characters are valid 252 | // hexadecimal characters (0-9, a-f, A-F). The number 253 | // can also end with an 'i' (for an imaginary number) 254 | // or with an 'L' for an integer. 255 | if (!utils::isHexDigit(cursor_.peek(distance))) 256 | { 257 | consumeToken(tokens::INVALID, distance, pToken); 258 | return false; 259 | } 260 | 261 | bool success = true; 262 | char peek = cursor_.peek(distance); 263 | while (utils::isAlphaNumeric(peek) && peek != '\0') { 264 | 265 | // If we encounter an 'i' or an 'L', assume 266 | // that this ends the identifier. 267 | if (peek == 'i' || peek == 'L') 268 | { 269 | ++distance; 270 | break; 271 | } 272 | 273 | if (!utils::isHexDigit(peek)) 274 | success = false; 275 | 276 | ++distance; 277 | peek = cursor_.peek(distance); 278 | } 279 | 280 | consumeToken(success ? 
tokens::NUMBER : tokens::INVALID, distance, pToken); 281 | return true; 282 | } 283 | 284 | void consumeNumber(Token* pToken) 285 | { 286 | bool success = true; 287 | index_type distance = 0; 288 | 289 | // NOTE: A leading '-' or '+' is not consumed as part of 290 | // the number. 291 | 292 | // Try parsing this as a hexadecimal number first (e.g. '0xabc'). 293 | if (consumeHexadecimalNumber(pToken)) 294 | return; 295 | 296 | // Consume digits 297 | while (utils::isDigit(cursor_.peek(distance))) 298 | ++distance; 299 | 300 | // Consume a dot for decimals 301 | // Note: '.5' is a valid specification for a number 302 | // So is '100.'; ie, with a trailing decimal. 303 | if (cursor_.peek(distance) == '.') { 304 | ++distance; 305 | while (utils::isDigit(cursor_.peek(distance))) 306 | ++distance; 307 | } 308 | 309 | // Consume 'e', 'E' for exponential notation 310 | if (cursor_.peek(distance) == 'e' || cursor_.peek(distance) == 'E') { 311 | ++distance; 312 | 313 | // Consume a '-' or a '+' for a negative number 314 | if (cursor_.peek(distance) == '-' || cursor_.peek(distance) == '+') 315 | ++distance; 316 | 317 | // Parse another set of numbers following the E 318 | success = utils::isDigit(cursor_.peek(distance)); 319 | while (utils::isDigit(cursor_.peek(distance))) 320 | ++distance; 321 | 322 | // Consume '.' and following numbers. Note that this is 323 | // not really a valid number for R but it's better to tokenize 324 | // this is a single entity (and then report failure later) 325 | if (cursor_.peek(distance) == '.') { 326 | success = false; 327 | ++distance; 328 | while (utils::isDigit(cursor_.peek(distance))) 329 | ++distance; 330 | } 331 | } 332 | 333 | // Consume a final 'L' for integer literals, 334 | // or a final 'i' for complex numbers. 335 | if (cursor_.peek(distance) == 'L' || 336 | cursor_.peek(distance) == 'i') 337 | { 338 | ++distance; 339 | } 340 | 341 | consumeToken(success ? 
tokens::NUMBER : tokens::INVALID, distance, pToken); 342 | } 343 | 344 | void consumeSymbol(Token* pToken) 345 | { 346 | index_type distance = 1; 347 | char ch = cursor_.peek(distance); 348 | while (utils::isValidForRSymbol(ch)) { 349 | ++distance; 350 | ch = cursor_.peek(distance); 351 | } 352 | 353 | const char* ptr = &*(cursor_.begin() + cursor_.offset()); 354 | consumeToken(tokens::symbolType(ptr, distance), distance, pToken); 355 | } 356 | 357 | public: 358 | 359 | Tokenizer(const char* code, index_type n) 360 | : cursor_(code, n) 361 | { 362 | } 363 | 364 | bool tokenize(Token* pToken) 365 | { 366 | if (cursor_ >= cursor_.end()) 367 | { 368 | *pToken = Token(tokens::END); 369 | return false; 370 | } 371 | 372 | char ch = cursor_.peek(); 373 | int n = 0; 374 | 375 | // Block-related tokens 376 | if (ch == '{') 377 | consumeToken(tokens::LBRACE, 1, pToken); 378 | else if (ch == '}') 379 | consumeToken(tokens::RBRACE, 1, pToken); 380 | else if (ch == '(') 381 | consumeToken(tokens::LPAREN, 1, pToken); 382 | else if (ch == ')') 383 | consumeToken(tokens::RPAREN, 1, pToken); 384 | else if (ch == '[') { 385 | if (cursor_.peek(1) == '[') { 386 | tokenStack_.push(tokens::LDBRACKET); 387 | consumeToken(tokens::LDBRACKET, 2, pToken); 388 | } else { 389 | tokenStack_.push(tokens::LBRACKET); 390 | consumeToken(tokens::LBRACKET, 1, pToken); 391 | } 392 | } else if (ch == ']') { 393 | if (tokenStack_.empty()) { 394 | consumeToken(tokens::INVALID, 1, pToken); 395 | } else if (tokenStack_.top() == tokens::LDBRACKET) { 396 | tokenStack_.pop(); 397 | if (cursor_.peek(1) == ']') 398 | consumeToken(tokens::RDBRACKET, 2, pToken); 399 | else 400 | consumeToken(tokens::INVALID, 1, pToken); 401 | } else { 402 | tokenStack_.pop(); 403 | consumeToken(tokens::RBRACKET, 1, pToken); 404 | } 405 | } 406 | 407 | // Operators 408 | else if (ch == '<') // <<-, <=, <-, < 409 | { 410 | char next = cursor_.peek(1); 411 | if (next == '-') // <- 412 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT, 2, 
pToken); 413 | else if (next == '=') // <= 414 | consumeToken(tokens::OPERATOR_LESS_OR_EQUAL, 2, pToken); 415 | else if (next == '<' && cursor_.peek(2) == '-') 416 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT_PARENT, 3, pToken); 417 | else 418 | consumeToken(tokens::OPERATOR_LESS, 1, pToken); 419 | } 420 | 421 | else if (ch == '>') // >=, > 422 | { 423 | if (cursor_.peek(1) == '=') 424 | consumeToken(tokens::OPERATOR_GREATER_OR_EQUAL, 2, pToken); 425 | else 426 | consumeToken(tokens::OPERATOR_GREATER, 1, pToken); 427 | } 428 | else if (ch == '=') // '==', '=>', '=' 429 | { 430 | char next = cursor_.peek(1); 431 | if (next == '>') 432 | consumeToken(tokens::OPERATOR_PIPE_BIND, 2, pToken); 433 | else if (next == '=') 434 | consumeToken(tokens::OPERATOR_EQUAL, 2, pToken); 435 | else 436 | consumeToken(tokens::OPERATOR_ASSIGN_LEFT_EQUALS, 1, pToken); 437 | } 438 | else if (ch == '|') // '||', '|>', '|' 439 | { 440 | char next = cursor_.peek(1); 441 | if (next == '>') 442 | consumeToken(tokens::OPERATOR_PIPE, 2, pToken); 443 | else if (next == '|') 444 | consumeToken(tokens::OPERATOR_OR_SCALAR, 2, pToken); 445 | else 446 | consumeToken(tokens::OPERATOR_OR_VECTOR, 1, pToken); 447 | } 448 | else if (ch == '&') // '&&', '&' 449 | { 450 | if (cursor_.peek(1) == '&') 451 | consumeToken(tokens::OPERATOR_AND_SCALAR, 2, pToken); 452 | else 453 | consumeToken(tokens::OPERATOR_AND_VECTOR, 1, pToken); 454 | } 455 | else if (ch == '*') // **, * 456 | { 457 | if (cursor_.peek(1) == '*') 458 | consumeToken(tokens::OPERATOR_EXPONENTATION_STARS, 2, pToken); 459 | else 460 | consumeToken(tokens::OPERATOR_MULTIPLY, 1, pToken); 461 | } 462 | else if (ch == ':') // ':::', '::', ':=', ':' 463 | { 464 | if (cursor_.peek(1) == ':') 465 | { 466 | if (cursor_.peek(2) == ':') 467 | consumeToken(tokens::OPERATOR_NAMESPACE_ALL, 3, pToken); 468 | else 469 | consumeToken(tokens::OPERATOR_NAMESPACE_EXPORTS, 2, pToken); 470 | } 471 | else if (cursor_.peek(1) == '=') 472 | 
consumeToken(tokens::OPERATOR_ASSIGN_LEFT_COLON, 2, pToken); 473 | else 474 | consumeToken(tokens::OPERATOR_SEQUENCE, 1, pToken); 475 | } 476 | else if (ch == '!') 477 | { 478 | if (cursor_.peek(1) == '=') 479 | consumeToken(tokens::OPERATOR_NOT_EQUAL, 2, pToken); 480 | else 481 | consumeToken(tokens::OPERATOR_NEGATION, 1, pToken); 482 | } 483 | else if (ch == '-') // '->>', '->', '-' 484 | { 485 | if (cursor_.peek(1) == '>') 486 | { 487 | if (cursor_.peek(2) == '>') 488 | consumeToken(tokens::OPERATOR_ASSIGN_RIGHT_PARENT, 3, pToken); 489 | else 490 | consumeToken(tokens::OPERATOR_ASSIGN_RIGHT, 2, pToken); 491 | } 492 | else 493 | consumeToken(tokens::OPERATOR_MINUS, 1, pToken); 494 | } 495 | else if (ch == '+') 496 | consumeToken(tokens::OPERATOR_PLUS, 1, pToken); 497 | else if (ch == '~') 498 | consumeToken(tokens::OPERATOR_FORMULA, 1, pToken); 499 | else if (ch == '?') 500 | consumeToken(tokens::OPERATOR_HELP, 1, pToken); 501 | else if (ch == '/') 502 | consumeToken(tokens::OPERATOR_DIVIDE, 1, pToken); 503 | else if (ch == '@') 504 | consumeToken(tokens::OPERATOR_AT, 1, pToken); 505 | else if (ch == '$') 506 | consumeToken(tokens::OPERATOR_DOLLAR, 1, pToken); 507 | else if (ch == '^') 508 | consumeToken(tokens::OPERATOR_HAT, 1, pToken); 509 | 510 | // User operators 511 | else if (ch == '%') 512 | consumeUserOperator(pToken); 513 | 514 | // Punctuation-related tokens 515 | else if (ch == ',') 516 | consumeToken(tokens::COMMA, 1, pToken); 517 | else if (ch == ';') 518 | consumeToken(tokens::SEMI, 1, pToken); 519 | 520 | // Whitespace 521 | else if (utils::countWhitespaceBytes(cursor_, &n)) 522 | consumeToken(tokens::WHITESPACE, n, pToken); 523 | 524 | // Strings and symbols 525 | else if (ch == '\'') 526 | consumeQString(pToken); 527 | else if (ch == '"') 528 | consumeQQString(pToken); 529 | else if (ch == '`') 530 | consumeQuotedSymbol(pToken); 531 | else if (isStartOfRawString(cursor_)) 532 | consumeRawString(pToken); 533 | 534 | // Comments 535 | else if (ch 
== '#') 536 | consumeComment(pToken); 537 | 538 | // Number 539 | else if (isStartOfNumber()) 540 | consumeNumber(pToken); 541 | 542 | // Symbol 543 | else if (isStartOfSymbol()) 544 | consumeSymbol(pToken); 545 | 546 | // Nothing matched -- error 547 | else 548 | consumeToken(tokens::INVALID, 1, pToken); 549 | 550 | return true; 551 | } 552 | 553 | Token peek(index_type lookahead = 1) 554 | { 555 | Tokenizer clone(*this); 556 | 557 | Token result(tokens::END); 558 | for (index_type i = 0; i < lookahead; ++i) { 559 | if (!clone.tokenize(&result)) { 560 | break; 561 | } 562 | } 563 | 564 | return result; 565 | } 566 | 567 | private: 568 | TextCursor cursor_; 569 | std::stack > tokenStack_; 570 | }; 571 | 572 | } // namespace tokenizer 573 | 574 | inline std::vector tokenize(const char* code, index_type n) 575 | { 576 | typedef tokenizer::Tokenizer Tokenizer; 577 | typedef tokens::Token Token; 578 | 579 | std::vector tokens; 580 | if (n == 0) 581 | return tokens; 582 | 583 | Token token; 584 | Tokenizer tokenizer(code, n); 585 | while (tokenizer.tokenize(&token)) 586 | tokens.push_back(token); 587 | 588 | return tokens; 589 | } 590 | 591 | inline std::vector tokenize(const std::string& code) 592 | { 593 | return tokenize(code.data(), code.size()); 594 | } 595 | 596 | } // namespace sourcetools 597 | 598 | #endif /* SOURCETOOLS_TOKENIZATION_TOKENIZER_H */ 599 | -------------------------------------------------------------------------------- /inst/include/sourcetools/tokenization/tokenization.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_TOKENIZATION_TOKENIZATION_H 2 | #define SOURCETOOLS_TOKENIZATION_TOKENIZATION_H 3 | 4 | #include 5 | #include 6 | #include 7 | 8 | #endif /* SOURCETOOLS_TOKENIZATION_TOKENIZATION_H */ 9 | -------------------------------------------------------------------------------- /inst/include/sourcetools/utf8/utf8.h: 
-------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_UTF8_UTF8_H 2 | #define SOURCETOOLS_UTF8_UTF8_H 3 | 4 | #include 5 | 6 | #include 7 | 8 | namespace sourcetools { 9 | namespace utf8 { 10 | 11 | namespace detail { 12 | static const unsigned char mask[] = { 13 | 0, // 00000000 14 | 0x7F, // 01111111 15 | 0x1F, // 00011111 16 | 0x0F, // 00001111 17 | 0x07, // 00000111 18 | 0x03, // 00000011 19 | 0x01 // 00000001 20 | }; 21 | } // namespace detail 22 | 23 | class iterator 24 | { 25 | public: 26 | iterator(const char* data) 27 | : data_(reinterpret_cast(data)), 28 | offset_(0) 29 | { 30 | } 31 | 32 | iterator(const iterator& other) 33 | : data_(other.data_), 34 | offset_(other.offset_) 35 | { 36 | } 37 | 38 | wchar_t operator*() 39 | { 40 | index_type n = size(); 41 | if (n == 0 || n > 6) 42 | return -1; 43 | 44 | const unsigned char* it = data_ + offset_; 45 | wchar_t ch = (*it++) & detail::mask[n]; 46 | for (index_type i = 1; i < n; ++i) 47 | { 48 | ch <<= 6; 49 | ch |= (*it++) & 0x3F; 50 | } 51 | 52 | return ch; 53 | } 54 | 55 | iterator& operator++() 56 | { 57 | offset_ += size(); 58 | return *this; 59 | } 60 | 61 | iterator operator++(int) 62 | { 63 | iterator copy(*this); 64 | operator++(); 65 | return copy; 66 | } 67 | 68 | bool operator==(const iterator& it) 69 | { 70 | return 71 | data_ + offset_ == 72 | it.data_ + it.offset_; 73 | } 74 | 75 | bool operator!=(const iterator& it) 76 | { 77 | return 78 | data_ + offset_ != 79 | it.data_ + it.offset_; 80 | } 81 | 82 | private: 83 | 84 | int size() 85 | { 86 | unsigned char ch = data_[offset_]; 87 | if (ch == 0) 88 | return 0; 89 | else if (ch < 192) 90 | return 1; 91 | else if (ch < 224) 92 | return 2; 93 | else if (ch < 240) 94 | return 3; 95 | else if (ch < 248) 96 | return 4; 97 | else if (ch < 252) 98 | return 5; 99 | else if (ch < 254) 100 | return 6; 101 | 102 | // TODO: on error? 
103 | return 1; 104 | } 105 | 106 | private: 107 | 108 | const unsigned char* data_; 109 | index_type offset_; 110 | }; 111 | 112 | } // namespace utf8 113 | } // namespace sourcetools 114 | 115 | #endif /* SOURCETOOLS_UTF8_UTF8_H */ 116 | -------------------------------------------------------------------------------- /inst/include/sourcetools/validation/SyntaxValidator.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H 2 | #define SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H 3 | 4 | #include 5 | #include 6 | 7 | #include 8 | #include 9 | #include 10 | 11 | namespace sourcetools { 12 | namespace validators { 13 | 14 | class SyntaxError { 15 | private: 16 | typedef collections::Position Position; 17 | typedef cursors::TokenCursor TokenCursor; 18 | typedef tokens::Token Token; 19 | 20 | public: 21 | 22 | explicit SyntaxError(const Position& position, 23 | const std::string& message) 24 | : position_(position), 25 | message_(message) 26 | {} 27 | 28 | std::string report() 29 | { 30 | std::ostringstream os; 31 | os << "[" << position_.row << ":" << position_.column << "]: " 32 | << message_; 33 | 34 | return os.str(); 35 | } 36 | 37 | index_type row() const { return position_.row; } 38 | index_type column() const { return position_.column; } 39 | const Position& position() const { return position_; } 40 | const std::string& message() const { return message_; } 41 | 42 | private: 43 | Position position_; 44 | std::string message_; 45 | }; 46 | 47 | class SyntaxValidator { 48 | 49 | private: 50 | typedef tokens::Token Token; 51 | typedef cursors::TokenCursor TokenCursor; 52 | typedef tokens::TokenType TokenType; 53 | 54 | void unexpectedToken(const Token& token, const std::string& expected = std::string()) 55 | { 56 | std::string message = "unexpected token '" + token.contents() + "'"; 57 | if (!expected.empty()) 58 | message += " (expected '" + expected + "')"; 59 | 60 | 
errors_.push_back(SyntaxError(token.position(), message)); 61 | } 62 | 63 | void updateBracketStack(const Token& token, std::vector* pStack) 64 | { 65 | using namespace tokens; 66 | 67 | // Update brace state 68 | if (isLeftBracket(token)) { 69 | pStack->push_back(token.type()); 70 | } else if (isRightBracket(token)) { 71 | index_type size = pStack->size(); 72 | TokenType last = pStack->at(size - 1); 73 | if (size == 1) { 74 | unexpectedToken(token); 75 | } else { 76 | if (!isComplement(token.type(), last)) 77 | unexpectedToken(token, toString(complement(last))); 78 | pStack->pop_back(); 79 | } 80 | } 81 | } 82 | 83 | public: 84 | 85 | explicit SyntaxValidator(const std::vector& tokens) 86 | { 87 | if (tokens.empty()) 88 | return; 89 | 90 | TokenCursor cursor(tokens); 91 | std::vector stack; 92 | stack.push_back(tokens::INVALID); 93 | 94 | const Token* pThisToken = &(cursor.currentToken()); 95 | const Token* pPrevToken = pThisToken; 96 | 97 | while (cursor.moveToNextSignificantToken()) { 98 | 99 | pPrevToken = pThisToken; 100 | pThisToken = &(cursor.currentToken()); 101 | 102 | updateBracketStack(cursor.currentToken(), &stack); 103 | executeValidators(*pPrevToken, *pThisToken); 104 | 105 | } 106 | } 107 | 108 | const std::vector& errors() const { return errors_; } 109 | 110 | private: 111 | 112 | void executeValidators(const tokens::Token& prevToken, 113 | const tokens::Token& thisToken) 114 | { 115 | using namespace tokens; 116 | 117 | if (isOperator(prevToken)) { 118 | 119 | // Operator followed non-unary operator. 120 | if (isNonUnaryOperator(thisToken)) 121 | unexpectedToken(thisToken); 122 | 123 | // Operator (other than =) followed by any kind of right bracket. 124 | // We need to allow e.g. 'parse(text = )'. 125 | if (isRightBracket(thisToken) && !prevToken.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 126 | unexpectedToken(thisToken); 127 | 128 | // Operator followed by '[' or '[['. 
129 | if (thisToken.isType(tokens::LBRACKET) || 130 | thisToken.isType(tokens::LDBRACKET)) 131 | unexpectedToken(thisToken); 132 | } 133 | 134 | else if (isSymbolic(prevToken)) { 135 | 136 | // Two symbols on the same line. 137 | if (isSymbolic(thisToken) && prevToken.row() == thisToken.row()) 138 | unexpectedToken(thisToken); 139 | } 140 | 141 | } 142 | 143 | std::vector errors_; 144 | 145 | }; 146 | 147 | } // namespace validators 148 | } // namespace sourcetools 149 | 150 | #endif /* SOURCETOOLS_VALIDATION_SYNTAX_VALIDATOR_H */ 151 | -------------------------------------------------------------------------------- /inst/include/sourcetools/validation/validation.h: -------------------------------------------------------------------------------- 1 | #ifndef SOURCETOOLS_VALIDATION_VALIDATION_H 2 | #define SOURCETOOLS_VALIDATION_VALIDATION_H 3 | 4 | #include 5 | 6 | #endif /* SOURCETOOLS_VALIDATION_VALIDATION_H */ 7 | -------------------------------------------------------------------------------- /man/read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{read} 4 | \alias{read} 5 | \alias{read_lines} 6 | \alias{read_bytes} 7 | \alias{read_lines_bytes} 8 | \title{Read the Contents of a File} 9 | \usage{ 10 | read(path) 11 | 12 | read_lines(path) 13 | 14 | read_bytes(path) 15 | 16 | read_lines_bytes(path) 17 | } 18 | \arguments{ 19 | \item{path}{A file path.} 20 | } 21 | \description{ 22 | Read the contents of a file into a string (or, in the case of 23 | \code{read_lines}, a vector of strings). 
24 | } 25 | -------------------------------------------------------------------------------- /man/register_routines.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/register.R 3 | \name{register_routines} 4 | \alias{register_routines} 5 | \title{Register Native Routines} 6 | \usage{ 7 | register_routines(package = ".", prefix = "C_", dynamic.symbols = FALSE) 8 | } 9 | \arguments{ 10 | \item{package}{The path to an \R package.} 11 | 12 | \item{prefix}{The prefix to assign to the \R objects 13 | generated that map to each routine.} 14 | 15 | \item{dynamic.symbols}{Boolean; should dynamic symbol lookup 16 | be enabled?} 17 | } 18 | \description{ 19 | Discover and register native routines in a package. 20 | Functions to be registered should be prefixed with the 21 | `// [[export()]]` attribute. 22 | } 23 | -------------------------------------------------------------------------------- /man/tokenize-methods.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{tokenize_file} 4 | \alias{tokenize_file} 5 | \alias{tokenize_string} 6 | \alias{tokenize} 7 | \title{Tokenize R Code} 8 | \usage{ 9 | tokenize_file(path) 10 | 11 | tokenize_string(string) 12 | 13 | tokenize(file = "", text = NULL) 14 | } 15 | \arguments{ 16 | \item{file, path}{A file path.} 17 | 18 | \item{text, string}{\R code as a character vector of length one.} 19 | } 20 | \value{ 21 | A \code{data.frame} with the following columns: 22 | 23 | \tabular{ll}{ 24 | \code{value} \tab The token's contents, as a string. \cr 25 | \code{row} \tab The row where the token is located. \cr 26 | \code{column} \tab The column where the token is located. \cr 27 | \code{type} \tab The token type, as a string. 
\cr 28 | } 29 | } 30 | \description{ 31 | Tools for tokenizing \R code. 32 | } 33 | \note{ 34 | Line numbers are determined by existence of the \code{\\n} 35 | line feed character, under the assumption that code being tokenized 36 | will use either \code{\\n} to indicate newlines (as on modern 37 | Unix systems), or \code{\\r\\n} as on Windows. 38 | } 39 | \examples{ 40 | tokenize_string("x <- 1 + 2") 41 | } 42 | -------------------------------------------------------------------------------- /man/validate_syntax.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sourcetools.R 3 | \name{validate_syntax} 4 | \alias{validate_syntax} 5 | \title{Find Syntax Errors} 6 | \usage{ 7 | validate_syntax(string) 8 | } 9 | \arguments{ 10 | \item{string}{A character vector (of length one).} 11 | } 12 | \description{ 13 | Find syntax errors in a string of \R code. 14 | } 15 | -------------------------------------------------------------------------------- /notes/notes-tdop.R: -------------------------------------------------------------------------------- 1 | ## An adaptation of http://effbot.org/zone/simple-top-down-parsing.htm 2 | ## for R. We examine a super simple language (calculator) 3 | ## that consists only of '+', '*', and single-digit numbers 4 | ## (no whitespace). For example, "1+2*3+4" is a valid 5 | ## program, doing what you expect. More comments are 6 | ## included inline to help make sense of what's going on. 7 | ## 8 | ## While simple, this example showcases the main points 9 | ## needed to understand top-down operator precedence 10 | ## parsing. 11 | 12 | ## First, some simple utility functions.
13 | 14 | # Check whether a token is an operator (ie, '+' or '*') 15 | is_operator <- function(token) { 16 | token == "+" || token == "*" 17 | } 18 | 19 | # Single-digit numbers 20 | is_number <- function(token) { 21 | token %in% as.character(0:9) 22 | } 23 | 24 | # We 'tokenize' a program (string of code) just by splitting 25 | # it. So "1+2" becomes c("1", "+", "2"). Obviously a 'real' 26 | # tokenizer would tokenize incrementally and separate words 27 | # etc. but tokenization is not the interesting part of this 28 | # example, so we just keep it simple. 29 | tokenize <- function(program) { 30 | strsplit(program, "", fixed = TRUE)[[1]] 31 | } 32 | 33 | # A simple tokenizer 'class' that accepts a program, 34 | # tokenizes it, and returns a method that accesses the next 35 | # token (if available). Elements postfixed with '_' are 36 | # 'private' (hidden in the closure); access to them is made 37 | # available through 'public' functions exported as part of 38 | # the list object. 39 | Tokenizer <- function(program) { 40 | tokens_ <- tokenize(program) 41 | index_ <- 0 42 | n_ <- length(tokens_) 43 | list( 44 | tokenize = function() { 45 | index_ <<- index_ + 1 46 | if (index_ <= n_) 47 | tokens_[[index_]] 48 | else 49 | "" 50 | } 51 | ) 52 | } 53 | 54 | # Our 'Parser' class will be used to construct 55 | # our parse tree (an AST). 56 | Parser <- function(tokenizer) { 57 | 58 | tokenizer_ <- tokenizer 59 | 60 | # We save a lookahead token, to help inform what action we 61 | # should take as we parse. It needs to exist as a private 62 | # variable so that the various recursing functions see the 63 | # correct 'state' of the program. 64 | lookahead_ <- tokenizer_$tokenize() 65 | 66 | # A hacky helper function for printing debug output when 67 | # running our parser. Don't worry too much about this. 
68 | indent <- function() { 69 | paste( 70 | paste(character(length(sys.calls())), collapse = "-"), 71 | "-> ", 72 | sep = "" 73 | ) 74 | } 75 | 76 | # The left-binding precedence for a token. The important 77 | # thing is that '*' has a higher precedence than '+'. This 78 | # function either receives operators, or a special 'end of 79 | # line' token, implying that there is nothing left to 80 | # parse. We give it a left-binding precedence of 0, to 81 | # indicate that parsing should end now. 82 | precedence <- function(token) { 83 | if (token == "+") 84 | 10 85 | else if (token == "*") 86 | 20 87 | else if (token == "") 88 | 0 89 | else 90 | stop("unexpected token '", token, "'; expected operator or end-of-parse") 91 | } 92 | 93 | # Handling of 'null denotation' tokens. This is for tokens 94 | # that are discovered at the start of an expression; ie, 95 | # unary operators, or regular old numbers. Note how for 96 | # the '+' operator, we simply construct a single-element 97 | # node with "+" on the left-hand side, and the new 98 | # expression on the right-hand side. 99 | parsePrefixExpression <- function(token) { 100 | if (token == "+") 101 | call(token, parseTopLevelExpression(100)) 102 | else if (is_number(token)) 103 | as.numeric(token) 104 | else 105 | stop("unexpected token '", token, "'") 106 | } 107 | 108 | # 'led', for 'left denotation', is used when a token 109 | # appears within a construct (ie, when a binary operator 110 | # is encountered). This function will be called once a 111 | # binary operator is encountered, with 'lhs' being that 112 | # operator, and 'rhs' being the current parse tree. Each 113 | # call to 'led' constructs a new node, with our 'lhs' 114 | # operator as the parent, the current parse tree ('rhs') 115 | # as the left child, and the next part of the expression 116 | # as the right child. 
117 | parseInfixExpression <- function(lhs, rhs) { 118 | if (!is_operator(lhs)) 119 | stop("unexpected token '", lhs, "'; expecting an operator") 120 | call(lhs, rhs, parseTopLevelExpression(precedence(lhs))) 121 | } 122 | 123 | # This is the entry-point that parses a whole expression. 124 | parseTopLevelExpression <- function(rbp = 0) { 125 | 126 | # Save the current token in 'token', and advance to the 127 | # next token. 128 | # 129 | # Why do we need to save the token in a 'global' 130 | # variable? When the various parse recursions end, we 131 | # need to make sure those routines are seeing the 132 | # current state, rather than their own state. 133 | token <- lookahead_ 134 | lookahead_ <<- tokenizer_$tokenize() 135 | 136 | # Parse the 'null denotation' expression. This 137 | # represents tokens that are discovered at the beginning 138 | # of an expression. We expect this to handle both unary 139 | # operators (wherein 'nud' will recurse until 140 | # discovering a non-operator token), and numeric tokens 141 | # (which end the recursion). 142 | cat(indent(), "lhs <- parsePrefixExpression(", format(token), ")\n", sep = "") 143 | node <- parsePrefixExpression(token) 144 | 145 | # Now, we need to construct the right-hand side of this 146 | # expression. The 'lbp' tells us whether we can continue 147 | # 'joining' expressions into the current parse tree. 148 | # TODO: make this more clear 149 | while (rbp < precedence(lookahead_)) { 150 | 151 | # Save the current token, and get the next token. 152 | token <- lookahead_ 153 | lookahead_ <<- tokenizer_$tokenize() # NOTE(review): fixed — was 'tokenizer$tokenize()', which reached past the closure to a same-named global that only exists in the demo script below; use the captured 'tokenizer_' as on line 134 154 | 155 | # Construct a new 'node' for our tree. Notice how we 156 | # 'grow' the left-hand side here. 157 | cat(indent(), "lhs <- parseInfixExpression(", format(token), ", ", format(node), ")\n", sep = "") 158 | node <- parseInfixExpression(token, node) 159 | } 160 | 161 | # Return our parse tree.
162 | node 163 | } 164 | 165 | list(parse = parseTopLevelExpression) 166 | } 167 | 168 | # Let's test it out! 169 | program <- "1+2*3*4+5" 170 | tokens <- tokenize(program) 171 | tokenizer <- Tokenizer(program) 172 | parser <- Parser(tokenizer) 173 | expr <- parser$parse() 174 | stopifnot(eval(parse(text = program)) == eval(expr)) 175 | 176 | ## Other materials: 177 | ## 178 | ## http://journal.stuffwithstuff.com/2011/03/19/pratt-parsers-expression-parsing-made-easy/ 179 | ## http://eli.thegreenplace.net/2010/01/02/top-down-operator-precedence-parsing 180 | ## 181 | -------------------------------------------------------------------------------- /sourcetools.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: knitr 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -I../inst/include 2 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = -I../inst/include 2 | -------------------------------------------------------------------------------- /src/NSE.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace sourcetools; 3 | using namespace sourcetools::r; 4 | 5 | extern "C" SEXP 
sourcetools_performs_nse(SEXP fnSEXP) 6 | { 7 | if (TYPEOF(fnSEXP) == VECSXP || TYPEOF(fnSEXP) == EXPRSXP) 8 | { 9 | Protect protect; 10 | index_type n = Rf_length(fnSEXP); 11 | SEXP resultSEXP = protect(Rf_allocVector(LGLSXP, n)); 12 | for (index_type i = 0; i < n; ++i) 13 | { 14 | SEXP elSEXP = VECTOR_ELT(fnSEXP, i); 15 | LOGICAL(resultSEXP)[i] = Rf_isFunction(elSEXP) 16 | ? nse::performsNonStandardEvaluation(elSEXP) 17 | : 0; 18 | } 19 | return resultSEXP; 20 | } 21 | 22 | bool result = Rf_isFunction(fnSEXP) 23 | ? nse::performsNonStandardEvaluation(fnSEXP) 24 | : false; 25 | 26 | return Rf_ScalarLogical(result); 27 | } 28 | -------------------------------------------------------------------------------- /src/Parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define R_NO_REMAP 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | 9 | void log(parser::ParseNode* pNode, int depth) 10 | { 11 | if (!pNode) 12 | return; 13 | 14 | for (int i = 0; i < depth; ++i) 15 | Rprintf(" "); 16 | 17 | Rprintf("%s\n", toString(pNode->token()).c_str()); 18 | 19 | using parser::ParseNode; 20 | const std::vector& children = pNode->children(); 21 | for (std::vector::const_iterator it = children.begin(); 22 | it != children.end(); 23 | ++it) 24 | { 25 | log(*it, depth + 1); 26 | } 27 | } 28 | 29 | namespace { 30 | 31 | class SEXPConverter 32 | { 33 | private: 34 | typedef parser::ParseNode ParseNode; 35 | 36 | static SEXP asKeywordSEXP(const tokens::Token& token) 37 | { 38 | using namespace tokens; 39 | 40 | switch (token.type()) 41 | { 42 | case KEYWORD_FALSE: return Rf_ScalarLogical(0); 43 | case KEYWORD_TRUE: return Rf_ScalarLogical(1); 44 | case KEYWORD_Inf: return Rf_ScalarReal(R_PosInf); 45 | case KEYWORD_NA: return Rf_ScalarLogical(NA_LOGICAL); 46 | case KEYWORD_NA_character_: return Rf_ScalarString(NA_STRING); 47 | // case KEYWORD_NA_complex_: return NA_COM 48 | case KEYWORD_NA_integer_: return 
Rf_ScalarInteger(NA_INTEGER); 49 | case KEYWORD_NA_real_: return Rf_ScalarReal(NA_REAL); 50 | case KEYWORD_NaN: return Rf_ScalarReal(R_NaN); 51 | case KEYWORD_NULL: return R_NilValue; 52 | default: return Rf_install(token.contents().c_str()); 53 | } 54 | } 55 | 56 | static SEXP asFunctionCallSEXP(const ParseNode* pNode) 57 | { 58 | using namespace tokens; 59 | 60 | const Token& token = pNode->token(); 61 | 62 | // Figure out the 'head' of this language object. 63 | // '[' and '[[' get these tokens as-is, while '(' 64 | // instead uses the name of the first child. 65 | SEXP langSEXP; 66 | if (token.isType(LBRACKET)) 67 | langSEXP = Rf_lang1(Rf_install("[")); 68 | else if (token.isType(LDBRACKET)) 69 | langSEXP = Rf_lang1(Rf_install("[[")); 70 | else 71 | langSEXP = Rf_lang1(R_NilValue); 72 | 73 | // Start appending the child nodes to our list. 74 | r::Protect protect; 75 | SEXP headSEXP = protect(langSEXP); 76 | for (std::vector::const_iterator it = pNode->children().begin(); 77 | it != pNode->children().end(); 78 | ++it) 79 | { 80 | const ParseNode* node = *it; 81 | const Token& token = node->token(); 82 | if (token.isType(EMPTY)) 83 | break; 84 | else if (token.isType(MISSING)) 85 | SETCDR(langSEXP, Rf_lang1(R_MissingArg)); 86 | 87 | else if (token.isType(tokens::OPERATOR_ASSIGN_LEFT_EQUALS)) 88 | { 89 | const ParseNode* lhs = node->children()[0]; 90 | const ParseNode* rhs = node->children()[1]; 91 | 92 | if (rhs->token().isType(MISSING)) 93 | SETCDR(langSEXP, Rf_lang1(R_MissingArg)); 94 | else 95 | SETCDR(langSEXP, Rf_lang1(asSEXP(rhs))); 96 | 97 | const Token& token = lhs->token(); 98 | SEXP nameSEXP = Rf_install(tokens::stringValue(token).c_str()); 99 | SET_TAG(CDR(langSEXP), nameSEXP); 100 | } 101 | else 102 | { 103 | SETCDR(langSEXP, Rf_lang1(asSEXP(node))); 104 | } 105 | 106 | langSEXP = CDR(langSEXP); 107 | } 108 | 109 | SEXP resultSEXP = CAR(headSEXP) == R_NilValue 110 | ? 
CDR(headSEXP) 111 | : headSEXP; 112 | 113 | // Convert strings to symbols at head position 114 | if (TYPEOF(CAR(resultSEXP)) == STRSXP) 115 | SETCAR(resultSEXP, Rf_install(CHAR(STRING_ELT(CAR(resultSEXP), 0)))); 116 | 117 | return resultSEXP; 118 | } 119 | 120 | static SEXP asFunctionArgumentListSEXP(const ParseNode* pNode) 121 | { 122 | index_type n = pNode->children().size(); 123 | if (n == 0) 124 | return R_NilValue; 125 | 126 | r::Protect protect; 127 | SEXP listSEXP = protect(Rf_allocList(n)); 128 | SEXP headSEXP = listSEXP; 129 | for (std::vector::const_iterator it = pNode->children().begin(); 130 | it != pNode->children().end(); 131 | ++it) 132 | { 133 | const ParseNode* pChild = *it; 134 | const tokens::Token& token = pChild->token(); 135 | 136 | if (tokens::isOperator(token)) 137 | { 138 | const ParseNode* pLhs = pChild->children()[0]; 139 | const ParseNode* pRhs = pChild->children()[1]; 140 | 141 | if (pLhs->token().isType(tokens::SYMBOL)) 142 | SET_TAG(headSEXP, Rf_install(tokens::stringValue(pLhs->token()).c_str())); 143 | SETCAR(headSEXP, asSEXP(pRhs)); 144 | } 145 | else if (token.isType(tokens::SYMBOL)) 146 | { 147 | SETCAR(headSEXP, R_MissingArg); 148 | SET_TAG(headSEXP, Rf_install(tokens::stringValue(token).c_str())); 149 | } 150 | 151 | headSEXP = CDR(headSEXP); 152 | } 153 | 154 | return listSEXP; 155 | } 156 | 157 | static SEXP asFunctionDeclSEXP(const ParseNode* pNode) 158 | { 159 | if (pNode->children().size() != 2) 160 | return R_NilValue; 161 | 162 | r::Protect protect; 163 | SEXP argsSEXP = protect(asFunctionArgumentListSEXP(pNode->children()[0])); 164 | SEXP bodySEXP = protect(asSEXP(pNode->children()[1])); 165 | SEXP fnSEXP = Rf_install("function"); 166 | SEXP resultSEXP = Rf_lang4(fnSEXP, argsSEXP, bodySEXP, R_NilValue); 167 | return resultSEXP; 168 | } 169 | 170 | static SEXP asNumericSEXP(const tokens::Token& token) 171 | { 172 | if (*(token.end() - 1) == 'L') 173 | return Rf_ScalarInteger(::atof(token.begin())); 174 | else 175 | 
return Rf_ScalarReal(::atof(token.begin())); 176 | } 177 | 178 | static bool isFunctionCall(const ParseNode* pNode) 179 | { 180 | const tokens::Token& token = pNode->token(); 181 | if (token.isType(tokens::LBRACKET) || token.isType(tokens::LDBRACKET)) 182 | return true; 183 | 184 | // Differentiate between '(a)' and 'a()'. 185 | if (token.isType(tokens::LPAREN)) 186 | return pNode->children().size() > 1; 187 | 188 | return false; 189 | } 190 | 191 | public: 192 | static SEXP asSEXP(const ParseNode* pNode) 193 | { 194 | using namespace tokens; 195 | 196 | if (!pNode) 197 | return R_NilValue; 198 | 199 | if (pNode->token().isType(tokens::ROOT)) 200 | { 201 | const std::vector& children = pNode->children(); 202 | index_type n = pNode->children().size(); 203 | r::Protect protect; 204 | SEXP exprSEXP = protect(Rf_allocVector(EXPRSXP, n)); 205 | for (index_type i = 0; i < n; ++i) 206 | SET_VECTOR_ELT(exprSEXP, i, asSEXP(children[i])); 207 | return exprSEXP; 208 | } 209 | 210 | // Handle function calls specially 211 | if (isFunctionCall(pNode)) 212 | return asFunctionCallSEXP(pNode); 213 | 214 | const tokens::Token& token = pNode->token(); 215 | if (token.isType(KEYWORD_FUNCTION)) 216 | return asFunctionDeclSEXP(pNode); 217 | 218 | SEXP elSEXP; 219 | r::Protect protect; 220 | if (token.isType(MISSING)) 221 | elSEXP = R_MissingArg; 222 | else if (token.isType(OPERATOR_EXPONENTATION_STARS)) 223 | elSEXP = Rf_install("^"); 224 | else if (token.isType(KEYWORD_BREAK)) 225 | elSEXP = Rf_lang1(Rf_install("break")); 226 | else if (token.isType(KEYWORD_NEXT)) 227 | elSEXP = Rf_lang1(Rf_install("next")); 228 | else if (isKeyword(token)) 229 | elSEXP = asKeywordSEXP(token); 230 | else if (isOperator(token) || isLeftBracket(token)) 231 | elSEXP = Rf_install(token.contents().c_str()); 232 | else if (isNumeric(token)) 233 | elSEXP = asNumericSEXP(token); 234 | else if (isSymbol(token)) 235 | elSEXP = Rf_install(tokens::stringValue(token).c_str()); 236 | else if (isString(token)) 237 | 
elSEXP = Rf_mkString(tokens::stringValue(token).c_str()); 238 | else 239 | elSEXP = Rf_mkString(token.contents().c_str()); 240 | 241 | if (pNode->children().empty()) 242 | return elSEXP; 243 | 244 | SEXP headSEXP = protect(Rf_lang1(protect(elSEXP))); 245 | SEXP listSEXP = headSEXP; 246 | for (std::vector::const_iterator it = pNode->children().begin(); 247 | it != pNode->children().end(); 248 | ++it) 249 | { 250 | const ParseNode* child = *it; 251 | if (!child->token().isType(EMPTY)) 252 | listSEXP = SETCDR(listSEXP, Rf_lang1(asSEXP(child))); 253 | } 254 | 255 | return headSEXP; 256 | } 257 | 258 | static SEXP asSEXP(const std::vector& expression) 259 | { 260 | index_type n = expression.size(); 261 | r::Protect protect; 262 | SEXP exprSEXP = protect(Rf_allocVector(EXPRSXP, n)); 263 | for (index_type i = 0; i < n; ++i) 264 | SET_VECTOR_ELT(exprSEXP, i, asSEXP(expression[i])); 265 | return exprSEXP; 266 | } 267 | 268 | }; 269 | 270 | void reportErrors(const std::vector& errors) 271 | { 272 | if (errors.empty()) 273 | return; 274 | 275 | std::stringstream ss; 276 | ss << "\n "; 277 | typedef std::vector::const_iterator Iterator; 278 | for (Iterator it = errors.begin(); 279 | it != errors.end(); 280 | ++it) 281 | { 282 | ss << "[" << it->start().row << ":" << it->start().column << "]: " 283 | << it->message() << std::endl << " "; 284 | } 285 | 286 | std::string message = ss.str(); 287 | Rf_warning("%s", message.c_str()); 288 | } 289 | 290 | } // anonymous namespace 291 | } // namespace sourcetools 292 | 293 | extern "C" SEXP sourcetools_parse_string(SEXP programSEXP) 294 | { 295 | using namespace sourcetools; 296 | using parser::ParseStatus; 297 | using parser::Parser; 298 | using parser::ParseNode; 299 | 300 | SEXP charSEXP = STRING_ELT(programSEXP, 0); 301 | Parser parser(CHAR(charSEXP), Rf_length(charSEXP)); 302 | 303 | ParseStatus status; 304 | scoped_ptr pRoot(parser.parse(&status)); 305 | 306 | sourcetools::reportErrors(status.getErrors()); 307 | 308 | return 
sourcetools::SEXPConverter::asSEXP(pRoot); 309 | } 310 | 311 | extern "C" SEXP sourcetools_diagnose_string(SEXP strSEXP) 312 | { 313 | using namespace sourcetools; 314 | using parser::Parser; 315 | using parser::ParseStatus; 316 | using parser::ParseNode; 317 | using r::Protect; 318 | 319 | SEXP charSEXP = STRING_ELT(strSEXP, 0); 320 | Parser parser(CHAR(charSEXP), Rf_length(charSEXP)); 321 | 322 | ParseStatus status; 323 | scoped_ptr pNode(parser.parse(&status)); 324 | 325 | using namespace diagnostics; 326 | scoped_ptr pDiagnostics(createDefaultDiagnosticsSet()); 327 | std::vector diagnostics = pDiagnostics->run(pNode); 328 | return r::create(diagnostics); 329 | } 330 | -------------------------------------------------------------------------------- /src/Reader.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 6 | #define R_NO_REMAP 7 | #include 8 | #include 9 | 10 | extern "C" SEXP sourcetools_read(SEXP absolutePathSEXP) 11 | { 12 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 13 | 14 | std::string contents; 15 | bool result = sourcetools::read(absolutePath, &contents); 16 | if (!result) 17 | { 18 | Rf_warning("Failed to read file"); 19 | return R_NilValue; 20 | } 21 | 22 | sourcetools::r::Protect protect; 23 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, 1)); 24 | SET_STRING_ELT(resultSEXP, 0, sourcetools::r::createChar(contents)); 25 | return resultSEXP; 26 | } 27 | 28 | extern "C" SEXP sourcetools_read_lines(SEXP absolutePathSEXP) 29 | { 30 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 31 | 32 | std::vector lines; 33 | bool result = sourcetools::read_lines(absolutePath, &lines); 34 | if (!result) 35 | { 36 | Rf_warning("Failed to read file"); 37 | return R_NilValue; 38 | } 39 | 40 | sourcetools::index_type n = lines.size(); 41 | sourcetools::r::Protect protect; 42 | SEXP resultSEXP = protect(Rf_allocVector(STRSXP, n)); 43 | for 
(sourcetools::index_type i = 0; i < n; ++i) 44 | { 45 | SEXP charSEXP = sourcetools::r::createChar(lines[i]); 46 | SET_STRING_ELT(resultSEXP, i, charSEXP); 47 | } 48 | return resultSEXP; 49 | } 50 | 51 | extern "C" SEXP sourcetools_read_bytes(SEXP absolutePathSEXP) 52 | { 53 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 54 | 55 | std::string contents; 56 | bool result = sourcetools::read(absolutePath, &contents); 57 | if (!result) 58 | { 59 | Rf_warning("Failed to read file"); 60 | return R_NilValue; 61 | } 62 | 63 | sourcetools::r::Protect protect; 64 | SEXP resultSEXP = protect(Rf_allocVector(RAWSXP, contents.size())); 65 | std::memcpy(RAW(resultSEXP), contents.c_str(), contents.size()); 66 | return resultSEXP; 67 | } 68 | 69 | extern "C" SEXP sourcetools_read_lines_bytes(SEXP absolutePathSEXP) 70 | { 71 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 72 | 73 | std::vector lines; 74 | bool result = sourcetools::read_lines(absolutePath, &lines); 75 | if (!result) 76 | { 77 | Rf_warning("Failed to read file"); 78 | return R_NilValue; 79 | } 80 | 81 | sourcetools::index_type n = lines.size(); 82 | sourcetools::r::Protect protect; 83 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, n)); 84 | for (sourcetools::index_type i = 0; i < n; ++i) 85 | { 86 | SEXP rawSEXP = Rf_allocVector(RAWSXP, lines[i].size()); 87 | std::memcpy(RAW(rawSEXP), lines[i].c_str(), lines[i].size()); 88 | SET_VECTOR_ELT(resultSEXP, i, rawSEXP); 89 | } 90 | return resultSEXP; 91 | } 92 | -------------------------------------------------------------------------------- /src/Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | #define R_NO_REMAP 4 | #include 5 | #include 6 | 7 | namespace sourcetools { 8 | namespace { 9 | 10 | void asDataFrame(SEXP listSEXP, int n) 11 | { 12 | r::Protect protect; 13 | SEXP classSEXP = protect(Rf_mkString("data.frame")); 14 | Rf_setAttrib(listSEXP, 
R_ClassSymbol, classSEXP); 15 | 16 | SEXP rownamesSEXP = protect(Rf_allocVector(INTSXP, 2)); 17 | INTEGER(rownamesSEXP)[0] = NA_INTEGER; 18 | INTEGER(rownamesSEXP)[1] = -n; 19 | Rf_setAttrib(listSEXP, R_RowNamesSymbol, rownamesSEXP); 20 | } 21 | 22 | SEXP asSEXP(const std::vector& tokens) 23 | { 24 | r::Protect protect; 25 | index_type n = tokens.size(); 26 | SEXP resultSEXP = protect(Rf_allocVector(VECSXP, 4)); 27 | 28 | // Set vector elements 29 | SEXP valueSEXP = protect(Rf_allocVector(STRSXP, n)); 30 | SET_VECTOR_ELT(resultSEXP, 0, valueSEXP); 31 | for (index_type i = 0; i < n; ++i) { 32 | const std::string& contents = tokens[i].contents(); 33 | SET_STRING_ELT(valueSEXP, i, r::createChar(contents)); 34 | } 35 | 36 | SEXP rowSEXP = protect(Rf_allocVector(INTSXP, n)); 37 | SET_VECTOR_ELT(resultSEXP, 1, rowSEXP); 38 | for (index_type i = 0; i < n; ++i) 39 | INTEGER(rowSEXP)[i] = tokens[i].row() + 1; 40 | 41 | SEXP columnSEXP = protect(Rf_allocVector(INTSXP, n)); 42 | SET_VECTOR_ELT(resultSEXP, 2, columnSEXP); 43 | for (index_type i = 0; i < n; ++i) 44 | INTEGER(columnSEXP)[i] = tokens[i].column() + 1; 45 | 46 | SEXP typeSEXP = protect(Rf_allocVector(STRSXP, n)); 47 | SET_VECTOR_ELT(resultSEXP, 3, typeSEXP); 48 | for (index_type i = 0; i < n; ++i) { 49 | const std::string& type = toString(tokens[i].type()); 50 | SET_STRING_ELT(typeSEXP, i, r::createChar(type)); 51 | } 52 | 53 | // Set names 54 | SEXP namesSEXP = protect(Rf_allocVector(STRSXP, 4)); 55 | 56 | SET_STRING_ELT(namesSEXP, 0, Rf_mkChar("value")); 57 | SET_STRING_ELT(namesSEXP, 1, Rf_mkChar("row")); 58 | SET_STRING_ELT(namesSEXP, 2, Rf_mkChar("column")); 59 | SET_STRING_ELT(namesSEXP, 3, Rf_mkChar("type")); 60 | 61 | Rf_setAttrib(resultSEXP, R_NamesSymbol, namesSEXP); 62 | 63 | asDataFrame(resultSEXP, n); 64 | 65 | return resultSEXP; 66 | } 67 | 68 | } // anonymous namespace 69 | } // namespace sourcetools 70 | 71 | extern "C" SEXP sourcetools_tokenize_file(SEXP absolutePathSEXP) 72 | { 73 | typedef 
sourcetools::tokens::Token Token; 74 | 75 | const char* absolutePath = CHAR(STRING_ELT(absolutePathSEXP, 0)); 76 | std::string contents; 77 | if (!sourcetools::read(absolutePath, &contents)) 78 | { 79 | Rf_warning("Failed to read file"); 80 | return R_NilValue; 81 | } 82 | 83 | if (contents.empty()) return R_NilValue; 84 | const std::vector& tokens = sourcetools::tokenize(contents); 85 | return sourcetools::asSEXP(tokens); 86 | } 87 | 88 | extern "C" SEXP sourcetools_tokenize_string(SEXP stringSEXP) 89 | { 90 | typedef sourcetools::tokens::Token Token; 91 | 92 | if (Rf_length(stringSEXP) == 0) 93 | return sourcetools::asSEXP(std::vector()); 94 | 95 | SEXP charSEXP = STRING_ELT(stringSEXP, 0); 96 | const std::vector& tokens = 97 | sourcetools::tokenize(CHAR(charSEXP), Rf_length(charSEXP)); 98 | return sourcetools::asSEXP(tokens); 99 | } 100 | -------------------------------------------------------------------------------- /src/ValidateSyntax.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | using namespace sourcetools; 3 | 4 | namespace { 5 | 6 | typedef sourcetools::validators::SyntaxError Error; 7 | struct RowSetter 8 | { 9 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 10 | { 11 | INTEGER(dataSEXP)[i] = error.row() + 1; 12 | } 13 | }; 14 | 15 | struct ColSetter 16 | { 17 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 18 | { 19 | INTEGER(dataSEXP)[i] = error.column() + 1; 20 | } 21 | }; 22 | 23 | struct ErrSetter 24 | { 25 | void operator()(SEXP dataSEXP, index_type i, const Error& error) 26 | { 27 | const std::string& msg = error.message(); 28 | SEXP charSEXP = sourcetools::r::createChar(msg); 29 | SET_STRING_ELT(dataSEXP, i, charSEXP); 30 | } 31 | }; 32 | 33 | } // anonymous namespace 34 | 35 | extern "C" SEXP sourcetools_validate_syntax(SEXP contentsSEXP) { 36 | using namespace sourcetools; 37 | using namespace sourcetools::tokens; 38 | using namespace 
sourcetools::validators; 39 | 40 | r::Protect protect; 41 | if (Rf_length(contentsSEXP) == 0) 42 | contentsSEXP = protect(Rf_mkString("")); 43 | 44 | const char* contents = CHAR(STRING_ELT(contentsSEXP, 0)); 45 | const std::vector& tokens = sourcetools::tokenize(contents); 46 | 47 | SyntaxValidator validator(tokens); 48 | const std::vector& errors = validator.errors(); 49 | index_type n = errors.size(); 50 | 51 | r::RObjectFactory factory; 52 | SEXP resultSEXP = factory.create(VECSXP, 3); 53 | SET_VECTOR_ELT(resultSEXP, 0, factory.create(INTSXP, errors, RowSetter())); 54 | SET_VECTOR_ELT(resultSEXP, 1, factory.create(INTSXP, errors, ColSetter())); 55 | SET_VECTOR_ELT(resultSEXP, 2, factory.create(STRSXP, errors, ErrSetter())); 56 | 57 | const char* names[] = {"row", "column", "error"}; 58 | r::util::setNames(resultSEXP, names, 3); 59 | r::util::listToDataFrame(resultSEXP, n); 60 | 61 | return resultSEXP; 62 | } 63 | -------------------------------------------------------------------------------- /src/sourcetools-init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | 6 | /* FIXME: 7 | Check these declarations against the C/Fortran source code. 
8 | */ 9 | 10 | /* .Call calls */ 11 | extern SEXP run_testthat_tests(); 12 | extern SEXP sourcetools_diagnose_string(SEXP); 13 | extern SEXP sourcetools_parse_string(SEXP); 14 | extern SEXP sourcetools_performs_nse(SEXP); 15 | extern SEXP sourcetools_read(SEXP); 16 | extern SEXP sourcetools_read_bytes(SEXP); 17 | extern SEXP sourcetools_read_lines(SEXP); 18 | extern SEXP sourcetools_read_lines_bytes(SEXP); 19 | extern SEXP sourcetools_tokenize_file(SEXP); 20 | extern SEXP sourcetools_tokenize_string(SEXP); 21 | extern SEXP sourcetools_validate_syntax(SEXP); 22 | 23 | static const R_CallMethodDef CallEntries[] = { 24 | {"run_testthat_tests", (DL_FUNC) &run_testthat_tests, 0}, 25 | {"sourcetools_diagnose_string", (DL_FUNC) &sourcetools_diagnose_string, 1}, 26 | {"sourcetools_parse_string", (DL_FUNC) &sourcetools_parse_string, 1}, 27 | {"sourcetools_performs_nse", (DL_FUNC) &sourcetools_performs_nse, 1}, 28 | {"sourcetools_read", (DL_FUNC) &sourcetools_read, 1}, 29 | {"sourcetools_read_bytes", (DL_FUNC) &sourcetools_read_bytes, 1}, 30 | {"sourcetools_read_lines", (DL_FUNC) &sourcetools_read_lines, 1}, 31 | {"sourcetools_read_lines_bytes", (DL_FUNC) &sourcetools_read_lines_bytes, 1}, 32 | {"sourcetools_tokenize_file", (DL_FUNC) &sourcetools_tokenize_file, 1}, 33 | {"sourcetools_tokenize_string", (DL_FUNC) &sourcetools_tokenize_string, 1}, 34 | {"sourcetools_validate_syntax", (DL_FUNC) &sourcetools_validate_syntax, 1}, 35 | {NULL, NULL, 0} 36 | }; 37 | 38 | void R_init_sourcetools(DllInfo *dll) 39 | { 40 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 41 | R_useDynamicSymbols(dll, FALSE); 42 | } 43 | -------------------------------------------------------------------------------- /src/test-Parser.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | using namespace sourcetools::parser; 6 | using namespace sourcetools::cursors; 7 | using namespace 
sourcetools::collections; 8 | 9 | typedef sourcetools::tokens::Token Token; 10 | 11 | context("Parser") { 12 | 13 | test_that("we can extract partial parse trees from code") 14 | { 15 | std::string code = "foo <- function(a = {1 + 2}) {}"; 16 | 17 | std::vector tokens = tokenize(code); 18 | Parser parser(code); 19 | 20 | ParseStatus status; 21 | scoped_ptr pRoot(parser.parse(&status)); 22 | 23 | TokenCursor cursor(tokens); 24 | expect_true(cursor.findFwd("=")); 25 | 26 | Position position = cursor.currentToken().position(); 27 | ParseNode* pTarget = status.getNodeAtPosition(position); 28 | expect_true((pTarget != NULL)); 29 | if (pTarget == NULL) 30 | return; 31 | 32 | const char* begin; 33 | const char* end; 34 | pTarget->bounds(&begin, &end); 35 | 36 | std::string contents(begin, end); 37 | expect_true(contents == "a = {1 + 2}"); 38 | 39 | expect_true(cursor.findFwd("{")); 40 | pTarget = status.getNodeAtPosition(cursor.position()); 41 | expect_true((pTarget != NULL)); 42 | expect_true((pTarget->token().contentsEqual("{"))); 43 | if (pTarget == NULL) 44 | return; 45 | 46 | pTarget->bounds(&begin, &end); 47 | contents = std::string(begin, end); 48 | expect_true(contents == "{1 + 2}"); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /src/test-Tokenizer.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | using namespace sourcetools::cursors; 6 | 7 | typedef sourcetools::tokens::Token Token; 8 | 9 | namespace { 10 | 11 | class OpenBracketLocator 12 | { 13 | public: 14 | inline bool operator()(TokenCursor* pCursor) const { 15 | 16 | if (pCursor->bwdToMatchingBracket()) 17 | return false; 18 | 19 | return tokens::isLeftBracket(pCursor->currentToken()); 20 | } 21 | }; 22 | 23 | } // anonymous namespace 24 | 25 | context("Tokenizer") { 26 | 27 | test_that("Complements are detected correctly") { 28 | 29 | using 
namespace sourcetools::tokens; 30 | 31 | expect_true(complement(LPAREN) == RPAREN); 32 | expect_true(complement(LBRACE) == RBRACE); 33 | expect_true(complement(LBRACKET) == RBRACKET); 34 | expect_true(complement(LDBRACKET) == RDBRACKET); 35 | 36 | expect_true(complement(RPAREN) == LPAREN); 37 | expect_true(complement(RBRACE) == LBRACE); 38 | expect_true(complement(RBRACKET) == LBRACKET); 39 | expect_true(complement(RDBRACKET) == LDBRACKET); 40 | 41 | expect_true(isComplement(LPAREN, RPAREN)); 42 | expect_true(isComplement(LBRACE, RBRACE)); 43 | expect_true(isComplement(LBRACKET, RBRACKET)); 44 | expect_true(isComplement(LDBRACKET, RDBRACKET)); 45 | 46 | expect_true(isComplement(RPAREN, LPAREN)); 47 | expect_true(isComplement(RBRACE, LBRACE)); 48 | expect_true(isComplement(RBRACKET, LBRACKET)); 49 | expect_true(isComplement(RDBRACKET, LDBRACKET)); 50 | } 51 | 52 | test_that("Keywords are detected correctly") { 53 | std::string code = "if for while break repeat"; 54 | const std::vector& tokens = sourcetools::tokenize(code); 55 | for (std::vector::const_iterator it = tokens.begin(); 56 | it != tokens.end(); 57 | ++it) 58 | { 59 | const Token& token = *it; 60 | if (isWhitespace(token)) 61 | continue; 62 | expect_true(isKeyword(token)); 63 | 64 | } 65 | } 66 | 67 | test_that("TokenCursor operations work as expected") { 68 | std::string code = "if (foo) { print(bar) } else {}"; 69 | const std::vector& tokens = sourcetools::tokenize(code); 70 | TokenCursor cursor(tokens); 71 | expect_true(cursor.currentToken().contentsEqual("if")); 72 | cursor.moveToNextSignificantToken(); 73 | expect_true(cursor.currentToken().contentsEqual("(")); 74 | expect_true(cursor.fwdToMatchingBracket()); 75 | expect_true(cursor.currentToken().contentsEqual(")")); 76 | } 77 | 78 | test_that("Move to position works as expected") { 79 | std::string code = "if (foo) { print(1) }"; 80 | const std::vector& tokens = sourcetools::tokenize(code); 81 | TokenCursor cursor(tokens); 82 | 83 | // move to 'if' 
84 | expect_true(cursor.moveToPosition(0, 0)); 85 | expect_true(cursor.isType(tokens::KEYWORD_IF)); 86 | 87 | // move to whitespace before print 88 | expect_true(cursor.moveToPosition(0, 10)); 89 | expect_true(cursor.currentToken().contentsEqual(" ")); 90 | 91 | // move to 'print' 92 | expect_true(cursor.moveToPosition(0, 11)); 93 | expect_true(cursor.currentToken().contentsEqual("print")); 94 | 95 | // move to 'print' but target in middle 96 | expect_true(cursor.moveToPosition(0, 12)); 97 | expect_true(cursor.currentToken().contentsEqual("print")); 98 | 99 | expect_true(cursor.moveToPosition(0, 13)); 100 | expect_true(cursor.currentToken().contentsEqual("print")); 101 | 102 | expect_true(cursor.moveToPosition(0, 14)); 103 | expect_true(cursor.currentToken().contentsEqual("print")); 104 | 105 | expect_true(cursor.moveToPosition(0, 15)); 106 | expect_true(cursor.currentToken().contentsEqual("print")); 107 | 108 | // move to '(' 109 | expect_true(cursor.moveToPosition(0, 16)); 110 | expect_true(cursor.currentToken().contentsEqual("(")); 111 | } 112 | 113 | test_that("find operations work") 114 | { 115 | std::string code = "(if (foo) { print(1) })"; 116 | const std::vector& tokens = sourcetools::tokenize(code); 117 | TokenCursor cursor(tokens); 118 | 119 | OpenBracketLocator locator; 120 | expect_true(cursor.moveToPosition(0, 13)); 121 | expect_true(cursor.currentToken().contentsEqual("print")); 122 | expect_true(cursor.findBwd(locator)); 123 | expect_true(cursor.currentToken().contentsEqual("{")); 124 | expect_true(cursor.fwdToMatchingBracket()); 125 | expect_true(cursor.currentToken().contentsEqual("}")); 126 | expect_true(cursor.findBwd(locator)); 127 | expect_true(cursor.currentToken().contentsEqual("(")); 128 | } 129 | } 130 | -------------------------------------------------------------------------------- /src/test-multibyte.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | #include 5 | 
#include 6 | 7 | namespace { 8 | 9 | std::string charType(wchar_t ch) 10 | { 11 | std::string result; 12 | if (std::iswcntrl(ch)) 13 | result += "cntrl,"; 14 | if (std::iswprint(ch)) 15 | result += "print,"; 16 | if (std::iswspace(ch)) 17 | result += "space,"; 18 | #ifdef SOURCETOOLS_COMPILER_CXX11 19 | if (std::iswblank(ch)) 20 | result += "blank,"; 21 | #endif 22 | if (std::iswgraph(ch)) 23 | result += "graph,"; 24 | if (std::iswpunct(ch)) 25 | result += "punct,"; 26 | if (std::iswalnum(ch)) 27 | result += "alnum,"; 28 | if (std::iswalpha(ch)) 29 | result += "alpha,"; 30 | if (std::iswupper(ch)) 31 | result += "upper,"; 32 | if (std::iswlower(ch)) 33 | result += "lower,"; 34 | if (std::iswdigit(ch)) 35 | result += "digit,"; 36 | if (std::iswxdigit(ch)) 37 | result += "xdigit,"; 38 | 39 | if (!result.empty()) 40 | result = result.substr(0, result.size() - 1); 41 | 42 | return result; 43 | } 44 | 45 | } // anonymous namespace 46 | 47 | extern "C" SEXP sourcetools_print_multibyte(SEXP dataSEXP) 48 | { 49 | const char* data = CHAR(STRING_ELT(dataSEXP, 0)); 50 | sourcetools::index_type size = Rf_length(STRING_ELT(dataSEXP, 0)); 51 | 52 | wchar_t ch; 53 | const char* it = data; 54 | while (true) 55 | { 56 | int length = std::mbtowc(&ch, it, MB_CUR_MAX); 57 | if (length == 0) 58 | break; 59 | 60 | if (length == -1) 61 | { 62 | Rf_warning("Invalid multibyte character at index %li\n", (long) (it - data)); 63 | ++it; 64 | continue; 65 | } 66 | 67 | std::string type = charType(ch); 68 | Rprintf("%5i: [%s,%i] '%lc'\n", (int) ch, type.c_str(), length, ch); 69 | 70 | it += length; 71 | } 72 | 73 | return R_NilValue; 74 | } 75 | 76 | extern "C" SEXP sourcetools_print_utf8(SEXP dataSEXP) 77 | { 78 | using namespace sourcetools; 79 | 80 | const char* data = CHAR(STRING_ELT(dataSEXP, 0)); 81 | utf8::iterator it(data); 82 | 83 | wchar_t ch = *it++; 84 | while (true) 85 | { 86 | wchar_t ch = *it++; 87 | if (ch == 0 || ch == -1) 88 | break; 89 | Rprintf("[%i]: %lc\n", (int) ch, ch); 
90 | } 91 | 92 | return R_NilValue; 93 | 94 | } 95 | -------------------------------------------------------------------------------- /src/test-r.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | 4 | using namespace sourcetools; 5 | 6 | class StringRecorder : public r::CallRecurser::Operation 7 | { 8 | public: 9 | 10 | virtual void apply(SEXP dataSEXP) 11 | { 12 | if (TYPEOF(dataSEXP) == STRSXP) 13 | strings_.insert(CHAR(STRING_ELT(dataSEXP, 0))); 14 | } 15 | 16 | const std::set& strings() const 17 | { 18 | return strings_; 19 | } 20 | 21 | private: 22 | std::set strings_; 23 | }; 24 | 25 | context("CallRecurser") 26 | { 27 | test_that("The R call recurser works") 28 | { 29 | SEXP fnSEXP = Rf_findFun(Rf_install("all.equal"), R_BaseNamespace); 30 | SEXP bodySEXP = r::util::functionBody(fnSEXP); 31 | 32 | scoped_ptr recorder(new StringRecorder); 33 | 34 | r::CallRecurser recurser(bodySEXP); 35 | recurser.add(recorder); 36 | recurser.run(); 37 | 38 | const std::set& discoveries = recorder->strings(); 39 | 40 | expect_true(discoveries.size() == 1); 41 | expect_true(discoveries.count("all.equal")); 42 | } 43 | 44 | test_that("Functions which perform non-standard evaluation are detected") 45 | { 46 | SEXP fnSEXP; 47 | fnSEXP = Rf_findFun(Rf_install("library"), R_BaseNamespace); 48 | expect_true(r::nse::performsNonStandardEvaluation(fnSEXP)); 49 | 50 | fnSEXP = Rf_findFun(Rf_install(".gtn"), R_BaseNamespace); 51 | expect_false(r::nse::performsNonStandardEvaluation(fnSEXP)); 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/test-runner.cpp: -------------------------------------------------------------------------------- 1 | #define TESTTHAT_TEST_RUNNER 2 | #include 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | if 
(require("testthat", quietly = TRUE)) {
  library(sourcetools)
  test_check("sourcetools")
}
-------------------------------------------------------------------------------- /tests/testthat/helper-aaa.R: --------------------------------------------------------------------------------
# Evaluate `expr` with GC torture enabled, restoring the previous state
# afterwards, and return the result. Used to shake out protection bugs in
# the C++ parser.
with_gctorture <- function(expr) {
  gctorture(TRUE)
  result <- expr
  gctorture(FALSE)
  result
}

# Compare the tokens produced for a string (or an already-tokenized data
# frame) against a character vector of expected token values, reporting a
# readable message on mismatch.
compare_tokens <- function(tokens, expected) {

  if (is.character(tokens))
    tokens <- tokenize_string(tokens)

  expect_true(
    nrow(tokens) == length(expected),
    "different number of tokens"
  )

  # FIX: the failure message previously had 'expected' and 'got' swapped,
  # printing the actual token as the expectation and vice versa.
  # Also: seq_len() instead of 1:nrow() to be safe when nrow(tokens) == 0.
  for (i in seq_len(nrow(tokens))) {
    expect_true(
      tokens$value[[i]] == expected[[i]],
      paste0("expected token '", expected[[i]], "'; got '", tokens$value[[i]], "'")
    )
  }

}

# Parse `R` with base R and `S` with sourcetools (under GC torture) and
# check that the two parse results agree structurally.
check_parse <- function(R, S = R) {
  lhs <- base::parse(text = R, keep.source = FALSE)
  rhs <- with_gctorture(parse_string(S))
  check_parse_impl(lhs, rhs)
}

# Recursive structural comparison of two parse results; stops with a
# descriptive error on the first type, length, or identity mismatch.
check_parse_impl <- function(lhs, rhs) {

  lhsType <- typeof(lhs)
  rhsType <- typeof(rhs)

  onError <- function(format, ...) {
    message <- c(
      sprintf(format, ...),
      sprintf("R: '%s'", deparse(lhs)),
      sprintf("S: '%s'", deparse(rhs))
    )
    stop(paste(message, collapse = "\n"), call.
= FALSE) 45 | } 46 | 47 | if (lhsType != rhsType) 48 | onError("TypeError: '%s' != '%s'", lhsType, rhsType) 49 | 50 | if (length(lhs) != length(rhs)) 51 | onError("LengthError: %s != %s", length(lhs), length(rhs)) 52 | 53 | if (is.call(lhs) || is.expression(lhs)) { 54 | lapply(seq_along(lhs), function(i) { 55 | check_parse_impl(lhs[[i]], rhs[[i]]) 56 | }) 57 | } 58 | 59 | if (!identical(lhs, rhs)) 60 | onError("IdenticalError: '%s' != '%s'", lhs, rhs) 61 | 62 | TRUE 63 | } 64 | 65 | expect_parse <- function(R, S = R) { 66 | testthat::expect_true(check_parse(R, S)) 67 | } 68 | 69 | -------------------------------------------------------------------------------- /tests/testthat/helper-utf8.R: -------------------------------------------------------------------------------- 1 | octal <- "\012" 2 | hex <- "\xE2\x99\xA5" 3 | utf8 <- "\u2665" 4 | -------------------------------------------------------------------------------- /tests/testthat/test-catch.R: -------------------------------------------------------------------------------- 1 | context("Catch") 2 | 3 | test_that("C++ tests pass", { 4 | expect_cpp_tests_pass("sourcetools") 5 | }) 6 | -------------------------------------------------------------------------------- /tests/testthat/test-diagnostics.R: -------------------------------------------------------------------------------- 1 | context("Diagnostics") 2 | 3 | expect_no_diagnostics <- function(string) { 4 | diagnostics <- diagnose_string(string) 5 | expect_true(length(diagnostics) == 0) 6 | if (interactive()) print(diagnostics) 7 | } 8 | 9 | expect_diagnostics <- function(string) { 10 | diagnostics <- diagnose_string(string) 11 | expect_true(length(diagnostics) > 0) 12 | if (interactive()) print(diagnostics) 13 | } 14 | 15 | test_that("missing symbols reported appropriately", { 16 | expect_diagnostics("foo <- function(apple) { print(Apple) }") 17 | }) 18 | 19 | test_that("unused computations are reported", { 20 | expect_diagnostics("foo <- function(apple) { 
apple < 1; print(TRUE) }") 21 | }) 22 | 23 | test_that("use of '=' in if statement is reported", { 24 | expect_diagnostics("if (foo = 1) { print(1) }") 25 | }) 26 | 27 | test_that("use of '&', '|' in 'if' is reported", { 28 | expect_diagnostics("if (1 & 2) print(1)") 29 | expect_diagnostics("if (1 | 2) print(1)") 30 | }) 31 | 32 | test_that("x == NULL is reported", { 33 | expect_diagnostics("status <- print(1) == NULL; print(status)") 34 | }) 35 | -------------------------------------------------------------------------------- /tests/testthat/test-parser.R: -------------------------------------------------------------------------------- 1 | context("Parser") 2 | 3 | test_that("precedence of '?' vs '=' correctly handled", { 4 | 5 | skip_if(getRversion() < "4.0.3") 6 | 7 | expect_parse("foo ? bar = baz") 8 | expect_parse("foo ? bar <- baz") 9 | 10 | }) 11 | 12 | test_that("parser handles simple control flow", { 13 | 14 | expect_parse("if (foo) bar + baz") 15 | expect_parse("while (1) 1 + 2") 16 | expect_parse("repeat 1 + 2") 17 | expect_parse("if (foo) bar else baz") 18 | expect_parse("if (foo) bar else if (baz) bat") 19 | expect_parse("for (i in 1:10) 1 + 10") 20 | 21 | }) 22 | 23 | test_that("parser handles compound expressions", { 24 | 25 | expect_parse("if (foo) while (bar) 1") 26 | expect_parse("if (foo) (1 + 2)") 27 | expect_parse("{1; 2; 3}") 28 | expect_parse("{1 + 2\n3 + 4\n5 + 6}") 29 | 30 | }) 31 | 32 | test_that("parser handles function calls", { 33 | expect_parse("foo <- bar(baz)[[1]]$bat") 34 | expect_parse("foo <- bar() + bam() * bat()") 35 | }) 36 | 37 | test_that("parser handles precedence", { 38 | expect_parse("a$b[[1]]$c") 39 | expect_parse("object <- unclass(object)[i]") 40 | }) 41 | 42 | test_that("parser handles numbers of various forms", { 43 | expect_parse(".15") 44 | expect_parse("15.") 45 | expect_parse("1.5") 46 | # expect_parse("1.5L") #TODO: R warns and parses as numeric 47 | expect_parse("15L") 48 | expect_parse("10E5") 49 | 
expect_parse("10E5L") 50 | }) 51 | 52 | test_that("parser handles function calls with no args", { 53 | # Did you know? 54 | # 55 | # > length(base::parse(text = "a[]")[[1]]) # [1] 3 56 | # > length(base::parse(text = "a[[]]")[[1]]) # [1] 3 57 | # 58 | # R inserts an empty 'R_MissingArg' argument 59 | # into the third spot. This is sensible, albeit 60 | # a bit surprising when you first see it. 61 | expect_parse("a()") 62 | expect_parse("a[]") 63 | expect_parse("a[[]]") 64 | }) 65 | 66 | test_that("parser recovers from missing commas", { 67 | expect_warning(expect_parse("a(1, 2, 3)", "a(1 2 3)")) 68 | expect_warning(expect_parse("function(a, b, c) 1", "function(a b c) 1")) 69 | }) 70 | 71 | test_that("parser handles missing arguments", { 72 | expect_parse("a(,)") 73 | expect_parse("a[,]") 74 | expect_parse("a[[,]]") 75 | 76 | expect_parse("a(1,)") 77 | expect_parse("a[1,]") 78 | expect_parse("a[[1,]]") 79 | 80 | expect_parse("a(,1)") 81 | expect_parse("a[,1]") 82 | expect_parse("a[[,1]]") 83 | 84 | expect_parse("a(x =, b =)") 85 | expect_parse("quote(expr =)") 86 | expect_parse("a(x = ,)") 87 | }) 88 | 89 | test_that("parser handles chained function calls", { 90 | expect_parse("a(b)(c)(d)(e)") 91 | expect_parse("a[b][c][d][e]") 92 | expect_parse("a[[b]][[c]][[d]][[e]]") 93 | }) 94 | 95 | test_that("parser handles newlines as statement delimiter", { 96 | expect_parse("a <- b\n+1") 97 | expect_parse("a <- 1\n(b)") 98 | expect_parse("a <- foo(1)\n(b)") 99 | expect_parse("(a <- foo(1)\n(b))") 100 | }) 101 | 102 | test_that("parser handles semi-colons as statement delimiter", { 103 | expect_parse("a <- 1; b <- 2; c <- 3") 104 | expect_parse("{a <- 1;}") 105 | expect_parse("{a <- 1;;; b}") 106 | }) 107 | 108 | test_that("parser handles various escapes in strings", { 109 | expect_parse("'a = \\u{A0}'") 110 | expect_parse("a <- ifelse(a, '\\u{A0}', '\\u{A1}')") 111 | }) 112 | 113 | test_that("parser normalizes string names in function calls", { 114 | 
expect_parse('"["(unclass(object), i)') 115 | expect_parse('"lol"(1, 2)') 116 | }) 117 | 118 | test_that("parser handles if-else", { 119 | 120 | expect_parse("if (foo) {\nbar\n} else if (baz) {\n}") 121 | 122 | }) 123 | 124 | test_that("parser handles various escapes in strings", { 125 | # TODO: when deparsing UTF-8 escapes, Windows just prints 126 | # the code point and so this test fails. E.g. 127 | # 128 | # > format("\u2665") 129 | # [1] "" 130 | skip_on_os("windows") 131 | 132 | contents <- read("helper-utf8.R") 133 | expect_parse(contents) 134 | }) 135 | 136 | test_that("parser handles multi-line strings", { 137 | 138 | expect_parse('"a\nb\nc" + 1') 139 | expect_parse('"a\nb\nc" * 1') 140 | 141 | }) 142 | -------------------------------------------------------------------------------- /tests/testthat/test-read.R: -------------------------------------------------------------------------------- 1 | context("Reader") 2 | 3 | files <- list.files( 4 | pattern = "[.]R$", 5 | full.names = TRUE, 6 | include.dirs = FALSE 7 | ) 8 | 9 | test_that("read_lines and readLines agree on output", { 10 | for (file in files) { 11 | r <- readLines(file, warn = FALSE, encoding = "UTF-8") 12 | s <- sourcetools::read_lines(file) 13 | expect_identical(r, s) 14 | } 15 | }) 16 | 17 | test_that("read and readChar agree on output", { 18 | for (file in files) { 19 | r <- readChar(file, file.info(file)$size, TRUE) 20 | Encoding(r) <- "UTF-8" 21 | s <- sourcetools::read(file) 22 | expect_identical(r, s) 23 | } 24 | }) 25 | 26 | test_that("read_bytes and readBin agree on output", { 27 | for (file in files) { 28 | r <- readBin(file, "raw", file.info(file)$size) 29 | s <- sourcetools::read_bytes(file) 30 | expect_identical(r, s) 31 | } 32 | }) 33 | 34 | test_that("read_lines can handle '\\r' line endings", { 35 | 36 | file <- tempfile() 37 | on.exit(unlink(file), add = TRUE) 38 | 39 | text <- "this\ris\rsome\rtext" 40 | writeLines(text, con = file, useBytes = TRUE) 41 | 42 | r <- 
readLines(file) 43 | s <- read_lines(file) 44 | expect_identical(r, s) 45 | 46 | }) 47 | 48 | test_that("read_lines can handle '\\r\\n' line endings", { 49 | 50 | file <- tempfile() 51 | on.exit(unlink(file), add = TRUE) 52 | 53 | text <- "this\r\nis\r\nsome\r\ntext\r" 54 | writeBin(charToRaw(text), file) 55 | 56 | r <- readLines(file) 57 | s <- read_lines(file) 58 | expect_identical(r, s) 59 | 60 | }) 61 | 62 | test_that("read_lines can handle mixed line endings", { 63 | 64 | file <- tempfile() 65 | on.exit(unlink(file), add = TRUE) 66 | 67 | text <- "this\ris\nsome\r\ntext\r" 68 | writeBin(charToRaw(text), file) 69 | 70 | r <- readLines(file) 71 | s <- read_lines(file) 72 | expect_identical(r, s) 73 | 74 | }) 75 | -------------------------------------------------------------------------------- /tests/testthat/test-tokenize.R: -------------------------------------------------------------------------------- 1 | context("Tokenizer") 2 | 3 | test_that("Operators are tokenized correctly", { 4 | 5 | operators <- c( 6 | "::", ":::", "$", "@", "[", "[[", "^", "-", "+", ":", 7 | "*", "/", "+", "-", "<", ">", "<=", ">=", "==", "!=", 8 | "!", "&", "&&", "|", "|>", "||", "~", "->", "->>", "<-", "<<-", 9 | "=", "?", "**", "%%", "%for%" 10 | ) 11 | 12 | tokenized <- tokenize_string(paste(operators, collapse = " ")) 13 | 14 | for (operator in operators) { 15 | tokens <- tokenize_string(operator) 16 | expect_true(nrow(tokens) == 1, paste("expected a single token ('", operator, "')")) 17 | } 18 | }) 19 | 20 | test_that("Numbers are tokenized correctly", { 21 | 22 | numbers <- c("1", "1.0", "0.1", ".1", "0.1E1", "1L", "1.0L", "1.5L", 23 | "1E1", "1E-1", "1E-1L", ".100E-105L", "0.", "100.", 24 | "1e+09", "1e+90", "1e-90", "1e-00000000000000009") 25 | 26 | for (number in numbers) { 27 | tokens <- tokenize_string(number) 28 | expect_true(nrow(tokens) == 1, paste("expected a single token ('", number, "')", sep = "")) 29 | token <- as.list(tokens[1, ]) 30 | expect_true(token$type == 
"number", paste("expected a number ('", token$type, "')", sep = "")) 31 | } 32 | 33 | }) 34 | 35 | test_that("The tokenizer accepts UTF-8 symbols", { 36 | expect_true(nrow(tokenize_string("鬼門")) == 1) 37 | }) 38 | 39 | test_that("The tokenizer works correctly", { 40 | 41 | # TODO: Should newlines be absorbed as part of the comment string? 42 | tokens <- tokenize_string("# A Comment\n") 43 | expected <- "# A Comment\n" 44 | compare_tokens(tokens, expected) 45 | 46 | tokens <- tokenize_string("a <- 1 + 2\n") 47 | compare_tokens( 48 | tokens, 49 | c("a", " ", "<-", " ", "1", " ", "+", " ", "2", "\n") 50 | ) 51 | 52 | compare_tokens( 53 | tokenize_string("a<-1"), 54 | c("a", "<-", "1") 55 | ) 56 | 57 | # NOTE: '-' sign tokenized separately from number 58 | compare_tokens( 59 | tokenize_string("a< -1"), 60 | c("a", "<", " ", "-", "1") 61 | ) 62 | 63 | compare_tokens("1.0E5L", "1.0E5L") 64 | compare_tokens(".1", ".1") 65 | compare_tokens("'\\''", "'\\''") 66 | compare_tokens(".a", ".a") 67 | compare_tokens("...", "...") 68 | compare_tokens(":=", ":=") 69 | compare_tokens("x ** 2", c("x", " ", "**", " ", "2")) 70 | 71 | }) 72 | 73 | test_that("`[[` and `[` are tokenized correctly", { 74 | 75 | compare_tokens("x[[1]]", c("x", "[[", "1", "]]")) 76 | 77 | # not really valid R code, but the tokenizer should still 78 | # get it right 79 | compare_tokens("[[[]]]", c("[[", "[", "]", "]]")) 80 | 81 | compare_tokens( 82 | "x[[a[b[[c[1]]]]]]", 83 | c("x", "[[", "a", "[", "b", "[[", "c", "[", "1", 84 | "]", "]]", "]", "]]") 85 | ) 86 | 87 | }) 88 | 89 | test_that("Failures during number tokenization is detected", { 90 | tokens <- tokenize_string("1.5E---") 91 | expect_true(tokens$type[[1]] == "invalid") 92 | }) 93 | 94 | test_that("invalid number e.g. 
1E1.5 tokenized as single entity", { 95 | tokens <- tokenize_string("1E1.5") 96 | expect_true(nrow(tokens) == 1) 97 | expect_true(tokens$type[[1]] == "invalid") 98 | }) 99 | 100 | test_that("keywords are tokenized as keywords", { 101 | 102 | keywords <- c("if", "else", "repeat", "while", "function", 103 | "for", "in", "next", "break", 104 | "TRUE", "FALSE", "NULL", "Inf", "NaN", "NA", 105 | "NA_integer_", "NA_real_", "NA_complex_", "NA_character_") 106 | 107 | tokens <- lapply(keywords, function(keyword) { 108 | tokenize_string(keyword)[1, ] 109 | }) 110 | 111 | types <- unlist(lapply(tokens, `[[`, "type")) 112 | expect_true(all(types == "keyword")) 113 | }) 114 | 115 | test_that("comments without a trailing newline are tokenized", { 116 | tokens <- tokenize_string("# abc") 117 | expect_identical(tokens$type, "comment") 118 | }) 119 | 120 | test_that("tokenization errors handled correctly", { 121 | # previously, these reported an error where a NUL 122 | # byte was accidentally included as part of the 123 | # token value 124 | tokenize_string("'abc") 125 | tokenize_string("\"abc") 126 | tokenize_string("%abc") 127 | expect_true(TRUE, "we didn't segfault") 128 | }) 129 | 130 | test_that("raw tokens are tokenized correctly", { 131 | 132 | prefixes <- c("r", "R") 133 | quotes <- c("'", '"') 134 | dashes <- c("", "-", "--", "---") 135 | lhs <- c("(", "{", "[") 136 | 137 | all <- expand.grid(prefixes, quotes, dashes, lhs, stringsAsFactors = FALSE) 138 | 139 | all$Var5 <- "" 140 | all$Var5[all$Var4 == "("] <- ")" 141 | all$Var5[all$Var4 == "{"] <- "}" 142 | all$Var5[all$Var4 == "["] <- "]" 143 | 144 | all$Var6 <- all$Var3 145 | all$Var7 <- all$Var2 146 | strings <- do.call(paste0, all) 147 | 148 | for (string in strings) { 149 | token <- tokenize_string(string) 150 | expect_true(nrow(token) == 1L) 151 | expect_true(token$type == "string") 152 | } 153 | 154 | }) 155 | -------------------------------------------------------------------------------- /tools/header-guards.R: 
-------------------------------------------------------------------------------- 1 | # convert a file path to a header guard name 2 | guarded_name <- function(path) { 3 | 4 | transformations <- list( 5 | dots = function(x) gsub(".", "_", x, fixed = TRUE), 6 | camel = function(x) gsub("(?