├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── LICENSE ├── NAMESPACE ├── NEWS ├── R ├── conn.R ├── decode.R └── encode.R ├── README.Rmd ├── README.md ├── gh ├── space.svg └── time.svg ├── inst ├── benchmarking.R ├── benchmarks.RData └── comparison.Rmd ├── man ├── msgConnection.Rd ├── packMsg.Rd └── unpackMsg.Rd ├── src ├── cwpack.c ├── cwpack.h ├── cwpack_defines.h ├── decode.c ├── decode.h ├── encode.c ├── encode.h ├── init.c ├── opts.h ├── utf8.c ├── utf8.h ├── vadr.c └── vadr.h ├── tests ├── testthat.R └── testthat │ ├── test-static.R │ └── test-streams.R └── vignettes ├── comparison.html └── comparison.html.asis /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^Meta$ 2 | ^doc$ 3 | ^.*\.Rproj$ 4 | ^\.Rproj\.user$ 5 | ^vignettes/comparison_cache$ 6 | ^vignettes/comparison_files$ 7 | ^README.* 8 | .git 9 | .travis.yml 10 | gh 11 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | Meta 2 | .Rproj.user 3 | .Rhistory 4 | .RData 5 | *.so 6 | *.o 7 | *.bc 8 | README.html 9 | src/symbols.rds 10 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | r: 4 | - oldrel 5 | - release 6 | - devel 7 | r_github_packages: 8 | - r-lib/covr 9 | 10 | after_success: 11 | - Rscript -e 'covr::codecov()' -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: msgpack 2 | Title: A Compact, High Speed Data Format 3 | Version: 1.0.1 4 | Authors@R: c(person("Peter", "Meilstrup", email="peter.meilstrup@gmail.com", role=c("aut", "cre", "cph")), 5 | person("Claes", "Wihlborg", role=c("ctr", "cph")), 6 | person("Björn", "Höhrmann", role=c("ctr", "cph"))) 7 | Description: A fast C-based encoder and streaming decoder for the 8 | 'messagepack' data format. 'Messagepack' is similar in structure 9 | to 'JSON' but uses a more compact binary encoding. Based on the 10 | 'CWPack' C library. 11 | Depends: 12 | R (>= 3.3.0) 13 | License: MIT + file LICENSE 14 | Encoding: UTF-8 15 | LazyData: true 16 | Suggests: 17 | testthat, R.rsp 18 | VignetteBuilder: R.rsp 19 | RoxygenNote: 6.1.0 20 | Roxygen: list(markdown=TRUE) 21 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2017-2018 2 | COPYRIGHT HOLDER: Peter Meilstrup 3 | 4 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | S3method(close,msgConnection) 4 | S3method(partial,msgConnection) 5 | S3method(prepack,data.frame) 6 | S3method(prepack,default) 7 | S3method(readMsg,msgConnection) 8 | S3method(readMsgs,msgConnection) 9 | S3method(seek,msgConnection) 10 | S3method(status,msgConnection) 11 | S3method(summary,msgConnection) 12 | S3method(writeMsg,connection) 13 | S3method(writeMsg,msgConnection) 14 | S3method(writeMsgs,connection) 15 | S3method(writeMsgs,msgConnection) 16 | export(msgConnection) 17 | export(packMsg) 18 | export(packMsgs) 19 | export(partial) 20 | export(prepack) 21 | export(readMsg) 22 | export(readMsgs) 23 | export(status) 24 | export(unpackMsg) 25 | export(unpackMsgs) 26 | export(writeMsg) 27 | export(writeMsgs) 28 | useDynLib(msgpack, .registration = TRUE) 29 | -------------------------------------------------------------------------------- /NEWS: -------------------------------------------------------------------------------- 1 | msgpack 1.0.1 2 | ========== 3 | 4 | Version 1.0.1 5 | 6 | * Updated an example which prevented `R CMD check` from passing on Windows. 7 | * Fixed a protection issue flagged by `rchk`. 8 | 9 | msgpack 1.0 10 | ========== 11 | 12 | Initial release. 13 | -------------------------------------------------------------------------------- /R/conn.R: -------------------------------------------------------------------------------- 1 | #' Read and write msgpack formatted messages over R connections. 2 | #' 3 | #' A `msgConnection` object decodes msgpack messages from an 4 | #' underlying R raw connection. 5 | #' 6 | #' @param con A [connection] object open in binary mode. 7 | #' @param max_size The largest partial message to store, in 8 | #' bytes. `NA` means do not enforce a limit. 9 | #' @param read_size How many bytes to read at a time. 10 | #' @param ... Unpacking options (see [unpackMsg]). 11 | #' @return `msgConnection()` returns an object of class 12 | #' `msgConnection`. 13 | #' 14 | #' @examples 15 | #' out <- rawConnection(raw(0), open="wb") 16 | #' apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row 17 | #' length(rawConnectionValue(out)) 18 | #' inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb")) 19 | #' close(out) 20 | #' readMsg(inn) 21 | #' readMsgs(inn, 3) 22 | #' length(readMsgs(inn)) 23 | #' close(inn) 24 | #' @export 25 | msgConnection <- function(con, read_size=2^16, max_size=NA, ...) { 26 | partial <- raw(0) 27 | status <- "ok" 28 | bread <- 0 29 | bwrite <- 0 30 | 31 | reader <- function(desired) { 32 | # ignore "desired" and just read non-blockingly. 33 | readRaw(con, read_size) 34 | } 35 | 36 | readMsgs <- function(n=NA, read_size = parent.env(environment())$read_size) { 37 | if (is.na(n)) n <- .Machine$integer.max 38 | msgs_bytes <- unpackMsgs(partial, n, max_size = max_size, reader = reader) 39 | partial <<- msgs_bytes$remaining 40 | status <<- msgs_bytes$status 41 | bread <<- bread + msgs_bytes$bytes_read 42 | msgs_bytes$msgs 43 | } 44 | 45 | doClose <- function(...) { 46 | close(con, ...) 47 | } 48 | 49 | #msgConnection object is just the orig object with this 50 | #environment dangled off it. 51 | structure(addClass(con, "msgConnection"), reader = environment()) 52 | } 53 | 54 | #' @export 55 | summary.msgConnection <- function(object, ...) { 56 | s <- NextMethod("summary") 57 | c(s, list(status = status(object))) 58 | } 59 | 60 | 61 | #' @rdname msgConnection 62 | #' @export 63 | close.msgConnection <- function(con, ...) { 64 | attr(con, "reader")$doClose(...) 65 | } 66 | 67 | catenator <- function(val=c()) { 68 | # An in-memory FIFO type object. 69 | #tracemem(val) 70 | start <- 0 71 | end <- length(val) 72 | function(x, action="store", ..., opts) { 73 | 74 | switch(action, 75 | store = { 76 | lx <- length(x) 77 | l <- length(val) 78 | if (lx > 0) { 79 | #check for overflow 80 | if (end + lx > l && start > 0) { 81 | # rotate back to start 82 | if (start > 0 && end != start) { 83 | val[1:(end-start)] <- val[(start+1):end] 84 | } 85 | end <<- end - start 86 | start <<- 0 87 | } 88 | if (end + lx > l) { 89 | # double array length 90 | length(val) <<- max(end + lx, 2 * l); 91 | } 92 | 93 | #inject new values 94 | val[ (end + 1):(end + lx) ] <<- x 95 | end <<- end + lx 96 | } 97 | dbg("lx", lx, "start", start, "end", end, "\n") 98 | x 99 | }, 100 | 101 | read = { 102 | if (end > start) { 103 | val[(start+1):end] 104 | } else val[c()] 105 | }, 106 | 107 | buf = { 108 | val 109 | }, 110 | 111 | start = start, 112 | 113 | length = end - start, 114 | 115 | end = end, 116 | 117 | contents = { 118 | list(val, start, end) 119 | }, 120 | 121 | reset = { 122 | val <<- x 123 | start <<- 0 124 | end <<- length(x) 125 | }, 126 | 127 | drop = { 128 | if (x <= end - start && x >= 0) { 129 | start <<- start + x 130 | } else { 131 | stop("have ", end - start, ", can't drop ", x) 132 | } 133 | }) 134 | } 135 | } 136 | 137 | lister <- function(val = list()) { 138 | n <- length(val) 139 | function(x, action="store") { 140 | switch(action, 141 | store = { 142 | if (n > length(val)) 143 | length(val) <<- max(1, 2 * length(val)) 144 | n <<- n + 1 145 | val[[n]] <<- x 146 | }, 147 | read = { 148 | length(val) <<- n 149 | val 150 | }, 151 | length = { 152 | n 153 | }, 154 | clear = { 155 | n <<- 0 156 | length(val) <<- 0 157 | } 158 | ) 159 | } 160 | } 161 | 162 | addClass <- function(x, classes) structure(x, class = c(classes, class(x))) 163 | 164 | #' @return `partial(con)` returns any data that has been read ahead of 165 | #' the last decoded message. 166 | #' @rdname msgConnection 167 | #' @export 168 | partial <- function(con) UseMethod("partial") 169 | 170 | #' @rdname msgConnection 171 | #' @export 172 | partial.msgConnection <- function(con) { 173 | attr(con, "reader")$partial 174 | } 175 | 176 | #' @rdname msgConnection 177 | #' @export 178 | #' @param n The maximum number of messages to read. A value of NA 179 | #' means to parse all available messages until end of input. 180 | #' @return `readMsgs(con, n)` returns a list of up to `n` decoded messages. 181 | readMsgs <- function(con, n = NA, ...) { 182 | UseMethod("readMsgs") 183 | } 184 | 185 | #' @export 186 | readMsgs.msgConnection <- function(con, n = NA, ...) { 187 | attr(con, "reader")$readMsgs(n, ...) 188 | } 189 | 190 | #' @rdname msgConnection 191 | #' @return `status(con)` returns the status of msgpack decoding on the 192 | #' connection. A value of `"ok"` indicates all requested messages 193 | #' were read, `"buffer underflow"` for a non-blocking connection 194 | #' indicates that only part of a message has been received, and 195 | #' `"end of input"` means the last available message has been read. 196 | #' Other values indicate errors encountered in decoding, which will 197 | #' effectively halt reading. 198 | #' @export 199 | status <- function(con) UseMethod("status") 200 | 201 | #' @rdname msgConnection 202 | #' @export 203 | status.msgConnection <- function(con) { 204 | attr(con, "reader")$status 205 | } 206 | 207 | 208 | #' @rdname msgConnection 209 | #' @return `seek(con)` returns the number of bytes that have been 210 | #' successfully read or written, depending on the mode of the 211 | #' connection. (Repositioning is not supported.) 212 | #' @param rw See [seek()]. 213 | #' @export 214 | seek.msgConnection <- function(con, rw = summary(con)$mode, ...) { 215 | rw <- pmatch(rw, c("read", "write"), 0L) 216 | switch(rw, 217 | attr(con, "reader")$bread, 218 | attr(con, "reader")$bwrite, 219 | ) 220 | } 221 | 222 | #' `readMsg(con)` reads exactly one message from a 223 | #' msgConnection, or throws an error. 224 | #' 225 | #' @rdname msgConnection 226 | #' @return `readMsg(con)` returns one decoded message. 227 | #' @export 228 | readMsg <- function(con, ...) { 229 | UseMethod("readMsg", con) 230 | } 231 | 232 | #' @export 233 | readMsg.msgConnection <- function(con, ...) { 234 | x <- readMsgs(con, 1, ...) 235 | if (length(x) < 1) { 236 | stop(status(con)) 237 | } 238 | x[[1]] 239 | } 240 | 241 | #' `writeMsg(x, con)` writes a single message to a msgConnection. 242 | #' 243 | #' @rdname msgConnection 244 | #' @param obj An R object. 245 | #' @export 246 | writeMsg <- function(obj, con, ...) { 247 | UseMethod("writeMsg", con) 248 | } 249 | 250 | #' @export 251 | writeMsg.connection <- function(obj, con, ...) { 252 | writeMsgs(list(obj), con, ...) 253 | } 254 | 255 | #' @export 256 | writeMsg.msgConnection <- function(obj, con, ...) { 257 | writeMsgs(list(obj), con, ...) 258 | } 259 | 260 | #' `writeMsgs(l, conn)` writes a list of 261 | #' messages to a connection. That is, `writeMsg(1:10, conn)` writes one 262 | #' message containing an array, while `writeMsgs(1:10, conn)` writes 263 | #' ten consecutive messages each containing one integer. 264 | #' 265 | #' `writeMsg` will work with any R connection in raw mode, but reading 266 | #' requires a msgConnection object. 267 | #' 268 | #' Because msgpack messages have unpredictable length, the decoder 269 | #' reads ahead in chunks, then finds the boundaries between messages. 270 | #' Therefore when reading over a socket or a fifo it is best to use a 271 | #' nonblocking connection, and it will not work to mix readMsg and 272 | #' readBin on the same connection. 273 | #' 274 | #' If you are reading data from a not completely trusted source you 275 | #' should specify options `max_size` and `max_depth` (see 276 | #' [unpackOpts]). Without it, some deeply nested or cleverly designed 277 | #' messages can cause a stack overflow or out-of-memory error. With 278 | #' these options set, you will get an R exception instead. 279 | 280 | #' @rdname msgConnection 281 | #' @param objs A list of R objects. 282 | #' @export 283 | writeMsgs <- function(objs, con, ...) { 284 | UseMethod("writeMsgs", con) 285 | } 286 | 287 | #' @export 288 | writeMsgs.connection <- function(objs, con, ...) { 289 | writeRaw(packMsgs(objs, ...), con) 290 | } 291 | 292 | #' @export 293 | writeMsgs.msgConnection <- function(objs, con, ...) { 294 | buf <- packMsgs(objs, ...) 295 | result <- writeRaw(buf, con) 296 | attr(con, "reader")$bwrite <- attr(con, "reader")$bwrite + length(buf) 297 | invisible(result) 298 | } 299 | 300 | ## To support test harness, we use "readRaw" and "writeRaw" 301 | ## internally, instead of "readBin" which is not an S3 method 302 | readRaw <- function(con, n, ...) { 303 | UseMethod("readRaw") 304 | } 305 | 306 | writeRaw <- function(object, con, ...) { 307 | UseMethod("writeRaw", con) 308 | } 309 | 310 | readRaw.connection <- function(con, n) { 311 | # I get errors thrown (sometimes) when reading at the end of 312 | # a nonblocking fifo. 313 | tryCatch({ 314 | readBin(con, 'raw', n) 315 | }, 316 | error = function(e) { 317 | # warning("Ignoring ", e) 318 | raw(0) 319 | }) 320 | } 321 | 322 | writeRaw.connection <- function(object, con, ...) { 323 | writeBin(object, con, ...) 324 | } 325 | 326 | 327 | ## An inefficient double ended byte buffer for test harness purposes 328 | rawBuffer <- function(object = raw(0)) { 329 | open <- "r" 330 | bytes <- length(object) 331 | buf <- rawConnection(object, open = "r") 332 | object <- NULL 333 | 334 | write <- function(object) { 335 | switch(open, 336 | "w" = { 337 | writeBin(object, buf) 338 | }, 339 | "r" = { 340 | data <- readBin(buf, 'raw', bytes - seek(buf)) 341 | close(buf) 342 | buf <<- rawConnection(data, 343 | open = "w") 344 | open <<- "w" 345 | writeBin(data, buf) 346 | write(object) 347 | } 348 | ) 349 | } 350 | 351 | read <- function(n) { 352 | switch(open, 353 | "r" = { 354 | readBin(buf, 'raw', n) 355 | }, 356 | "w" = { 357 | ##convert a write buffer into a read buffer 358 | val <- rawConnectionValue(buf) 359 | close(buf) 360 | buf <<- rawConnection(val, open = "r") 361 | bytes <<- length(val) 362 | open <<- "r" 363 | read(n) 364 | } 365 | ) 366 | } 367 | 368 | doClose <- function(n) { 369 | close(buf) 370 | buf <- NULL 371 | } 372 | 373 | structure(list(environment()), class = "rawBuffer") 374 | } 375 | 376 | writeRaw.rawBuffer <- function(object, con) { 377 | con[[1]]$write(object) 378 | } 379 | 380 | readRaw.rawBuffer <- function(con, n) { 381 | con[[1]]$read(n) 382 | } 383 | 384 | close.rawBuffer <- function(con) { 385 | con[[1]]$doClose() 386 | } 387 | -------------------------------------------------------------------------------- /R/decode.R: -------------------------------------------------------------------------------- 1 | #' Decode msgpack messages. 2 | #' 3 | #' `unpackMsg` converts a raw array containing one message in msgpack 4 | #' format into the corresponding R data structure. 5 | #' 6 | #' @param x A [raw()] object, perhaps read from a file or socket. 7 | #' @param ... Options passed to [unpackOpts]. 8 | #' @return `unpackMsg(x)` returns one decoded message (which might be 9 | #' shorter than the input raw), or throws an error. 10 | #' @useDynLib msgpack, .registration = TRUE 11 | #' @examples 12 | #' msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3, 13 | #' 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00)) 14 | #' unpackMsg(msg) 15 | #' @export 16 | unpackMsg <- function(x, ...) { 17 | .Call("_unpack_msg", x, unpackOpts(...)) 18 | } 19 | 20 | dbg <- alist 21 | #dbg <- cat 22 | 23 | #' `unpackMsgs` extracts a number of msgpack messages from a raw object. 24 | #' 25 | #' @param n How many messages to read. An "NA" here means to read as 26 | #' much as possible. 27 | #' @param reader For implementing connections; a function that takes 28 | #' no arguments and returns a raw containing more data. 29 | #' @return `unpackMsgs(r, n)` returns a list `X` with four elements: 30 | #' * `X$msgs` is a list of the messages unpacked. 31 | #' * `X$remaining` is data remaining to be parsed. 32 | #' * `X$status` is a status message, typically "ok", "end of input", 33 | #' or "buffer underflow". 34 | #' * `x$bytes_read` the number of bytes consumed. 35 | #' 36 | #' @examples 37 | #' x <- packMsgs(list("one", "two", "three")) 38 | #' unpackMsgs(x, 2) 39 | #' @rdname unpackMsg 40 | #' @export 41 | unpackMsgs <- function(x, n = NA, reader = NULL, ...) { 42 | if (is.na(n)) { 43 | n <- .Machine$integer.max 44 | } 45 | 46 | bread <- 0 # bytes read 47 | saveMessage <- lister() # messages read so far 48 | saveData <- catenator(x) # bytes pending 49 | offset <- 0 # position within buffer 50 | 51 | status <- "ok" 52 | 53 | readMore <- function(current, desired) { 54 | dbg("readMore:", "current =", current, "desired =", desired, "\n") 55 | start <- saveData(action="start") 56 | saveData(reader(desired)) 57 | new_start <- saveData(action="start") 58 | current <- new_start - start + current 59 | result <- c(saveData(action="contents"), current) 60 | # cat("after read: "); print(saveData(action="contents")) 61 | result 62 | } 63 | 64 | opts = unpackOpts(..., 65 | underflow_handler = if (is.null(reader)) NULL else readMore) 66 | 67 | # cat("buffer: "); print(saveData(action="contents")) 68 | tryCatch( 69 | while(saveMessage(action="length") < n) { 70 | last_start <- saveData(action="start") 71 | result <- .Call("_unpack_msg_partial", 72 | saveData(action="buf"), 73 | offset, 74 | saveData(action="end"), 75 | opts) 76 | ## result is a pairlist( message, status, new_offset ) 77 | # cat("unpack result: "); print(result) 78 | status <- result[[1]] 79 | if (status == "ok") { 80 | # got a good message, 81 | saveMessage(result[[2]]) 82 | message_length <- ((result[[3]] - offset) + 83 | (saveData(action="start") - last_start)) 84 | bread <- bread + message_length 85 | saveData(message_length, action="drop") 86 | offset <- result[[3]] 87 | # dbg("after 1 msg, buffer: "); print(saveData(action="contents")) 88 | } else { 89 | stop(status) 90 | } 91 | }, 92 | error = function(e) { 93 | dbg("Stopping with exception: ", e$message, "in", deparse(e$call), "\n") 94 | status <<- e$message 95 | } 96 | ) 97 | # dbg("after failure, buffer: "); print(saveData(action="contents")) 98 | 99 | list(msgs = saveMessage(action="read"), 100 | remaining = saveData(action="read"), 101 | status = status, 102 | bytes_read = bread) 103 | } 104 | 105 | #' [unpackOpts()] interprets is passed to `...` in [unpackMsgs()], 106 | #' [unpackMsg()], and [msgConnection()]. It is not exported. 107 | #' 108 | #' @param parent When an environment is given, (such as [emptyenv()]), 109 | #' unpack msgpack dicts into environment objects, with the given 110 | #' value as parent. This option overrides `use_df=TRUE`. Otherwise, 111 | #' unpack dicts into named vectors / lists. 112 | #' @param df When `TRUE`, convert msgpack dicts, whose elements are 113 | #' all arrays having the same length, into [data.frame()]s. 114 | #' @param simplify If `TRUE`, simplify msgpack lists into primitive 115 | #' vectors. 116 | #' @param max_size The maximum length of message to decode. 117 | #' @param max_depth The maximum degree of nesting to support. 118 | #' @param underflow_handler Used internally. 119 | #' 120 | #' @details 121 | #' The msgpack format does not have typed arrays, so all msgpack 122 | #' arrays are effectively lists from the R perspective. However, if an 123 | #' array containing compatibly typed elements is read, `unpack` will 124 | #' return a logical, integer, real or string vector as 125 | #' appropriate. This behavior is disabled with `simplify=FALSE`. The 126 | #' coercion used is more conservative than R's coercion: Integer 127 | #' values may be converted to real, but boolean values will not be 128 | #' cast to numeric, nor any types to string. If conversion from a 129 | #' large integer to real loses precision, a warning is printed. 130 | #' 131 | #' Msgpack also does not distinguish between `NA` and `NULL`. All nils 132 | #' will be decoded as NA. 133 | #' 134 | #' Strings are assumed to be UTF-8 encoded. If a msgpack string does 135 | #' not appear to be valid UTF-8, a warning is printed and a raw object 136 | #' is produced instead. 137 | #' 138 | #' Msgpack allows any type to be the key of a dict, but R only 139 | #' supports strings. If a non-string appears as key in a msgpack dict, 140 | #' it will be converted to string with [deparse()]. 141 | #' 142 | #' Extension types will be decoded as raw objects with a class like 143 | #' `"ext120"` and a warning. 144 | #' 145 | #' @rdname unpackMsg 146 | unpackOpts <- function(parent = NULL, 147 | df = TRUE, 148 | simplify = TRUE, 149 | max_size = NA, 150 | max_depth = NA, 151 | underflow_handler = NULL) { 152 | .Call("_unpack_opts", 153 | parent, 154 | df, 155 | simplify, 156 | parent.env(environment()), 157 | max_size, 158 | max_depth, 159 | underflow_handler); 160 | } 161 | 162 | # Come up with a name when a non-string is used as key. 163 | repr <- function(x) { 164 | paste0(deparse(x, control = c("keepNA")), collapse="") 165 | } 166 | -------------------------------------------------------------------------------- /R/encode.R: -------------------------------------------------------------------------------- 1 | #' Convert R objects to msgpack format. 2 | #' 3 | #' @param x An R object, which can be null, a vector, list, 4 | #' environment, raw, or any combinations thereof. 5 | #' @param ... Options passed to [packOpts()] 6 | #' @return An object of class "raw". 7 | #' @details 8 | #' Strings are re-encoded to UTF-8 if necessary. Real numbers 9 | #' taking integral values may be emitted as integers to save space. 10 | #' 11 | #' Normally an R vector of length 1 will be unboxed, e.g. packMsg(1) 12 | #' will make a msgpack integer, but packMsg(c(1,2)) will make a 13 | #' msgpack list. To prevent this and produce a list of length 1 in the 14 | #' first case, specify `as_is = TRUE`. Objects of class `AsIs` or 15 | #' `data.frame` will always be encoded as-is. 16 | #' 17 | #' A hook for pre-processing R objects before packing is supported, by 18 | #' giving the object an S3 [class] and implementing a method 19 | #' `prepack`. For instance, `prepack.data.frame(x)` simply adds the 20 | #' `"AsIs"` class to `x`. 21 | #' 22 | #' Environment objects are written out with the keys in sorted order, 23 | #' but named vectors are written in the order which the entries 24 | #' appear. 25 | #' 26 | #' Object attributes other than `names` and `class` are ignored. 27 | #' 28 | #' @examples 29 | #' packMsg( list(compact=TRUE, schema=0) ) 30 | #' @export 31 | packMsg <- function(x, ...) { 32 | .Call("_pack_msg", x, packOpts(...)) 33 | } 34 | 35 | #' @param xs a list of objects to pack. 36 | #' @rdname packMsg 37 | #' @examples 38 | #' x <- packMsgs(list("one", "two", "three")) 39 | #' unpackMsgs(x, 2) 40 | #' @export 41 | packMsgs <- function(xs, ...) { 42 | opts <- packOpts(...) 43 | unlist(lapply(xs, function(xx) .Call("_pack_msg", xx, opts))) 44 | } 45 | 46 | #' [packOpts()] interprets the `...` argument of packMsg and 47 | #' packMsgs. it is not exported. 48 | #' 49 | #' @param compatible If TRUE, emitted bytes conform to version 1.0 of 50 | #' msgpack encoding. This means that msgpack strings are used for 51 | #' raw objects. 52 | #' @param as_is If TRUE, singletons (R primitive vectors of length 1 53 | #' having no names attribute) are encoded as msgpack arrays of 54 | #' length 1. Otherwise singletons are simplified to msgpack scalars. 55 | #' @param use_dict If TRUE, vectors having a "names" attribute are 56 | #' encoded as dicts. If false, they are encoded as arrays and the 57 | #' names are discarded. 58 | #' @param max_size The largest buffer that will be allocated. 59 | #' @param buf_size The initial amount of memory, in bytes, to allocate 60 | #' for packing each message. This will be dynamically grown if a 61 | #' larger message is passed, so there is little reason to change 62 | #' this. 63 | #' @rdname packMsg 64 | packOpts <- function(compatible = FALSE, 65 | as_is = FALSE, 66 | use_dict = TRUE, 67 | max_size = NA, 68 | buf_size = 512) { 69 | .Call("_pack_opts", 70 | compatible, 71 | as_is, 72 | use_dict, 73 | max_size, 74 | buf_size, 75 | parent.env(environment())) 76 | } 77 | 78 | #' @rdname packMsg 79 | #' @export 80 | prepack <- function(x) UseMethod("prepack") 81 | 82 | #' @rdname packMsg 83 | #' @export 84 | prepack.default <- function(x) unclass(x) 85 | 86 | #' @rdname packMsg 87 | #' @export 88 | prepack.data.frame <- function(x) I(unclass(x)) 89 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "msgpack for R" 3 | output: github_document 4 | author: Peter Meilstrup 5 | --- 6 | 7 | [![CRAN version badge](http://www.r-pkg.org/badges/version/msgpack)](https://cran.r-project.org/package=msgpack) 8 | [![Travis build status](http://travis-ci.org/crowding/msgpack-r.svg?branch=master)](https://travis-ci.org/crowding/msgpack-r) 9 | [![Code coverage](https://codecov.io/gh/crowding/msgpack-r/branch/master/graph/badge.svg)](https://codecov.io/gh/crowding/msgpack-r) 10 | 11 | 12 | This is a high speed [msgpack](https://msgpack.org) encoder 13 | and decoder for R, based on the [CWPack](https://github.com/clwi/CWPack) C 14 | implementation. 15 | 16 | `msgpack` is a binary data format with data structures similar to 17 | `JSON` and a compact binary encoding. It can be a drop-in replacement 18 | for `JSON` in most applications. It is designed to be fast to parse 19 | and compact to transmit and store. 20 | 21 | ```{R echo=FALSE} 22 | knitr::opts_chunk$set(collapse=TRUE) 23 | ``` 24 | 25 | ## Installation 26 | 27 | From CRAN: 28 | 29 | ```{R eval=FALSE} 30 | install.packages("msgpack") 31 | ``` 32 | 33 | From Github: 34 | 35 | ```{R eval=FALSE} 36 | library(devtools) 37 | install_github("crowding/msgpack-r") 38 | ``` 39 | 40 | ## Usage 41 | 42 | ```{R} 43 | library(msgpack) 44 | x <- packMsg( list(compact=TRUE, schema=0) ) 45 | x 46 | dput(unpackMsg( x )) 47 | ``` 48 | 49 | ### Connections / Streaming 50 | 51 | Write messages one or several at a time: 52 | 53 | ```{r} 54 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc 55 | writeMsg("one", conOut) 56 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut) 57 | ``` 58 | 59 | Use a `msgConnection` object to read messages one or several at a time: 60 | 61 | ```{r} 62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r")) 63 | dput(readMsgs(conIn, 2)) 64 | dput(readMsg(conIn)) 65 | dput(readMsgs(conIn)) 66 | ``` 67 | 68 | ### Performance 69 | 70 | Msgpack is fast and compact. See the [benchmarking vignette](inst/doc/comparison.html). 71 | 72 | ![Plot of time taken to transmit dataset, vs size of dataset, for each encoder under four conditions.](gh/space.svg) 73 | 74 | ![Comparison of space used by each encoder to encode a test dataset.](gh/time.svg) 75 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | msgpack for R 2 | ================ 3 | Peter Meilstrup 4 | 5 | [![CRAN version 6 | badge](http://www.r-pkg.org/badges/version/msgpack)](https://cran.r-project.org/package=msgpack) 7 | [![Travis build 8 | status](http://travis-ci.org/crowding/msgpack-r.svg?branch=master)](https://travis-ci.org/crowding/msgpack-r) 9 | [![Code 10 | coverage](https://codecov.io/gh/crowding/msgpack-r/branch/master/graph/badge.svg)](https://codecov.io/gh/crowding/msgpack-r) 11 | 12 | This is a high speed [msgpack](https://msgpack.org) encoder and decoder 13 | for R, based on the [CWPack](https://github.com/clwi/CWPack) C 14 | implementation. 15 | 16 | `msgpack` is a binary data format with data structures similar to `JSON` 17 | and a compact binary encoding. It can be a drop-in replacement for 18 | `JSON` in most applications. It is designed to be fast to parse and 19 | compact to transmit and store. 20 | 21 | ## Installation 22 | 23 | From CRAN: 24 | 25 | ``` r 26 | install.packages("msgpack") 27 | ``` 28 | 29 | From Github: 30 | 31 | ``` r 32 | library(devtools) 33 | install_github("crowding/msgpack-r") 34 | ``` 35 | 36 | ## Usage 37 | 38 | ``` r 39 | library(msgpack) 40 | x <- packMsg( list(compact=TRUE, schema=0) ) 41 | x 42 | ## [1] 82 a7 63 6f 6d 70 61 63 74 c3 a6 73 63 68 65 6d 61 00 43 | dput(unpackMsg( x )) 44 | ## structure(list(compact = TRUE, schema = 0L), .Names = c("compact", 45 | ## "schema")) 46 | ``` 47 | 48 | ### Connections / Streaming 49 | 50 | Write messages one or several at a time: 51 | 52 | ``` r 53 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc 54 | writeMsg("one", conOut) 55 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut) 56 | ``` 57 | 58 | Use a `msgConnection` object to read messages one or several at a 59 | time: 60 | 61 | ``` r 62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r")) 63 | dput(readMsgs(conIn, 2)) 64 | ## list("one", 2L) 65 | dput(readMsg(conIn)) 66 | ## structure(TRUE, .Names = "buckle") 67 | dput(readMsgs(conIn)) 68 | ## list(structure(c("my", "shoe"), .Names = c("owner", "type"))) 69 | ``` 70 | 71 | ### Performance 72 | 73 | Msgpack is fast and compact. See the [benchmarking 74 | vignette](inst/doc/comparison.html). 75 | 76 | ![Plot of time taken to transmit dataset, vs size of dataset, for each 77 | encoder under four conditions.](gh/space.svg) 78 | 79 | ![Comparison of space used by each encoder to encode a test 80 | dataset.](gh/time.svg) 81 | -------------------------------------------------------------------------------- /gh/space.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | msgpack 50 | rjson 51 | csv 52 | dput 53 | RJSONIO 54 | serialize 55 | yaml 56 | msgpackR 57 | jsonlite 58 | 59 | 60 | 61 | 62 | 63 | 0 64 | 25 65 | 50 66 | 75 67 | 100 68 | 69 | 70 | 71 | 72 | 73 | MiB 74 | Space used to encode nycflights13 (shorter is better) 75 | 76 | -------------------------------------------------------------------------------- /inst/benchmarking.R: -------------------------------------------------------------------------------- 1 | ## ----- setup 2 | ensureExists <- function(packages){ 3 | installed <- library()$results[,1] 4 | needs.install <- setdiff(packages, installed) 5 | if(length(needs.install) > 0) { 6 | install.packages(needs.install) 7 | } 8 | } 9 | 10 | if (exists("clust") && !is.null(clust)) { 11 | stopCluster(clust) 12 | clust <- NULL 13 | } 14 | 15 | ensureExists(c( 16 | "knitr", 17 | "rmarkdown", 18 | "purrr", 19 | "dplyr", 20 | "magrittr", 21 | "tibble", 22 | "printr", 23 | "ggplot2", 24 | "nycflights13", 25 | "parallel", 26 | "ssh.utils", 27 | "tools", 28 | "msgpack", 29 | "jsonlite", 30 | "msgpackR", 31 | "rjson", 32 | "RJSONIO", 33 | "yaml", 34 | "broom" 35 | )) 36 | 37 | library(knitr) 38 | library(rmarkdown) 39 | library(purrr) 40 | library(dplyr) 41 | library(magrittr) 42 | library(tibble) 43 | library(printr) 44 | library(ggplot2) 45 | 46 | ## ---- dataset 47 | 48 | library(nycflights13) 49 | dataset <- as.list(as.environment("package:nycflights13")) 50 | 51 | subsample <- function(dataset, rate) { 52 | lapply(dataset, function(df) { 53 | df[1:(1 + round(nrow(df) * rate)), ] 54 | }) 55 | } 56 | 57 | onerow <- subsample(dataset, 0) 58 | 59 | ## ---- definitions 60 | 61 | remoteHost <- "paradoxus.local" 62 | port <- 42170 63 | 64 | bufferBytes <- function(f) function(con, chunk = 65536L) { 65 | buf <- rawConnection(raw(0), open="w") 66 | repeat { 67 | result <- readBin(con = con, what = "raw", n = chunk) 68 | if (length(result) == 0) break 69 | writeBin(result, buf) 70 | } 71 | f(rawConnectionValue(buf)) 72 | } 73 | 74 | bufferBytes2 <- function(f) function(con, chunk = 65536L) { 75 | buf <- msgpack:::catenator(raw(0)) # this is faster 76 | repeat { 77 | result <- readBin(con = con, what = "raw", n = chunk) 78 | if (length(result) == 0) break 79 | buf(result) 80 | } 81 | f(buf(action="read")) 82 | } 83 | 84 | bufferRawConn <- function(f) function(con, chunk = 65536L) { 85 | buf <- msgpack:::catenator(raw(0)) # this is faster 86 | repeat { 87 | result <- readBin(con = con, what = "raw", n = chunk) 88 | if (length(result) == 0) break 89 | buf(result) 90 | } 91 | f(rawConnection(buf(action="read"), open="r")) 92 | } 93 | 94 | 95 | bufferLines <- function(reader) function(con) { 96 | lines <- msgpack:::catenator(character(0)) 97 | repeat { 98 | l <- readLines(con) 99 | if (length(l) == 0) break() 100 | lines(l) 101 | } 102 | buf <- textConnection(lines(action="read")) 103 | on.exit(close(buf)) 104 | reader(buf) 105 | } 106 | 107 | forked <- function(ifParent, ifChild, catch=TRUE) { 108 | # Fork the R process. In the parent process call ifParent(), in 109 | # the child process call ifChild(). Return a list of return value of 110 | # both functions, or error messages. 111 | other <- parallel:::mcfork() 112 | if (inherits(other, "masterProcess")) { 113 | # we are child process 114 | if (catch) { 115 | tryCatch({ 116 | result <- ifChild(x) 117 | parallel:::mcexit(0, send = result) 118 | }, 119 | error = 120 | function(e) { 121 | parallel:::mcexit(1, send = list(error=e, calls=sys.calls())) 122 | } 123 | ) 124 | } else { 125 | result <- ifChild(x) 126 | parallel:::mcexit(0, send = result) 127 | } 128 | } else { 129 | # we are master process 130 | if (catch) { 131 | mine <- tryCatch(ifParent(x), 132 | error = if (catch) function(e) list(error=e, calls = sys.calls()) else NULL) 133 | } else { 134 | mine <- ifParent(x) 135 | } 136 | theirs <- tryCatch( 137 | unserialize(parallel:::readChild(other)), 138 | error = if(catch) function(e) list(error=e, calls=sys.calls()) else NULL) 139 | child_pids <- vapply(parallel:::children(), function(x) x$pid, 0) 140 | if (other$pid %in% child_pids) { 141 | warning("Killing child process ", deparse(other)) 142 | parallel:::mckill(other, tools::SIGTERM) 143 | } 144 | c(theirs, mine) 145 | } 146 | } 147 | 148 | return_result <- FALSE 149 | # Function accepts a set of sys.time readings and computes timings 150 | times <- function(start.write, end.write, 151 | start.read, end.read, 152 | bytes, result, 153 | start.parent, end.parent, ...) { 154 | c(list(extra=list(list(...))), 155 | if (missing(end.read) || missing(end.write)) 156 | list() 157 | else 158 | c(read.user = end.read[["user.self"]] - start.read[["user.self"]], 159 | read.sys = end.read[["sys.self"]] - start.read[["sys.self"]], 160 | read.elapsed = end.read[["elapsed"]] - start.read[["elapsed"]], 161 | write.user = end.write[["user.self"]] - start.write[["user.self"]], 162 | write.sys = end.write[["sys.self"]] - start.write[["sys.self"]], 163 | write.elapsed = end.write[["elapsed"]] - start.write[["elapsed"]], 164 | total.user = 165 | ( end.read[["user.self"]] + end.write[["user.self"]] 166 | - start.write[["user.self"]] - start.read[["user.self"]]), 167 | total.sys = 168 | ( end.read[["sys.self"]] + end.write[["sys.self"]] 169 | - start.write[["sys.self"]] - start.read[["sys.self"]]), 170 | total.elapsed = max(end.read[["elapsed"]] - start.read[["elapsed"]], 171 | end.write[["elapsed"]] - start.write[["elapsed"]])), 172 | bytes = if (missing(bytes)) list() else bytes, 173 | result = if (missing(result) || !return_result) list() else list(result), 174 | parent = if (missing(start.parent)) list() 175 | else c(user = (end.parent[["user.self"]] - start.parent[["user.self"]]), 176 | sys = (end.parent[["sys.self"]] - start.parent[["sys.self"]]), 177 | elapsed = (end.parent[["elapsed"]] - start.parent[["elapsed"]]))) 178 | } 179 | 180 | bytes <- function(x) { 181 | switch(mode(x), 182 | raw = { 183 | length(x) 184 | }, 185 | character = { 186 | length(x) + sum(nchar(x)) 187 | }) 188 | } 189 | 190 | timeConvert <- function(data, 191 | from = unserialize, 192 | to = function(data) serialize(data,NULL), 193 | wrap = identity, ...) { 194 | force(data) 195 | start.write <- proc.time() 196 | enc <- to(data) 197 | end.write <- proc.time() 198 | as.read <- from(enc) 199 | end.read <- proc.time() 200 | bytes <- bytes(enc) 201 | times(start.write, end.write, 202 | end.write, end.read, 203 | bytes(enc), as.read) 204 | } 205 | 206 | 207 | timeConnection <- function(..., raw = TRUE) { 208 | if (raw) { 209 | timeRawConnection(...) 210 | } else { 211 | timeTextConnection(...) 212 | } 213 | } 214 | 215 | timeRawConnection <- function(data, 216 | reader = unserialize, 217 | writer = serialize, 218 | wrap = identity, ...) { 219 | force(data) 220 | conn <- wrap(rawConnection(raw(0), open="wb")) 221 | on.exit(close(conn), add=TRUE) 222 | start.write <- proc.time() 223 | writer(data, conn) 224 | end.write <- proc.time() 225 | bytes <- rawConnectionValue(conn) 226 | conn2 <- wrap(rawConnection(bytes, open="rb")) 227 | on.exit(close(conn2), add=TRUE) 228 | start.read <- proc.time() 229 | as.read <- reader(conn2) 230 | end.read <- proc.time() 231 | times(start.write, end.write, 232 | end.write, end.read, 233 | length(bytes), as.read) 234 | } 235 | 236 | timeTextConnection <- function(data, 237 | reader = function(x) source(x, TRUE), 238 | writer = function(x, conn) dump("x", conn), 239 | wrap = identity, ...) { 240 | force(data) 241 | theText <- character(0) 242 | conn <- textConnection(NULL, open="w") 243 | on.exit(close(conn), add=TRUE) 244 | start.write <- proc.time() 245 | writer(data, conn) 246 | end.write <- proc.time() 247 | theText <- textConnectionValue(conn) 248 | nbytes <- bytes(theText) 249 | conn2 <- wrap(textConnection(theText, open="r")) 250 | on.exit(close(conn2), add=TRUE) 251 | start.read <- proc.time() 252 | as.read <- reader(conn2) 253 | end.read <- proc.time() 254 | times(start.write, end.write, 255 | end.write, end.read, 256 | nbytes, as.read) 257 | } 258 | 259 | timeFileIO <- function(data, 260 | reader = unserialize, 261 | writer = serialize, 262 | raw = TRUE, 263 | wrap = identity, ...) { 264 | force(data) 265 | fnam <- tempfile() 266 | on.exit(unlink(fnam, force=TRUE)) 267 | con <- file(fnam, open=paste0("w", if(raw) "b" else ""), raw=raw) 268 | start.write <- proc.time() 269 | writer(data, con) 270 | nbytes <- seek(con) 271 | close(con) 272 | end.write <- proc.time() 273 | con <- wrap(file(fnam, open=paste0("r", if(raw) "b" else ""), raw=raw)) 274 | on.exit(close(con), add=TRUE) 275 | start.read <- proc.time() 276 | as.read <- reader(con) 277 | end.read <- proc.time() 278 | times(start.write, end.write, 279 | start.read, end.read, 280 | nbytes, as.read) 281 | } 282 | 283 | timeSocketTransfer <- function(data, 284 | reader = unserialize, 285 | writer = serialize, 286 | wrap = identity, 287 | raw = TRUE, 288 | catch = FALSE, ...) { 289 | force(data) 290 | doRead <- function(other) { 291 | conn <- wrap(socketConnection(port = port, 292 | server = TRUE, 293 | blocking = TRUE, 294 | open = paste0("r", if(raw) "b" else ""))) 295 | on.exit(close(conn)) 296 | start.read <- proc.time() 297 | as.read <- reader(conn) 298 | end.read <- proc.time() 299 | list(start.read = start.read, 300 | end.read = end.read) 301 | } 302 | doWrite <- function(other) { 303 | Sys.sleep(0.5) 304 | conn <- wrap(socketConnection(port = port, server = FALSE, 305 | blocking = TRUE, 306 | open = paste0("w", if(raw) "b" else ""))) 307 | start.write <- proc.time() 308 | on.exit(close(conn)) 309 | writer(data, conn) 310 | flush(conn) 311 | end.write <- proc.time() 312 | list(start.write = start.write, 313 | end.write = end.write, bytes=NA) 314 | } 315 | start.parent <- proc.time() 316 | results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch) 317 | end.parent <- proc.time() 318 | 319 | do.call(times, c(results, 320 | list(start.parent = start.parent, end.parent = end.parent)), 321 | quote=TRUE) 322 | } 323 | 324 | timeFifoTransfer <- function(data, 325 | reader = unserialize, 326 | writer = serialize, 327 | wrap = identity, 328 | catch = FALSE, ...) { 329 | force(data) 330 | fnam <- tempfile() 331 | on.exit(unlink(fnam, force = TRUE)) 332 | system(paste("mkfifo", fnam)) 333 | 334 | doRead <- function(other) { 335 | Sys.sleep(0.5) 336 | conn <- wrap(fifo(fnam, open = "rb", blocking = TRUE)) 337 | on.exit({ 338 | close(conn) 339 | }) 340 | start.read <- proc.time() 341 | as.read <- reader(conn) 342 | end.read <- proc.time() 343 | list(start.read = start.read, 344 | end.read = end.read) 345 | } 346 | doWrite <- function(other) { 347 | conn <- fifo(fnam, open = "wb", blocking = TRUE) 348 | on.exit({ 349 | close(conn) 350 | }) 351 | start.write <- proc.time() 352 | writer(data, conn) 353 | flush(conn) 354 | end.write <- proc.time() 355 | list(start.write = start.write, 356 | end.write = end.write, 357 | bytes = NA) 358 | } 359 | 360 | start.parent <- proc.time() 361 | results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch) 362 | end.parent <- proc.time() 363 | 364 | do.call(times, c(results, 365 | list(start.parent = start.parent, 366 | end.parent = end.parent)), 367 | quote=TRUE) 368 | } 369 | 370 | clust <<- NULL 371 | startRemote <- function() { 372 | message("starting remote") 373 | ssh.utils::run.remote("killall R", remoteHost) 374 | clust <- makePSOCKcluster(nnodes=1, c(remoteHost), rscript = "Rscript") 375 | rtmp <- clusterCall(clust, tempdir) 376 | message("rtmp is ", rtmp) 377 | message("getwd is ", getwd()) 378 | 379 | ssh.utils::cp.remote("", "../inst/benchmarking.R", 380 | remoteHost, paste0(rtmp, "/benchmarking.R")) 381 | clusterCall(clust, options, repos = getOption("repos")) 382 | parallel::clusterCall(clust, source, paste0(rtmp, "/benchmarking.R")) 383 | clust <<- clust 384 | message("started remote") 385 | } 386 | 387 | doRemoteWrite <- function(data, host, port, wrap, raw, writer) { 388 | tryCatch({ 389 | Sys.sleep(1) 390 | conn <- wrap(socketConnection(host = host, 391 | port = port, 392 | server = FALSE, 393 | blocking = TRUE, 394 | open = paste0("w", if(raw) "b" else ""))) 395 | start.write <- proc.time() 396 | on.exit(close(conn)) 397 | writer(data, conn) 398 | flush(conn) 399 | end.write <- proc.time() 400 | list(start.write = start.write, 401 | end.write = end.write, bytes=NA) 402 | }, 403 | error = function(e) list(error=e, calls=sys.calls())) 404 | } 405 | 406 | doRemoteRead <- function(port, wrap, raw, reader) { 407 | # place logging statements in here... 408 | 409 | tryCatch({ 410 | sock <- socketConnection(port = port, 411 | server = TRUE, 412 | blocking = TRUE, 413 | open = paste0("r", if(raw) "b" else ""), 414 | timeout = 60) 415 | conn <- wrap(sock) 416 | on.exit(close(conn)) 417 | start.read <- proc.time() 418 | as.read <- reader(conn) 419 | end.read <- proc.time() 420 | list(start.read = start.read, 421 | end.read = end.read) 422 | }, 423 | error = function(e) list(error=e, calls=sys.calls()) 424 | ) 425 | } 426 | 427 | timeRemoteTransfer <- function(data, 428 | reader = bufferBytes(unserialize), 429 | writer = serialize, 430 | wrap = identity, 431 | raw = TRUE, 432 | ...) { 433 | force(data) 434 | if (is.null(clust)) { 435 | startRemote() 436 | } 437 | 438 | parallel:::sendCall(clust[[1]], 439 | doRemoteRead, 440 | list(port, wrap, raw, reader)) 441 | local_results <- tryCatch( 442 | doRemoteWrite(data, remoteHost, port, wrap, raw, writer), 443 | error = function(e) list(error = e, calls = sys.calls())) 444 | remote_results <- parallel:::recvResult(clust[[1]]) 445 | do.call(times, c(local_results, remote_results), quote=TRUE) 446 | } 447 | 448 | timeCurve <- function(dataset, 449 | method, 450 | timeout = 60, 451 | start = 0.01, 452 | max = 1, ... 453 | ) { 454 | results <- data_frame() 455 | current <- start 456 | if (missing(method)) { 457 | message("method missing???") 458 | return(data.frame()) 459 | } 460 | while (current <= max) { 461 | message(paste0("size = ", current)) 462 | data <- subsample(dataset, current) 463 | result <- method(data, ...) 464 | results <- bind_rows(results, as_tibble(c(size = current, result))) 465 | if ("total.elapsed" %in% names(result)) { 466 | if (result$total.elapsed > timeout) break() else NULL 467 | } else { 468 | break() 469 | } 470 | if (current == max) break() 471 | current = min(current * sqrt(2), max) 472 | } 473 | results 474 | } 475 | 476 | arg_df <- function(tests) (tests 477 | # produces an arg data frame: the labels in columns and the argument 478 | # data structure in column "args" 479 | %>% map(names) 480 | %>% cross_df() 481 | %>% pmap_dfr(function(...) { 482 | labels <- list(...) 483 | arglist <- pmap(list(labels, names(labels)), 484 | function(label, column) list(tests[[column]][[label]])) 485 | c(labels, args = list(list(arglist))) 486 | }) 487 | %>% mutate(args = (args # unlist the args twice 488 | %>% map(. 489 | %>% unlist(recursive = FALSE, use.names = FALSE) 490 | %>% unlist(recursive = FALSE, use.names = TRUE)))) 491 | ) 492 | 493 | `%*%` <- intersect 494 | 495 | run_tests <- function(arg_df) { 496 | pmap_dfr(arg_df, function(..., args) { 497 | labels <- list(...) 498 | message(paste0(collapse = "\n", deparse(labels, control=c()))) 499 | # what I want is a bind syntax like 500 | ## bind({ 501 | ## list(strategy = ?fun, ??args) <- args 502 | ## fun(!!args) 503 | ## }) 504 | fun <- args$strategy 505 | args <- args[names(args) != "strategy"] 506 | the_env <- list2env(args, parent = environment()) 507 | 508 | # this dance is to avoid calling do.call with a giant unnamed dataset 509 | # (which causes R to spin several minutes on writing a traceback) 510 | call <- as.call(c(quote(fun), 511 | structure(map(names(args), as.name), 512 | names = names(args)))) 513 | results <- eval(call, the_env) 514 | as_data_frame(c(labels, results)) 515 | }) 516 | } 517 | 518 | store <- function(data, dataset) { 519 | # key on all character columns the two tables hav in common 520 | existing.keys <- names(benchmarks)[map_lgl(benchmarks, is.character)] 521 | new.keys <- names(data)[map_lgl(data, is.character)] 522 | keys <- intersect(existing.keys, new.keys) 523 | message(paste("Replacing on keys:", paste0("\"", keys, "\"", collapse=", "))) 524 | if (length(keys) > 0) { 525 | dataset <- (dataset # update benchmarks 526 | %>% anti_join(data, by=keys) 527 | %>% bind_rows(data) 528 | ) 529 | } else { 530 | dataset <- data 531 | } 532 | dataset 533 | } 534 | 535 | common.options <- list( 536 | strategy = list( 537 | timeCurve = list(strategy = timeCurve, timeout = 10, start = 0.001)), 538 | dataset = list( 539 | nycflights13 = list(data = dataset))) 540 | 541 | conversion.methods <- list( 542 | method = list( 543 | convert = list(method = timeConvert)) 544 | ) 545 | 546 | synchronous.methods <- list( 547 | method = list( 548 | conn = list(method = timeConnection), 549 | file = list(method = timeFileIO) 550 | )) 551 | 552 | concurrent.methods <- list( 553 | method = list( 554 | remote = list(method = timeRemoteTransfer), 555 | socket = list(method = timeSocketTransfer), 556 | fifo = list(method = timeFifoTransfer)) 557 | ) 558 | 559 | connection.methods <- list( 560 | method = c(concurrent.methods$method, 561 | synchronous.methods$method) 562 | ) 563 | 564 | convert.common.options <- c( 565 | common.options, 566 | conversion.methods 567 | ) 568 | 569 | raw.common.options <- c( 570 | common.options, 571 | conversion.methods, 572 | raw = list('TRUE' = list(raw = TRUE)) 573 | ) 574 | 575 | text.common.options <- c( 576 | common.options, 577 | conversion.methods, 578 | raw = list('FALSE' = list(raw = FALSE)) 579 | ) 580 | 581 | all.common.options <- c( 582 | common.options, 583 | list( 584 | method = c(connection.methods$method, 585 | conversion.methods$method)) 586 | ) 587 | 588 | buffer_read_options <- function(reader = identity, 589 | raw = FALSE, 590 | buffer = (if (raw) bufferBytes2 else bufferLines)) { 591 | c(common.options, 592 | list(raw = structure(list(list(raw = raw)), names = as.character(raw))), 593 | list(method = list( 594 | conn = list(method = timeConnection, 595 | reader = reader), 596 | file = list(method = timeFileIO, 597 | reader = reader), 598 | fifo = list(method = timeFifoTransfer, 599 | reader = buffer(reader)), 600 | socket = list(method = timeSocketTransfer, 601 | reader = buffer(reader)), 602 | remote = list(method = timeRemoteTransfer, 603 | reader = buffer(reader)) 604 | )) 605 | ) 606 | } 607 | 608 | combine_opts <- function(x, ...) { 609 | for (l in list(...)) { 610 | for (n in names(l)) { 611 | if (n %in% names(x)) { 612 | x[[n]] <- c(x[[n]], l[[n]]) 613 | } else { 614 | x[[n]] <- c(x[[n]], l[[n]]) 615 | } 616 | } 617 | } 618 | x; 619 | } 620 | 621 | `%but%` <- function(l, r) { 622 | l[names(r)] <- r; 623 | l 624 | } 625 | 626 | showTimings <- function(timings, label, ...) { 627 | stats <- with(timings, c( 628 | "total.elapsed (s)" = total.elapsed, 629 | "read.cpu (s)" = read.user + read.sys, 630 | "read.elapsed (s)" = read.elapsed, 631 | "write.cpu (s)" = write.user + write.sys, 632 | "write.elapsed (s)" = write.elapsed, 633 | "data size (MB)" = bytes / 0x100000 634 | )) 635 | kable(stats, col.names = label, digits=2) 636 | } 637 | 638 | ## list(x = ?first, ??rest) -> list() 639 | 640 | ## f <- function(a, call) bind({ 641 | ## list(?first, ??rest) <- callList 642 | ## }) 643 | 644 | ## Local Variables: 645 | ## ess-r-package-info: ("msgpack" . "~/msgpack/") 646 | ## End: 647 | -------------------------------------------------------------------------------- /inst/benchmarks.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/crowding/msgpack-r/c77554d7c5fe0e10bb46d5284d451337813734e8/inst/benchmarks.RData -------------------------------------------------------------------------------- /inst/comparison.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Which Data Format is Fastest Data Format?" 3 | output: 4 | "rmarkdown::html_vignette": 5 | fig_width: 6 6 | fig_height: 4 7 | toc: true 8 | df_print: tibble 9 | self_contained: true 10 | dev: "svglite" 11 | vignette: > 12 | %\VignetteIndexEntry{Which Data Format is Fastest Data Format\?} 13 | %\VignetteEngine{R.rsp::asis} 14 | %\VignetteKeyword{IO} 15 | %\VignetteKeyword{file} 16 | %\VignetteKeyword{benchmark} 17 | %\VignetteKeyword{connection} 18 | %\VignetteKeyword{interface} 19 | %\VignetteKeyword{msgpack} 20 | %\VignetteKeyword{vignette} 21 | %\VignetteKeyword{package} 22 | --- 23 | 24 | It's good to get data into and out of R! More compatibility, more 25 | data, more better. But which is fastest? 26 | 27 | Of course, speed might not be your only criterion. Different data 28 | formats have different capabilities and purposes: 29 | 30 | * `serialize()` is your best best for getting R objects out of and 31 | back into R the way you had them, but doesn't do much for you when 32 | communicating with other systems. 33 | * `dump()` is your next best bet, while being a somewhat 34 | human-readable text format. 35 | * Neither of above are good for exchanging data with other entities 36 | you don't trust. 37 | * *JSON* is widely used on the web, but because it is based on 38 | Javascript data structures, it doesn't seamlessly represent R 39 | objects. R attributes like `dims` and `class` don't have equivalents 40 | in Javascript. Different packages also take different approaches in 41 | representing JSON objects in R, and vice versa. 42 | * *CSV* is ubiquitous and can be read by most anything but only represents 43 | tabular data. Its data types are ambiguous. 44 | * *Msgpack* has a data model compatible with JSON but is a binary 45 | format. For web applications, replacing JSON with msgpack is an easy 46 | way to save bandwidth and CPU usage. I'm considering msgpack as a 47 | convenient wire format for embedded devices. 48 | 49 | 50 | ```{R setup, cache=FALSE, echo=FALSE, message=FALSE} 51 | library(knitr) 52 | opts_chunk$set(cache = TRUE, autodep = TRUE, message = FALSE, warning = FALSE) 53 | message("cache path is '", opts_chunk$get("cache.path"), "'") 54 | ``` 55 | 56 | All that said.... Which is fastest? Which is most efficient? Under 57 | which set of arbitrarily chosen tests? I'll give more details below, 58 | but first I'll show the results. In both graphs, lower is better. 59 | 60 | ```{R, fig_width=6, fig_height=6, fig.asp = 1, cache=FALSE} 61 | load_cache("timings","timing.plot") 62 | ``` 63 | 64 | On the horizontal axis is the normalized size of the object being 65 | transmitted. On the vertical is the time taken to transmit the test 66 | dataset. There are four scenarios corresponding to how the dataset is 67 | transmitted or stored. All tests are using the default options 68 | for each respective library. 69 | 70 | This following graph shows the amount of storage used to encode the 71 | test dataset. 72 | 73 | ```{R, cache=FALSE} 74 | load_cache("datasize","datasize.plot") 75 | ``` 76 | 77 | # Benchmarking Data Export / Import 78 | 79 | This document is included as a static vignette, because running 80 | benchmark code would not be kind to package builders. 81 | 82 | I'll try picking a reference dataset and timing how long it takes to 83 | encode and decode it. I'll start with the dataset `nycflights13`, 84 | consisting of five data frames, since it's large enough to take a 85 | measurable time to process. 86 | 87 | The file `inst/benchmarking.R` contains code for test generation and 88 | timing. 89 | 90 | ```{R cache = FALSE, results = "hide"} 91 | source(system.file("benchmarking.R", package = "msgpack")) 92 | ``` 93 | 94 | I will test each encoder under the following scenarios: 95 | 96 | * `convert` is for just converting an R object to an R raw or 97 | character object and back, in memory. 98 | * `conn` also does an in-memory conversion but with R `textConnection` 99 | or `rawConnection` objects. (The `rawConnection` objects seem to 100 | work faster, even with text formats.) 101 | * `file` writes to a temp file and then reads it back. 102 | * `fifo` forks the R process and writes from one while reading 103 | from the other over a UNIX named pipe. 104 | * `tcp` forks the R process and writes from one to the other using a 105 | TCP socket. 106 | * `remote` connects to a R process on a remote machine, and sends 107 | data from the local to the remote over a TCP socket. 108 | 109 | Each test will record the OS-reported CPU time, as well as elapsed 110 | clock time for reading, writing and total. 111 | 112 | For `fifo`, `tcp`, and `remote` tests, the reading and writing may 113 | happen concurrently, so that `total.elapsed < read.elapsed + 114 | write.elapsed`. On the other hand, some packages cannot cope with 115 | reading from asynchronous connections (more on this in the 116 | details). In these cases I need to make sure all data is read into a 117 | buffer before calling the decoder. 118 | 119 | These packages turn out to vary in performance by a couple orders of 120 | magnitude. So the test harness has to dynamically vary tne input size 121 | so as to not take forever to run. It starts with a small subsample 122 | of the dataset and increases its size until the encode and decode 123 | takes 10 seconds (or the entire dataset is transmitted). (See function 124 | `testCurve` in `inst/benchmarking.R`.). 125 | 126 | On to the per-package benchmarks. 127 | 128 | ## R serialization 129 | 130 | `serialize` and `unserialize` produce faithful replications of R 131 | objects including R-specific structures like closures, environments, 132 | and attributes. But generally only R code can read it. It is useful 133 | for communicating between R processes. 134 | 135 | For instance: 136 | 137 | ```{R} 138 | unserialize.inmem <- timeConvert(dataset, 139 | unserialize, 140 | function(data) serialize(data, NULL)) 141 | showTimings(unserialize.inmem, "R serialization (in memory)") 142 | ``` 143 | 144 | This is reasonably speedy, too. 145 | 146 | However, I run into a problem if I try to transfer too large an object over 147 | a fifo or socket and read it with `unserialize().` 148 | 149 | ```{R, error=TRUE} 150 | unserialize.bad <- timeFifoTransfer(dataset, unserialize, serialize, catch=FALSE) 151 | ``` 152 | 153 | This appears to be because `unserialize` doesn't operate concurrently, 154 | in the sense that it doesn't recover from finding the end of the line 155 | having only read part of a message. Meanwhile, on my machine `fifo()` 156 | and `socketConnection()` do not seem to block even if `blocking = 157 | TRUE` is set. They always wait for at least one byte, but may return 158 | fewer than requested. So `unserialize` does not work easily with socket 159 | connections. 160 | 161 | One workaround is to exhaustively read the connection before handing 162 | off the data to `unserialize`. I handle that off screen in 163 | `bufferBytes` in `inst/benchmarking.R`. 164 | 165 | ```{R} 166 | unserialize.socket <- timeSocketTransfer(dataset, 167 | bufferBytes(unserialize), 168 | serialize) 169 | 170 | showTimings(unserialize.socket, 171 | c("R serialization (over TCP, same host, blocking read)")) 172 | ``` 173 | 174 | But this strategy only works for transmitting one object per 175 | connection, and for one connection at a time. 176 | 177 | Another way you can do it is to wrap R serialization with 178 | `msgpack::msgConnection`. This only adds a few bytes to each message, 179 | and msgpack will handle assembling complete messages. This also allows 180 | you to send several values per connection, or poll several connections 181 | connections until one of them returns a decoded message. 182 | 183 | ```{R} 184 | unserialize.wrapped_socket <- timeSocketTransfer(dataset, 185 | unserialize, 186 | serialize, 187 | wrap = msgpack::msgConnection) 188 | ``` 189 | 190 | ```{R} 191 | showTimings(unserialize.wrapped_socket, 192 | "R serialization over msgpack over TCP (same host)") 193 | ``` 194 | 195 | Surprisingly, this actually works faster. `R 196 | unserialize.wrapped_socket$total.elapsed < 197 | unserialize.socket$total.elapsed || stop()), include=FALSE`. One 198 | hypothesis that might account for this is that there are fewer write 199 | syscalls if the object is serialized into memory before sending, and 200 | fewer read syscalls if the object is prepended with its length before 201 | reading. 202 | 203 | Now I'll start collecting benchmarks systematically. Since the methods 204 | we will explore have such a wide variance of performance, we will test 205 | them with successively larger datasets, until they exceed a 206 | timeout. The `serialize.spec` data structure annotated below specifies 207 | how to test R serialization and how to label the results. See the code 208 | file `inst/benchmarking.R` for more details. 209 | 210 | ```{R serialization} 211 | # A spec is a named list of test factors. 212 | # A test factor is a named list of arguments that will be passed to the 213 | # test function at each factor level. 214 | # (Actually go look at the simpler test spec for msgpack before trying to 215 | # grok this) 216 | serialize.spec <- combine_opts( # options of the same name concatenated 217 | list( 218 | method = list( # the factor "method" 219 | convert = list(method = timeConvert)), # has a level "convert" 220 | # that passes timeConvert 221 | # to the "method" 222 | # argument of the test 223 | # function 224 | encoder = list( # the factor "encoder" 225 | serialize = list( # has a level named "serialize" 226 | writer = serialize, # with these arguments specifying, reader, writer, to, from 227 | from = unserialize, 228 | to = function(data) serialize(data, NULL)))), 229 | # Combined with these common options (that include connection-based 230 | # test methods 231 | buffer_read_options(reader = unserialize, raw = TRUE) 232 | ) 233 | 234 | # arg_df takes the above spec and takes its outer product, generating 235 | # a data frame with case labels and arguments used 236 | serialize.calls <- arg_df(serialize.spec) 237 | ``` 238 | 239 | ```{R} 240 | serialize.timings <- run_tests(serialize.calls) 241 | ``` 242 | 243 | ```{R include=FALSE, results="hide"} 244 | benchmarks <- data.frame() 245 | ``` 246 | 247 | ```{R include=FALSE, results="hide"} 248 | benchmarks <- store(serialize.timings, benchmarks) 249 | ``` 250 | 251 | ```{R} 252 | test.plot <- (ggplot( 253 | filter(benchmarks, encoder=="serialize")) 254 | + aes(x = size, y = total.elapsed, color = method) 255 | + geom_line() 256 | + scale_x_continuous(limits = c(0, 1), name = "Fraction of nycflights13 transmitted") 257 | + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)") 258 | ) 259 | test.plot + labs(title="R serialization performance") 260 | ``` 261 | 262 | Here the horizontal axis is the size of the test dataset, and the 263 | vertical axis is the time taken to transmit and receive. 264 | 265 | R's implementation of fifo connections seem to be consistently slower 266 | than other methods on the test system (OS X). The next slowest, in the 267 | case of R serialization, is the test case where data is transmitted 268 | over a gigabit link (if it matters, the reading computer is an Ubuntu 269 | box slower than the writing computer.) 270 | 271 | ## Dput/source 272 | 273 | The `dput` and `deparse` functions render R data objects to an ASCII 274 | connection in R-like syntax. The idea is that the text output "looks 275 | like" the code it takes to construct the object, to the extent that 276 | the mechanism for reading objects back in is to `eval` or `source` the 277 | text. (Hopefully one does not do this with untrusted data. A better 278 | technique may be to evaluate the data in a limited environment that 279 | just contains the needed constructors like `structure`, `list` and `c` 280 | etc.) 281 | 282 | ```{R} 283 | ## Annoyingly, the behavior of "dump" and "dput" depend on this global option. 284 | options(deparse.max.lines = NULL) 285 | 286 | #the output mimics the input 287 | dput(control=c(), 288 | list(1, "2", verb=quote(buckle), my=c("s", "h", "o", "e"))) 289 | ``` 290 | 291 | (Note how `dput` fails on transmitting language objects; if we try to 292 | eval the above we will try to evaluate "buckle" instead of getting 293 | just the name object. `as.name("buckle")`.) 294 | 295 | Performance-wise, `dput` and `source` should not be used for large 296 | datasets, because they display an O(n^2) characteristic in terms of 297 | the data size. 298 | 299 | ```{R dput} 300 | options(deparse.max.lines = NULL) 301 | 302 | dput.timings <- run_tests(arg_df(c( 303 | all.common.options, 304 | list( 305 | encoder = list( 306 | dput = list( 307 | from = function(t) eval(parse(text=t)), 308 | to = deparse, 309 | reader = function(c) eval(parse(c)), 310 | writer = function(x, c) dput(x, file=c))))))) 311 | ``` 312 | 313 | ```{R, include = FALSE, results = "hide"} 314 | benchmarks <- store(dput.timings, benchmarks) 315 | ``` 316 | 317 | ```{R} 318 | (test.plot + labs(title="dput performance")) %+% filter(benchmarks, encoder == "dput") 319 | ``` 320 | 321 | It's interesting that `deparse` (which is used for the "in-memory 322 | conversion" test method labeled `conn`) is much slower than the 323 | connection-based `dput`. 324 | 325 | ## jsonlite 326 | 327 | The `jsonlite` package includes a fromJSON and toJSON implementation. It also 328 | supports streaming reads and write, but only of records consisting of 329 | one data frame per message. Data frames are sent row-wise. 330 | 331 | Since the test dataset consists of several data frames, I will send 332 | one after the other over the lifetime of one connection. 333 | 334 | ```{R jsonlite} 335 | jsonlite_reader <- function(conn) { 336 | append <- msgpack:::catenator() 337 | jsonlite::stream_in(conn, verbose = FALSE, handler = function(x) append(list(x))) 338 | append(action="read") 339 | } 340 | 341 | jsonlite_writer <- function(l, conn) { 342 | lapply(l, function(x) jsonlite::stream_out(x, verbose=FALSE, conn)) 343 | } 344 | 345 | jsonlite.spec <- c( 346 | all.common.options, 347 | list( 348 | encoder = list( 349 | jsonlite = list( 350 | to = jsonlite::toJSON, 351 | from = jsonlite::fromJSON, 352 | reader = jsonlite_reader, 353 | writer = jsonlite_writer, 354 | raw = FALSE)))) 355 | 356 | jsonlite.timings <- run_tests(arg_df(jsonlite.spec)) 357 | ``` 358 | 359 | ```{R echo=FALSE, results="hide"} 360 | getElapsed <- function(d) filter(d, size == 1, method=="convert")$total.elapsed[[1]] 361 | getRemote <- function(d) filter(d, size == 1, method=="remote")$total.elapsed[[1]] 362 | `%digits%` <- function(x, y) format(x, digits=y) 363 | ``` 364 | jsonlite performs reasonably well, but is several times slower than 365 | serialization. 366 | 367 | ```{R, results = "hide"} 368 | benchmarks <- store(jsonlite.timings, benchmarks) 369 | ``` 370 | 371 | ```{R} 372 | (test.plot + labs(title="jsonlite performance")) %+% filter(benchmarks, encoder == "jsonlite") 373 | ``` 374 | 375 | ## Msgpack 376 | 377 | `msgpack` is the package this vignette is written for. 378 | 379 | ```{R msgpack_remote} 380 | msgpack_remote.spec <- c( 381 | common.options, 382 | list(method = list(remote = list(method=timeRemoteTransfer)), 383 | encoder = list( 384 | msgpack = list( 385 | reader = msgpack::readMsg, 386 | writer = msgpack::writeMsg, 387 | wrap = msgpack::msgConnection)))) 388 | 389 | msgpackR_remote.timings <- run_tests(arg_df(msgpack_remote.spec)) 390 | (test.plot + labs(title="msgpack")) %+% msgpackR_remote.timings 391 | 392 | ``` 393 | 394 | ```{R msgpack} 395 | msgpack.spec <- c( 396 | all.common.options, 397 | list( 398 | encoder = list( 399 | msgpack = list( 400 | wrap = msgpack::msgConnection, 401 | reader = msgpack::readMsg, 402 | writer = msgpack::writeMsg, 403 | to = msgpack::packMsg, 404 | from = msgpack::unpackMsg)))) 405 | ``` 406 | ```{R} 407 | msgpack.timings <- run_tests(arg_df(msgpack.spec)) 408 | ``` 409 | 410 | ```{R, results = "hide"} 411 | benchmarks <- store(msgpack.timings, benchmarks) 412 | ``` 413 | 414 | ```{R} 415 | (test.plot + labs(title="msgpack performance")) %+% filter(benchmarks, encoder == "msgpack") 416 | ``` 417 | 418 | Implementing the streaming-mode callbacks has helped a lot, but there 419 | is still a quadratic characteristic going on here. Need to do some 420 | profiling of memory allocation. 421 | 422 | Interestingly, msgpack is already faster than serialize for the remote 423 | use case (modulo some network glitches affecting one or two 424 | datapoints.) 425 | 426 | ## msgpackR 427 | 428 | There is an older pure-R implementation of msgpack on CRAN. One quirk is that 429 | it doesn't accept `NA` in R vectors. 430 | 431 | ```{R, error = TRUE} 432 | msgpackR::pack(c(1, 2, 3)) 433 | msgpackR::pack(c(1, 2, 3, NA)) 434 | ``` 435 | 436 | As a workaround I'll substitute out all the NA values in the dataset. 437 | 438 | ```{R} 439 | dataset_mungenull <- map(dataset, map_dfc, 440 | function(col) ifelse(is.na(col), 9999, col)) 441 | ``` 442 | 443 | ```{R msgpackR} 444 | msgpackR.spec <- c( 445 | all.common.options %but% list( 446 | dataset = list(nycflights13 = list(data = dataset_mungenull)), 447 | note = list("no NAs" = list())), 448 | list( 449 | encoder = list( 450 | msgpackR = list( 451 | from = msgpackR::unpack, 452 | to = msgpackR::pack, 453 | reader = bufferBytes(msgpackR::unpack), 454 | writer = function(data, conn) writeBin(msgpackR::pack(data), conn))))) 455 | ``` 456 | 457 | ```{R, results = "hide"} 458 | msgpackR.timings <- run_tests(arg_df(msgpackR.spec)) 459 | ``` 460 | 461 | ```{R, include=FALSE, results="hide"} 462 | benchmarks <- store(msgpackR.timings, benchmarks) 463 | ``` 464 | 465 | ```{R} 466 | (test.plot + labs(title="msgpackR performance")) %+% filter(benchmarks, encoder == "msgpackR") 467 | ``` 468 | 469 | Performance-wise, it is quite slow, but a deeper concern is that if I 470 | send larger objects and inspect the results it sometimes looks garbled, and there 471 | are intermittent errors like: 472 | 473 | ```{R error=TRUE} 474 | str(subsample(dataset_mungenull, .000044) %>% (msgpackR::pack) %>% (msgpackR::unpack)) 475 | ``` 476 | 477 | ## rjson 478 | 479 | ```{R rjson} 480 | rjson.spec <- combine_opts( 481 | list( 482 | method = list( 483 | convert = list(method = timeConvert)), 484 | encoder = list( 485 | rjson = list( 486 | from = rjson::fromJSON, 487 | to = rjson::toJSON, 488 | writer = function(data, con) writeChar(rjson::toJSON(data), con)))), 489 | buffer_read_options(reader = function(con) rjson::fromJSON(file=con), 490 | raw = TRUE, 491 | buffer = bufferRawConn)) 492 | rjson.timings <- run_tests(arg_df(rjson.spec)) 493 | ``` 494 | 495 | ```{R include=FALSE, echo=FALSE} 496 | inmemory.factor <- (getElapsed(rjson.timings) / getElapsed(msgpack.timings)) 497 | remote.factor <- (getRemote(rjson.timings) / getRemote(msgpack.timings)) 498 | ``` 499 | 500 | `rjson` is the fastest JSON implementation -- only `r inmemory.factor %digits% 2` 501 | times slower than `msgpack`, in memory, and `r remote.factor %digits% 2` 502 | times slower across the wire. It does not 503 | support streaming reads, so we must byte-buffer to read from 504 | connections. But that turns out to be quite fast as well. 505 | 506 | ```{R results="hide"} 507 | benchmarks <- store(rjson.timings, benchmarks) 508 | ``` 509 | 510 | ```{R} 511 | (test.plot + labs(title="rjson performance")) %+% filter(benchmarks, encoder == "rjson") 512 | ``` 513 | 514 | ## RJSONIO 515 | 516 | I am getting the following intermittent error in RJSONIO (i.e. this 517 | document fails to render once in a while.) 518 | 519 | ``` 520 | Error in RJSONIO::readJSONStream(con) : failed to parse json at 10240 521 | ``` 522 | 523 | ```{R RJSONIO} 524 | RJSONIO.spec <- c( 525 | all.common.options, 526 | list( 527 | encoder = list( 528 | RJSONIO = list( 529 | from = RJSONIO::fromJSON, 530 | to = RJSONIO::toJSON, 531 | writer = function(x, con) writeBin(RJSONIO::toJSON(x), con), 532 | reader = RJSONIO::readJSONStream, 533 | raw = TRUE)))) 534 | 535 | RJSONIO.timings <- run_tests(arg_df(RJSONIO.spec)) 536 | ``` 537 | ```{R, include = FALSE, results = "hide"} 538 | benchmarks <- store(RJSONIO.timings, benchmarks) 539 | ``` 540 | 541 | RJSONIO offers a function to do streaming reads from connection, but 542 | it has much overhead compared with in-memory conversion. 543 | 544 | ```{R} 545 | (test.plot + labs(title="RJSONIO performance")) %+% filter(benchmarks, encoder == "RJSONIO") 546 | ``` 547 | 548 | ## YAML 549 | 550 | YAML is kind of "JSON, but more like Markdown." Allegedly easier to 551 | read but with a more complex grammar. It's popular for config files. 552 | 553 | ```{R} 554 | cat(yaml::as.yaml(list(compact=TRUE, schema = 0))) 555 | ``` 556 | 557 | Unfortunately, the YAML package produces a protection stack overflow 558 | when decoding too large a message. 559 | 560 | ```{R, error = TRUE} 561 | oops <- yaml::yaml.load(yaml::as.yaml(subsample(dataset, 0.15))) 562 | ``` 563 | 564 | ```{R} 565 | yaml.timings <- run_tests(arg_df(c( 566 | all.common.options, 567 | list( 568 | note = list("Max 0.14" = list(max = 0.14)), 569 | encoder = list( 570 | yaml = list( 571 | reader = yaml::yaml.load_file, 572 | writer = function(data, conn) writeChar(yaml::as.yaml(data), conn), 573 | to = yaml::as.yaml, 574 | from = yaml::yaml.load, 575 | raw = TRUE)))))) 576 | ``` 577 | 578 | ```{R, results = "hide"} 579 | benchmarks <- store(yaml.timings, benchmarks) 580 | ``` 581 | 582 | ```{R} 583 | (test.plot + labs(title="yaml performance")) %+% filter(benchmarks, encoder == "yaml") 584 | ``` 585 | 586 | ## write.csv 587 | 588 | We're going here aren't we. Let's see if we can send messages with 589 | CSV. To send several CSV tables over one connection, I'll prepend to 590 | each message a header saying how many rows to read following. 591 | 592 | ```{R} 593 | writeCsvs <- function(data, con) { 594 | for (nm in names(data)) { 595 | write.csv(as_tibble(list(name = nm, nrows = nrow(data[[nm]]))), con) 596 | write.csv(data[[nm]], con, row.names = FALSE) 597 | } 598 | } 599 | 600 | readCsvs <- function(data, con) { 601 | output <- list() 602 | tryCatch( 603 | repeat { 604 | header <- read.csv(con2, nrows=1, stringsAsFactors = FALSE) 605 | output[[header$name]] <- read.csv(con2, nrows=header$nrows, stringsAsFactors = FALSE) 606 | }, 607 | error = force) 608 | output 609 | } 610 | ``` 611 | 612 | I have just had a sinking feeling that a lot of people actually build 613 | web services that talk in CSV. 614 | 615 | Unfortunately `read.table` can't cope with non-blocking connections, so 616 | I have to buffer the read into memory when reading from a fifo or socket. 617 | 618 | ```{R, results="hide"} 619 | csv.timings <- run_tests( 620 | arg_df(c( 621 | list(encoder = list(csv = list(writer = writeCsvs))), 622 | buffer_read_options(reader = readCsvs, raw = TRUE)))) 623 | ``` 624 | 625 | ```{R, results = "hide"} 626 | benchmarks <- store(csv.timings, benchmarks) 627 | ``` 628 | 629 | ```{R} 630 | (test.plot + labs(title="R csv performance")) %+% filter(benchmarks, encoder == "csv") 631 | ``` 632 | 633 | Surprisingly fast at writing to files. 634 | 635 | ## Comparison of Encoder Speed 636 | 637 | ```{R results="hide", include=FALSE} 638 | ## Derive some kind of ordering from fastest to slowest (for choosing our 639 | ## color palette): 640 | library(gnm) 641 | library(broom) 642 | library(stringr) 643 | 644 | # order the factors from high to low by ad-hoc regression model... 645 | m <- gnm(data = benchmarks, 646 | total.elapsed ~ Mult(size, method, encoder)) 647 | 648 | getEncoder <- function(term) { 649 | match <- str_match(term, "\\.(size|method|encoder)(\\w*)") 650 | match <- match[,-1] 651 | dimnames(match) <- list(NULL, list("param", "value")) 652 | as_data_frame(match) 653 | } 654 | 655 | coefs <- (m %>% tidy %>% bind_cols(., getEncoder(.$term))) 656 | 657 | encoder_order <- (coefs 658 | %>% filter(param == "encoder") 659 | %>% arrange(desc(abs(estimate))) 660 | %>% .$value 661 | ) 662 | 663 | method_order <- (coefs 664 | %>% filter(param == "method") 665 | %>% arrange(desc(abs(estimate))) 666 | %>% .$value 667 | ) 668 | ``` 669 | 670 | The most importent scenarios are conversion in memory, writing to 671 | file, writing to another process on the same host, and writing to 672 | remote host. 673 | 674 | ```{R timings, fig_width=6, fig_height=6, fig.asp = 1} 675 | library(directlabels) 676 | timing.plot <- ( benchmarks 677 | %>% subset(method %in% c("convert", "socket", "remote", "file")) 678 | %>% mutate(method = c(convert="Conversion in memory", socket="TCP (same host)", 679 | remote="TCP (over LAN)", file = "File I/O")[method]) 680 | %>% mutate(encoder = factor(encoder, levels=encoder_order)) 681 | %>% { (ggplot(.) 682 | + aes(y = total.elapsed, x = size, color = encoder, group = encoder) 683 | + facet_wrap(~method) 684 | + geom_point() 685 | + geom_line() 686 | + theme(aspect.ratio=1) 687 | + scale_x_continuous(name = "Fraction of nycflights13 transmitted") 688 | + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)") 689 | + labs(title="Elapsed time to encode, write, read, decode, by package") 690 | + expand_limits(x=1.5, y=30) 691 | + geom_dl(aes(label=encoder), 692 | debug=FALSE, 693 | method = list( 694 | "last.points", 695 | rot=45, 696 | "bumpup", 697 | dl.trans(x=x+0.2, y=y+0.2))) 698 | + guides(color=FALSE) 699 | )}) 700 | timing.plot 701 | ``` 702 | 703 | ## Comparison of Encoder Data Usage 704 | 705 | ```{R datasize} 706 | # We didn't have time to test each encoder with the full 707 | # dataset, so we'll extrapolate from the largest data set tested 708 | 709 | needed <- (benchmarks 710 | %>% filter(!is.na(bytes)) 711 | %>% select(., encoder, method, size, bytes) 712 | %>% group_by(encoder) 713 | %>% filter(row_number() == 1) 714 | %>% arrange(desc(size), .by_group=TRUE) 715 | %>% slice(1) 716 | %>% select(encoder, method) 717 | ) 718 | 719 | model <- ( needed 720 | %>% inner_join(benchmarks) 721 | %>% select(bytes, encoder, size) 722 | %>% lm(formula = bytes ~ encoder * size)) 723 | 724 | predicted <- (needed 725 | %>% mutate(size = 1) 726 | %>% augment(model, newdata=.) 727 | %>% rename(bytes = .fitted) 728 | ) 729 | 730 | datasize.plot <- (predicted 731 | %>% mutate(encoder = factor(encoder, levels=encoder_order)) 732 | %>% ggplot 733 | %>% +aes(x = reorder(encoder, bytes), y=bytes/2^20, fill=encoder) 734 | %>% +geom_col() 735 | %>% +labs(y = "MiB", x = NULL, 736 | title = "Space used to encode nycflights13 (shorter is better)") 737 | %>% +geom_dl(aes(label=encoder, group=encoder), 738 | method=c(dl.trans(y=y+0.1), "top.bumpup")) 739 | %>% +guides(fill = FALSE) 740 | %>% +theme(axis.title.x=element_blank(), 741 | axis.text.x=element_blank(), 742 | axis.ticks.x=element_blank() 743 | ) 744 | ) 745 | datasize.plot 746 | ``` 747 | 748 | The difference in size between different JSON and msgpack 749 | implementations could use some investigation. It may down to 750 | whitespace, sending data row-wise vs. col-wise, differences in the 751 | mapping between R and data format types, or bugs. 752 | 753 | ```{R} 754 | save(benchmarks, file="../inst/benchmarks.RData") 755 | ``` 756 | -------------------------------------------------------------------------------- /man/msgConnection.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/conn.R 3 | \name{msgConnection} 4 | \alias{msgConnection} 5 | \alias{close.msgConnection} 6 | \alias{partial} 7 | \alias{partial.msgConnection} 8 | \alias{readMsgs} 9 | \alias{status} 10 | \alias{status.msgConnection} 11 | \alias{seek.msgConnection} 12 | \alias{readMsg} 13 | \alias{writeMsg} 14 | \alias{writeMsgs} 15 | \title{Read and write msgpack formatted messages over R connections.} 16 | \usage{ 17 | msgConnection(con, read_size = 2^16, max_size = NA, ...) 18 | 19 | \method{close}{msgConnection}(con, ...) 20 | 21 | partial(con) 22 | 23 | \method{partial}{msgConnection}(con) 24 | 25 | readMsgs(con, n = NA, ...) 26 | 27 | status(con) 28 | 29 | \method{status}{msgConnection}(con) 30 | 31 | \method{seek}{msgConnection}(con, rw = summary(con)$mode, ...) 32 | 33 | readMsg(con, ...) 34 | 35 | writeMsg(obj, con, ...) 36 | 37 | writeMsgs(objs, con, ...) 38 | } 39 | \arguments{ 40 | \item{con}{A \link{connection} object open in binary mode.} 41 | 42 | \item{read_size}{How many bytes to read at a time.} 43 | 44 | \item{max_size}{The largest partial message to store, in 45 | bytes. \code{NA} means do not enforce a limit.} 46 | 47 | \item{...}{Unpacking options (see \link{unpackMsg}).} 48 | 49 | \item{n}{The maximum number of messages to read. A value of NA 50 | means to parse all available messages until end of input.} 51 | 52 | \item{rw}{See \code{\link[=seek]{seek()}}.} 53 | 54 | \item{obj}{An R object.} 55 | 56 | \item{objs}{A list of R objects.} 57 | } 58 | \value{ 59 | \code{msgConnection()} returns an object of class 60 | \code{msgConnection}. 61 | 62 | \code{partial(con)} returns any data that has been read ahead of 63 | the last decoded message. 64 | 65 | \code{readMsgs(con, n)} returns a list of up to \code{n} decoded messages. 66 | 67 | \code{status(con)} returns the status of msgpack decoding on the 68 | connection. A value of \code{"ok"} indicates all requested messages 69 | were read, \code{"buffer underflow"} for a non-blocking connection 70 | indicates that only part of a message has been received, and 71 | \code{"end of input"} means the last available message has been read. 72 | Other values indicate errors encountered in decoding, which will 73 | effectively halt reading. 74 | 75 | \code{seek(con)} returns the number of bytes that have been 76 | successfully read or written, depending on the mode of the 77 | connection. (Repositioning is not supported.) 78 | 79 | \code{readMsg(con)} returns one decoded message. 80 | } 81 | \description{ 82 | A \code{msgConnection} object decodes msgpack messages from an 83 | underlying R raw connection. 84 | 85 | \code{writeMsg} will work with any R connection in raw mode, but reading 86 | requires a msgConnection object. 87 | } 88 | \details{ 89 | Because msgpack messages have unpredictable length, the decoder 90 | reads ahead in chunks, then finds the boundaries between messages. 91 | Therefore when reading over a socket or a fifo it is best to use a 92 | nonblocking connection, and it will not work to mix readMsg and 93 | readBin on the same connection. 94 | 95 | If you are reading data from a not completely trusted source you 96 | should specify options \code{max_size} and \code{max_depth} (see 97 | \link{unpackOpts}). Without it, some deeply nested or cleverly designed 98 | messages can cause a stack overflow or out-of-memory error. With 99 | these options set, you will get an R exception instead. 100 | } 101 | \examples{ 102 | out <- rawConnection(raw(0), open="wb") 103 | apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row 104 | length(rawConnectionValue(out)) 105 | inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb")) 106 | close(out) 107 | readMsg(inn) 108 | readMsgs(inn, 3) 109 | length(readMsgs(inn)) 110 | close(inn) 111 | } 112 | -------------------------------------------------------------------------------- /man/packMsg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/encode.R 3 | \name{packMsg} 4 | \alias{packMsg} 5 | \alias{packMsgs} 6 | \alias{packOpts} 7 | \alias{prepack} 8 | \alias{prepack.default} 9 | \alias{prepack.data.frame} 10 | \title{Convert R objects to msgpack format.} 11 | \usage{ 12 | packMsg(x, ...) 13 | 14 | packMsgs(xs, ...) 15 | 16 | packOpts(compatible = FALSE, as_is = FALSE, use_dict = TRUE, 17 | max_size = NA, buf_size = 512) 18 | 19 | prepack(x) 20 | 21 | \method{prepack}{default}(x) 22 | 23 | \method{prepack}{data.frame}(x) 24 | } 25 | \arguments{ 26 | \item{x}{An R object, which can be null, a vector, list, 27 | environment, raw, or any combinations thereof.} 28 | 29 | \item{...}{Options passed to \code{\link[=packOpts]{packOpts()}}} 30 | 31 | \item{xs}{a list of objects to pack.} 32 | 33 | \item{compatible}{If TRUE, emitted bytes conform to version 1.0 of 34 | msgpack encoding. This means that msgpack strings are used for 35 | raw objects.} 36 | 37 | \item{as_is}{If TRUE, singletons (R primitive vectors of length 1 38 | having no names attribute) are encoded as msgpack arrays of 39 | length 1. Otherwise singletons are simplified to msgpack scalars.} 40 | 41 | \item{use_dict}{If TRUE, vectors having a "names" attribute are 42 | encoded as dicts. If false, they are encoded as arrays and the 43 | names are discarded.} 44 | 45 | \item{max_size}{The largest buffer that will be allocated.} 46 | 47 | \item{buf_size}{The initial amount of memory, in bytes, to allocate 48 | for packing each message. This will be dynamically grown if a 49 | larger message is passed, so there is little reason to change 50 | this.} 51 | } 52 | \value{ 53 | An object of class "raw". 54 | } 55 | \description{ 56 | Convert R objects to msgpack format. 57 | 58 | \code{\link[=packOpts]{packOpts()}} interprets the \code{...} argument of packMsg and 59 | packMsgs. it is not exported. 60 | } 61 | \details{ 62 | Strings are re-encoded to UTF-8 if necessary. Real numbers 63 | taking integral values may be emitted as integers to save space. 64 | 65 | Normally an R vector of length 1 will be unboxed, e.g. packMsg(1) 66 | will make a msgpack integer, but packMsg(c(1,2)) will make a 67 | msgpack list. To prevent this and produce a list of length 1 in the 68 | first case, specify \code{as_is = TRUE}. Objects of class \code{AsIs} or 69 | \code{data.frame} will always be encoded as-is. 70 | 71 | A hook for pre-processing R objects before packing is supported, by 72 | giving the object an S3 \link{class} and implementing a method 73 | \code{prepack}. For instance, \code{prepack.data.frame(x)} simply adds the 74 | \code{"AsIs"} class to \code{x}. 75 | 76 | Environment objects are written out with the keys in sorted order, 77 | but named vectors are written in the order which the entries 78 | appear. 79 | 80 | Object attributes other than \code{names} and \code{class} are ignored. 81 | } 82 | \examples{ 83 | packMsg( list(compact=TRUE, schema=0) ) 84 | x <- packMsgs(list("one", "two", "three")) 85 | unpackMsgs(x, 2) 86 | } 87 | -------------------------------------------------------------------------------- /man/unpackMsg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/decode.R 3 | \name{unpackMsg} 4 | \alias{unpackMsg} 5 | \alias{unpackMsgs} 6 | \alias{unpackOpts} 7 | \title{Decode msgpack messages.} 8 | \usage{ 9 | unpackMsg(x, ...) 10 | 11 | unpackMsgs(x, n = NA, reader = NULL, ...) 12 | 13 | unpackOpts(parent = NULL, df = TRUE, simplify = TRUE, 14 | max_size = NA, max_depth = NA, underflow_handler = NULL) 15 | } 16 | \arguments{ 17 | \item{x}{A \code{\link[=raw]{raw()}} object, perhaps read from a file or socket.} 18 | 19 | \item{...}{Options passed to \link{unpackOpts}.} 20 | 21 | \item{n}{How many messages to read. An "NA" here means to read as 22 | much as possible.} 23 | 24 | \item{reader}{For implementing connections; a function that takes 25 | no arguments and returns a raw containing more data.} 26 | 27 | \item{parent}{When an environment is given, (such as \code{\link[=emptyenv]{emptyenv()}}), 28 | unpack msgpack dicts into environment objects, with the given 29 | value as parent. This option overrides \code{use_df=TRUE}. Otherwise, 30 | unpack dicts into named vectors / lists.} 31 | 32 | \item{df}{When \code{TRUE}, convert msgpack dicts, whose elements are 33 | all arrays having the same length, into \code{\link[=data.frame]{data.frame()}}s.} 34 | 35 | \item{simplify}{If \code{TRUE}, simplify msgpack lists into primitive 36 | vectors.} 37 | 38 | \item{max_size}{The maximum length of message to decode.} 39 | 40 | \item{max_depth}{The maximum degree of nesting to support.} 41 | 42 | \item{underflow_handler}{Used internally.} 43 | } 44 | \value{ 45 | \code{unpackMsg(x)} returns one decoded message (which might be 46 | shorter than the input raw), or throws an error. 47 | 48 | \code{unpackMsgs(r, n)} returns a list \code{X} with four elements: 49 | \itemize{ 50 | \item \code{X$msgs} is a list of the messages unpacked. 51 | \item \code{X$remaining} is data remaining to be parsed. 52 | \item \code{X$status} is a status message, typically "ok", "end of input", 53 | or "buffer underflow". 54 | \item \code{x$bytes_read} the number of bytes consumed. 55 | } 56 | } 57 | \description{ 58 | \code{unpackMsg} converts a raw array containing one message in msgpack 59 | format into the corresponding R data structure. 60 | } 61 | \details{ 62 | The msgpack format does not have typed arrays, so all msgpack 63 | arrays are effectively lists from the R perspective. However, if an 64 | array containing compatibly typed elements is read, \code{unpack} will 65 | return a logical, integer, real or string vector as 66 | appropriate. This behavior is disabled with \code{simplify=FALSE}. The 67 | coercion used is more conservative than R's coercion: Integer 68 | values may be converted to real, but boolean values will not be 69 | cast to numeric, nor any types to string. If conversion from a 70 | large integer to real loses precision, a warning is printed. 71 | 72 | Msgpack also does not distinguish between \code{NA} and \code{NULL}. All nils 73 | will be decoded as NA. 74 | 75 | Strings are assumed to be UTF-8 encoded. If a msgpack string does 76 | not appear to be valid UTF-8, a warning is printed and a raw object 77 | is produced instead. 78 | 79 | Msgpack allows any type to be the key of a dict, but R only 80 | supports strings. If a non-string appears as key in a msgpack dict, 81 | it will be converted to string with \code{\link[=deparse]{deparse()}}. 82 | 83 | Extension types will be decoded as raw objects with a class like 84 | \code{"ext120"} and a warning. 85 | } 86 | \examples{ 87 | msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3, 88 | 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00)) 89 | unpackMsg(msg) 90 | x <- packMsgs(list("one", "two", "three")) 91 | unpackMsgs(x, 2) 92 | } 93 | -------------------------------------------------------------------------------- /src/cwpack.c: -------------------------------------------------------------------------------- 1 | /* CWPack - cwpack.c */ 2 | /* 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2017 Claes Wihlborg 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 8 | software and associated documentation files (the "Software"), to deal in the Software 9 | without restriction, including without limitation the rights to use, copy, modify, 10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 11 | persons to whom the Software is furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or 14 | substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING 17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | 25 | #include "cwpack.h" 26 | #include "cwpack_defines.h" 27 | 28 | 29 | 30 | /************************* C S Y S T E M L I B R A R Y ****************/ 31 | 32 | #ifdef FORCE_NO_LIBRARY 33 | 34 | static void *memcpy(void *dst, const void *src, size_t n) 35 | { 36 | unsigned int i; 37 | uint8_t *d=(uint8_t*)dst, *s=(uint8_t*)src; 38 | for (i=0; istart = pack_context->current = (uint8_t*)data; 77 | pack_context->end = pack_context->start + length; 78 | pack_context->be_compatible = false; 79 | pack_context->err_no = 0; 80 | pack_context->handle_pack_overflow = hpo; 81 | pack_context->return_code = test_byte_order(); 82 | return pack_context->return_code; 83 | } 84 | 85 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible) 86 | { 87 | pack_context->be_compatible = be_compatible; 88 | } 89 | 90 | 91 | 92 | /* Packing routines -------------------------------------------------------------------------------- */ 93 | 94 | 95 | void cw_pack_unsigned(cw_pack_context* pack_context, uint64_t i) 96 | { 97 | if (pack_context->return_code) 98 | return; 99 | 100 | if (i < 128) 101 | tryMove0(i); 102 | 103 | if (i < 256) 104 | tryMove1(0xcc, i); 105 | 106 | if (i < 0x10000L) 107 | { 108 | tryMove2(0xcd, i); 109 | } 110 | if (i < 0x100000000LL) 111 | tryMove4(0xce, i); 112 | 113 | tryMove8(0xcf,i); 114 | } 115 | 116 | 117 | void cw_pack_signed(cw_pack_context* pack_context, int64_t i) 118 | { 119 | if (pack_context->return_code) 120 | return; 121 | 122 | if (i >127) 123 | { 124 | if (i < 256) 125 | tryMove1(0xcc, i); 126 | 127 | if (i < 0x10000L) 128 | tryMove2(0xcd, i); 129 | 130 | if (i < 0x100000000LL) 131 | tryMove4(0xce, i); 132 | 133 | tryMove8(0xcf,i); 134 | } 135 | 136 | if (i >= -32) 137 | tryMove0(i); 138 | 139 | if (i >= -128) 140 | tryMove1(0xd0, i); 141 | 142 | if (i >= -32768) 143 | tryMove2(0xd1,i); 144 | 145 | if (i >= (int64_t)0xffffffff80000000LL) 146 | tryMove4(0xd2,i); 147 | 148 | tryMove8(0xd3,i); 149 | } 150 | 151 | 152 | void cw_pack_float(cw_pack_context* pack_context, float f) 153 | { 154 | if (pack_context->return_code) 155 | return; 156 | 157 | uint32_t tmp = *((uint32_t*)&f); 158 | tryMove4(0xca,tmp); 159 | } 160 | 161 | 162 | void cw_pack_double(cw_pack_context* pack_context, double d) 163 | { 164 | if (pack_context->return_code) 165 | return; 166 | 167 | uint64_t tmp = *((uint64_t*)&d); 168 | tryMove8(0xcb,tmp); 169 | } 170 | 171 | 172 | void cw_pack_real (cw_pack_context* pack_context, double d) 173 | { 174 | float f = (float)d; 175 | double df = f; 176 | if (df == d) 177 | cw_pack_float (pack_context, f); 178 | else 179 | cw_pack_double (pack_context, d); 180 | } 181 | 182 | 183 | void cw_pack_nil(cw_pack_context* pack_context) 184 | { 185 | if (pack_context->return_code) 186 | return; 187 | 188 | tryMove0(0xc0); 189 | } 190 | 191 | 192 | void cw_pack_true (cw_pack_context* pack_context) 193 | { 194 | if (pack_context->return_code) 195 | return; 196 | 197 | tryMove0(0xc3); 198 | } 199 | 200 | 201 | void cw_pack_false (cw_pack_context* pack_context) 202 | { 203 | if (pack_context->return_code) 204 | return; 205 | 206 | tryMove0(0xc2); 207 | } 208 | 209 | 210 | void cw_pack_boolean(cw_pack_context* pack_context, bool b) 211 | { 212 | if (pack_context->return_code) 213 | return; 214 | 215 | tryMove0(b? 0xc3: 0xc2); 216 | } 217 | 218 | 219 | void cw_pack_array_size(cw_pack_context* pack_context, uint32_t n) 220 | { 221 | if (pack_context->return_code) 222 | return; 223 | 224 | if (n < 16) 225 | tryMove0(0x90 | n); 226 | 227 | if (n < 65536) 228 | tryMove2(0xdc, n); 229 | 230 | tryMove4(0xdd, n); 231 | } 232 | 233 | 234 | void cw_pack_map_size(cw_pack_context* pack_context, uint32_t n) 235 | { 236 | if (pack_context->return_code) 237 | return; 238 | 239 | if (n < 16) 240 | tryMove0(0x80 | n); 241 | 242 | if (n < 65536) 243 | tryMove2(0xde, n); 244 | 245 | tryMove4(0xdf, n); 246 | } 247 | 248 | 249 | void cw_pack_str(cw_pack_context* pack_context, const char* v, uint32_t l) 250 | { 251 | if (pack_context->return_code) 252 | return; 253 | 254 | uint8_t *p; 255 | 256 | if (l < 32) // Fixstr 257 | { 258 | cw_pack_reserve_space(l+1); 259 | *p = (uint8_t)(0xa0 + l); 260 | memcpy(p+1,v,l); 261 | return; 262 | } 263 | if (l < 256 && !pack_context->be_compatible) // Str 8 264 | { 265 | cw_pack_reserve_space(l+2); 266 | *p++ = (uint8_t)(0xd9); 267 | *p = (uint8_t)(l); 268 | memcpy(p+1,v,l); 269 | return; 270 | } 271 | if (l < 65536) // Str 16 272 | { 273 | cw_pack_reserve_space(l+3) 274 | *p++ = (uint8_t)0xda; 275 | cw_store16(l); 276 | memcpy(p+2,v,l); 277 | return; 278 | } 279 | // Str 32 280 | cw_pack_reserve_space(l+5) 281 | *p++ = (uint8_t)0xdb; 282 | cw_store32(l); 283 | memcpy(p+4,v,l); 284 | return; 285 | } 286 | 287 | 288 | void cw_pack_bin(cw_pack_context* pack_context, const void* v, uint32_t l) 289 | { 290 | if (pack_context->return_code) 291 | return; 292 | 293 | if (pack_context->be_compatible) 294 | { 295 | cw_pack_str( pack_context, v, l); 296 | return; 297 | } 298 | 299 | uint8_t *p; 300 | 301 | if (l < 256) // Bin 8 302 | { 303 | cw_pack_reserve_space(l+2); 304 | *p++ = (uint8_t)(0xc4); 305 | *p = (uint8_t)(l); 306 | memcpy(p+1,v,l); 307 | return; 308 | } 309 | if (l < 65536) // Bin 16 310 | { 311 | cw_pack_reserve_space(l+3) 312 | *p++ = (uint8_t)0xc5; 313 | cw_store16(l); 314 | memcpy(p+2,v,l); 315 | return; 316 | } 317 | // Bin 32 318 | cw_pack_reserve_space(l+5) 319 | *p++ = (uint8_t)0xc6; 320 | cw_store32(l); 321 | memcpy(p+4,v,l); 322 | return; 323 | } 324 | 325 | 326 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l) 327 | { 328 | if (pack_context->return_code) 329 | return; 330 | 331 | if (pack_context->be_compatible) 332 | PACK_ERROR(CWP_RC_ILLEGAL_CALL); 333 | 334 | uint8_t *p; 335 | 336 | switch (l) 337 | { 338 | case 1: // Fixext 1 339 | cw_pack_reserve_space(3); 340 | *p++ = (uint8_t)0xd4; 341 | *p++ = (uint8_t)type; 342 | *p++ = *(uint8_t*)v; 343 | return; 344 | case 2: // Fixext 2 345 | cw_pack_reserve_space(4); 346 | *p++ = (uint8_t)0xd5; 347 | break; 348 | case 4: // Fixext 4 349 | cw_pack_reserve_space(6); 350 | *p++ = (uint8_t)0xd6; 351 | break; 352 | case 8: // Fixext 8 353 | cw_pack_reserve_space(10); 354 | *p++ = (uint8_t)0xd7; 355 | break; 356 | case 16: // Fixext16 357 | cw_pack_reserve_space(18); 358 | *p++ = (uint8_t)0xd8; 359 | break; 360 | default: 361 | if (l < 256) // Ext 8 362 | { 363 | cw_pack_reserve_space(l+3); 364 | *p++ = (uint8_t)0xc7; 365 | *p++ = (uint8_t)(l); 366 | } 367 | else if (l < 65536) // Ext 16 368 | { 369 | cw_pack_reserve_space(l+4) 370 | *p++ = (uint8_t)0xc8; 371 | cw_store16(l); 372 | p += 2; 373 | } 374 | else // Ext 32 375 | { 376 | cw_pack_reserve_space(l+6) 377 | *p++ = (uint8_t)0xc9; 378 | cw_store32(l); 379 | p += 4; 380 | } 381 | } 382 | *p++ = (uint8_t)type; 383 | memcpy(p,v,l); 384 | } 385 | 386 | 387 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l) 388 | { 389 | uint8_t *p; 390 | cw_pack_reserve_space(l); 391 | memcpy(p,v,l); 392 | } 393 | 394 | /******************************* U N P A C K **********************************/ 395 | 396 | 397 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu) 398 | { 399 | unpack_context->start = unpack_context->current = (uint8_t*)data; 400 | unpack_context->end = unpack_context->start + length; 401 | unpack_context->return_code = test_byte_order(); 402 | unpack_context->err_no = 0; 403 | unpack_context->handle_unpack_underflow = huu; 404 | return unpack_context->return_code; 405 | } 406 | 407 | 408 | /* Unpacking routines ---------------------------------------------------------- */ 409 | 410 | 411 | 412 | void cw_unpack_next (cw_unpack_context* unpack_context) 413 | { 414 | if (unpack_context->return_code) 415 | return; 416 | 417 | uint64_t tmpu64; 418 | uint32_t tmpu32; 419 | uint16_t tmpu16; 420 | uint8_t* p; 421 | 422 | #define buffer_end_return_code CWP_RC_END_OF_INPUT; 423 | cw_unpack_assert_space(1); 424 | uint8_t c = *p; 425 | #undef buffer_end_return_code 426 | #define buffer_end_return_code CWP_RC_BUFFER_UNDERFLOW; 427 | switch (c) 428 | { 429 | case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07: 430 | case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f: 431 | case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17: 432 | case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f: 433 | case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: 434 | case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f: 435 | case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: 436 | case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f: 437 | case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: 438 | case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f: 439 | case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57: 440 | case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f: 441 | case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: 442 | case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f: 443 | case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: 444 | case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f: 445 | getDDItem(CWP_ITEM_POSITIVE_INTEGER, i64, c); return; // positive fixnum 446 | case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: 447 | case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f: 448 | getDDItem(CWP_ITEM_MAP, map.size, c & 0x0f); return; // fixmap 449 | case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: 450 | case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f: 451 | getDDItem(CWP_ITEM_ARRAY, array.size, c & 0x0f); return; // fixarray 452 | case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7: 453 | case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf: 454 | case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7: 455 | case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf: 456 | getDDItem(CWP_ITEM_STR, str.length, c & 0x1f); // fixstr 457 | cw_unpack_assert_blob(str); 458 | case 0xc0: unpack_context->item.type = CWP_ITEM_NIL; return; // nil 459 | case 0xc2: getDDItem(CWP_ITEM_BOOLEAN, boolean, false); return; // false 460 | case 0xc3: getDDItem(CWP_ITEM_BOOLEAN, boolean, true); return; // true 461 | case 0xc4: getDDItem1(CWP_ITEM_BIN, bin.length, uint8_t); // bin 8 462 | cw_unpack_assert_blob(bin); 463 | case 0xc5: getDDItem2(CWP_ITEM_BIN, bin.length, uint16_t); // bin 16 464 | cw_unpack_assert_blob(bin); 465 | case 0xc6: getDDItem4(CWP_ITEM_BIN, bin.length, uint32_t); // bin 32 466 | cw_unpack_assert_blob(bin); 467 | case 0xc7: getDDItem1(CWP_ITEM_EXT, ext.length, uint8_t); // ext 8 468 | cw_unpack_assert_space(1); 469 | unpack_context->item.type = *(int8_t*)p; 470 | cw_unpack_assert_blob(ext); 471 | case 0xc8: getDDItem2(CWP_ITEM_EXT, ext.length, uint16_t); // ext 16 472 | cw_unpack_assert_space(1); 473 | unpack_context->item.type = *(int8_t*)p; 474 | cw_unpack_assert_blob(ext); 475 | case 0xc9: getDDItem4(CWP_ITEM_EXT, ext.length, uint32_t); // ext 32 476 | cw_unpack_assert_space(1); 477 | unpack_context->item.type = *(int8_t*)p; 478 | cw_unpack_assert_blob(ext); 479 | case 0xca: unpack_context->item.type = CWP_ITEM_FLOAT; // float 480 | cw_unpack_assert_space(4); 481 | cw_load32(p); 482 | unpack_context->item.as.real = *(float*)&tmpu32; return; 483 | case 0xcb: getDDItem8(CWP_ITEM_DOUBLE); return; // double 484 | case 0xcc: getDDItem1(CWP_ITEM_POSITIVE_INTEGER, u64, uint8_t); return; // unsigned int 8 485 | case 0xcd: getDDItem2(CWP_ITEM_POSITIVE_INTEGER, u64, uint16_t); return; // unsigned int 16 486 | case 0xce: getDDItem4(CWP_ITEM_POSITIVE_INTEGER, u64, uint32_t); return; // unsigned int 32 487 | case 0xcf: getDDItem8(CWP_ITEM_POSITIVE_INTEGER); return; // unsigned int 64 488 | case 0xd0: getDDItem1(CWP_ITEM_NEGATIVE_INTEGER, i64, int8_t); // signed int 8 489 | if (unpack_context->item.as.i64 >= 0) 490 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER; 491 | return; 492 | case 0xd1: getDDItem2(CWP_ITEM_NEGATIVE_INTEGER, i64, int16_t); // signed int 16 493 | if (unpack_context->item.as.i64 >= 0) 494 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER; 495 | return; 496 | case 0xd2: getDDItem4(CWP_ITEM_NEGATIVE_INTEGER, i64, int32_t); // signed int 32 497 | if (unpack_context->item.as.i64 >= 0) 498 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER; 499 | return; 500 | case 0xd3: getDDItem8(CWP_ITEM_NEGATIVE_INTEGER); // signed int 64 501 | if (unpack_context->item.as.i64 >= 0) 502 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER; 503 | return; 504 | case 0xd4: getDDItemFix(1); // fixext 1 505 | case 0xd5: getDDItemFix(2); // fixext 2 506 | case 0xd6: getDDItemFix(4); // fixext 4 507 | case 0xd7: getDDItemFix(8); // fixext 8 508 | case 0xd8: getDDItemFix(16); // fixext 16 509 | case 0xd9: getDDItem1(CWP_ITEM_STR, str.length, uint8_t); // str 8 510 | cw_unpack_assert_blob(str); 511 | case 0xda: getDDItem2(CWP_ITEM_STR, str.length, uint16_t); // str 16 512 | cw_unpack_assert_blob(str); 513 | case 0xdb: getDDItem4(CWP_ITEM_STR, str.length, uint32_t); // str 32 514 | cw_unpack_assert_blob(str); 515 | case 0xdc: getDDItem2(CWP_ITEM_ARRAY, array.size, uint16_t); return; // array 16 516 | case 0xdd: getDDItem4(CWP_ITEM_ARRAY, array.size, uint32_t); return; // array 32 517 | case 0xde: getDDItem2(CWP_ITEM_MAP, map.size, uint16_t); return; // map 16 518 | case 0xdf: getDDItem4(CWP_ITEM_MAP, map.size, uint32_t); return; // map 32 519 | case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7: 520 | case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef: 521 | case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7: 522 | case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff: 523 | getDDItem(CWP_ITEM_NEGATIVE_INTEGER, i64, (int8_t)c); return; // negative fixnum 524 | default: 525 | UNPACK_ERROR(CWP_RC_MALFORMED_INPUT) 526 | } 527 | 528 | return; 529 | } 530 | 531 | #define cw_skip_bytes(n) \ 532 | cw_unpack_assert_space((n)); \ 533 | break; 534 | 535 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count) 536 | { 537 | if (unpack_context->return_code) 538 | return; 539 | 540 | uint32_t tmpu32; 541 | uint16_t tmpu16; 542 | uint8_t* p; 543 | 544 | while (item_count-- > 0) 545 | { 546 | #undef buffer_end_return_code 547 | #define buffer_end_return_code CWP_RC_END_OF_INPUT; 548 | cw_unpack_assert_space(1); 549 | uint8_t c = *p; 550 | 551 | #undef buffer_end_return_code 552 | #define buffer_end_return_code CWP_RC_BUFFER_UNDERFLOW; 553 | switch (c) 554 | { 555 | case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07: 556 | case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f: 557 | case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17: 558 | case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f: 559 | case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27: 560 | case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f: 561 | case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37: 562 | case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f: 563 | case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47: 564 | case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f: 565 | case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57: 566 | case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f: 567 | case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67: 568 | case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f: 569 | case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77: 570 | case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f: 571 | // unsigned fixint 572 | case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7: 573 | case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef: 574 | case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7: 575 | case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff: 576 | // signed fixint 577 | case 0xc0: // nil 578 | case 0xc2: // false 579 | case 0xc3: break; // true 580 | case 0xcc: // unsigned int 8 581 | case 0xd0: cw_skip_bytes(1); // signed int 8 582 | case 0xcd: // unsigned int 16 583 | case 0xd1: // signed int 16 584 | case 0xd4: cw_skip_bytes(2); // fixext 1 585 | case 0xd5: cw_skip_bytes(3); // fixext 2 586 | case 0xca: // float 587 | case 0xce: // unsigned int 32 588 | case 0xd2: cw_skip_bytes(4); // signed int 32 589 | case 0xd6: cw_skip_bytes(5); // fixext 4 590 | case 0xcb: // double 591 | case 0xcf: // unsigned int 64 592 | case 0xd3: cw_skip_bytes(8); // signed int 64 593 | case 0xd7: cw_skip_bytes(9); // fixext 8 594 | case 0xd8: cw_skip_bytes(17); // fixext 16 595 | case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7: 596 | case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf: 597 | case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7: 598 | case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf: 599 | cw_skip_bytes(c & 0x1f); // fixstr 600 | case 0xd9: // str 8 601 | case 0xc4: // bin 8 602 | cw_unpack_assert_space(1); 603 | tmpu32 = *p; 604 | cw_skip_bytes(tmpu32); 605 | 606 | case 0xda: // str 16 607 | case 0xc5: // bin 16 608 | cw_unpack_assert_space(2); 609 | cw_load16(p); 610 | cw_skip_bytes(tmpu16); 611 | 612 | case 0xdb: // str 32 613 | case 0xc6: // bin 32 614 | cw_unpack_assert_space(4); 615 | cw_load32(p); 616 | cw_skip_bytes(tmpu32); 617 | 618 | case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87: 619 | case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f: 620 | item_count += 2*(c & 15); // FixMap 621 | break; 622 | 623 | case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97: 624 | case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f: 625 | item_count += c & 15; // FixArray 626 | break; 627 | 628 | case 0xdc: // array 16 629 | cw_unpack_assert_space(2); 630 | cw_load16(p); 631 | item_count += tmpu16; 632 | break; 633 | 634 | case 0xde: // map 16 635 | cw_unpack_assert_space(2); 636 | cw_load16(p); 637 | item_count += 2*tmpu16; 638 | break; 639 | 640 | case 0xdd: // array 32 641 | cw_unpack_assert_space(4); 642 | cw_load32(p); 643 | item_count += tmpu32; 644 | break; 645 | 646 | case 0xdf: // map 32 647 | cw_unpack_assert_space(4); 648 | cw_load32(p); 649 | item_count += 2*tmpu32; 650 | break; 651 | 652 | case 0xc7: // ext 8 653 | cw_unpack_assert_space(1); 654 | tmpu32 = *p; 655 | cw_skip_bytes(tmpu32 +1); 656 | 657 | case 0xc8: // ext 16 658 | cw_unpack_assert_space(2); 659 | cw_load16(p); 660 | cw_skip_bytes(tmpu16 +1); 661 | 662 | case 0xc9: // ext 32 663 | cw_unpack_assert_space(4); 664 | cw_load32(p); 665 | cw_skip_bytes(tmpu32 +1); 666 | 667 | default: // illegal 668 | UNPACK_ERROR(CWP_RC_MALFORMED_INPUT) 669 | } 670 | } 671 | return; 672 | } 673 | 674 | /* end cwpack.c */ 675 | -------------------------------------------------------------------------------- /src/cwpack.h: -------------------------------------------------------------------------------- 1 | /* CWPack - cwpack.h */ 2 | /* 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2017 Claes Wihlborg 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 8 | software and associated documentation files (the "Software"), to deal in the Software 9 | without restriction, including without limitation the rights to use, copy, modify, 10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 11 | persons to whom the Software is furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or 14 | substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING 17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | #ifndef CWPack_H__ 24 | #define CWPack_H__ 25 | 26 | 27 | #include 28 | #include 29 | #include "opts.h" 30 | 31 | 32 | 33 | /******************************* Return Codes *****************************/ 34 | 35 | #define CWP_RC_OK 0 36 | #define CWP_RC_END_OF_INPUT -1 37 | #define CWP_RC_BUFFER_OVERFLOW -2 38 | #define CWP_RC_BUFFER_UNDERFLOW -3 39 | #define CWP_RC_MALFORMED_INPUT -4 40 | #define CWP_RC_WRONG_BYTE_ORDER -5 41 | #define CWP_RC_ERROR_IN_HANDLER -6 42 | #define CWP_RC_ILLEGAL_CALL -7 43 | #define CWP_RC_MALLOC_ERROR -8 44 | #define CWP_RC_STOPPED -9 45 | 46 | 47 | 48 | /******************************* P A C K **********************************/ 49 | 50 | 51 | struct cw_pack_context; 52 | 53 | typedef int (*pack_overflow_handler)(struct cw_pack_context*, unsigned long); 54 | 55 | typedef struct cw_pack_context { 56 | uint8_t* start; 57 | uint8_t* current; 58 | uint8_t* end; 59 | bool be_compatible; 60 | int return_code; 61 | int err_no; /* handlers can save error here */ 62 | pack_overflow_handler handle_pack_overflow; 63 | pack_opts *opts; 64 | } cw_pack_context; 65 | 66 | 67 | int cw_pack_context_init (cw_pack_context* pack_context, void* data, unsigned long length, pack_overflow_handler hpo); 68 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible); 69 | 70 | void cw_pack_nil (cw_pack_context* pack_context); 71 | void cw_pack_true (cw_pack_context* pack_context); 72 | void cw_pack_false (cw_pack_context* pack_context); 73 | void cw_pack_boolean (cw_pack_context* pack_context, bool b); 74 | 75 | void cw_pack_signed (cw_pack_context* pack_context, int64_t i); 76 | void cw_pack_unsigned (cw_pack_context* pack_context, uint64_t i); 77 | 78 | void cw_pack_float (cw_pack_context* pack_context, float f); 79 | void cw_pack_double (cw_pack_context* pack_context, double d); 80 | void cw_pack_real (cw_pack_context* pack_context, double d); /* Pack as float if precision isn't destroyed */ 81 | 82 | void cw_pack_array_size (cw_pack_context* pack_context, uint32_t n); 83 | void cw_pack_map_size (cw_pack_context* pack_context, uint32_t n); 84 | void cw_pack_str (cw_pack_context* pack_context, const char* v, uint32_t l); 85 | void cw_pack_bin (cw_pack_context* pack_context, const void* v, uint32_t l); 86 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l); 87 | 88 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l); 89 | 90 | 91 | /***************************** U N P A C K ********************************/ 92 | 93 | 94 | typedef enum 95 | { 96 | CWP_ITEM_MIN_RESERVED_EXT = -128, 97 | CWP_ITEM_MAX_RESERVED_EXT = -1, 98 | CWP_ITEM_MIN_USER_EXT = 0, 99 | CWP_ITEM_MAX_USER_EXT = 127, 100 | CWP_ITEM_NIL = 300, 101 | CWP_ITEM_BOOLEAN = 301, 102 | CWP_ITEM_POSITIVE_INTEGER = 302, 103 | CWP_ITEM_NEGATIVE_INTEGER = 303, 104 | CWP_ITEM_FLOAT = 304, 105 | CWP_ITEM_DOUBLE = 305, 106 | CWP_ITEM_STR = 306, 107 | CWP_ITEM_BIN = 307, 108 | CWP_ITEM_ARRAY = 308, 109 | CWP_ITEM_MAP = 309, 110 | CWP_ITEM_EXT = 310, 111 | CWP_NOT_AN_ITEM = 999, 112 | } cwpack_item_types; 113 | 114 | 115 | typedef struct { 116 | const void* start; 117 | uint32_t length; 118 | } cwpack_blob; 119 | 120 | 121 | typedef struct { 122 | uint32_t size; 123 | } cwpack_container; 124 | 125 | 126 | typedef struct { 127 | cwpack_item_types type; 128 | union 129 | { 130 | bool boolean; 131 | uint64_t u64; 132 | int64_t i64; 133 | float real; 134 | double long_real; 135 | cwpack_container array; 136 | cwpack_container map; 137 | cwpack_blob str; 138 | cwpack_blob bin; 139 | cwpack_blob ext; 140 | } as; 141 | } cwpack_item; 142 | 143 | struct cw_unpack_context; 144 | 145 | typedef int (*unpack_underflow_handler)(struct cw_unpack_context*, unsigned long); 146 | 147 | typedef struct cw_unpack_context { 148 | cwpack_item item; 149 | uint8_t* start; 150 | uint8_t* current; 151 | uint8_t* end; /* logical end of buffer */ 152 | int return_code; 153 | int err_no; /* handlers can save error here */ 154 | unpack_underflow_handler handle_unpack_underflow; 155 | unpack_opts *opts; 156 | } cw_unpack_context; 157 | 158 | 159 | 160 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu); 161 | 162 | void cw_unpack_next (cw_unpack_context* unpack_context); 163 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count); 164 | 165 | 166 | #endif /* CWPack_H__ */ 167 | -------------------------------------------------------------------------------- /src/cwpack_defines.h: -------------------------------------------------------------------------------- 1 | /* CWPack - cwpack_defines.h */ 2 | /* 3 | The MIT License (MIT) 4 | 5 | Copyright (c) 2017 Claes Wihlborg 6 | 7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this 8 | software and associated documentation files (the "Software"), to deal in the Software 9 | without restriction, including without limitation the rights to use, copy, modify, 10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit 11 | persons to whom the Software is furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all copies or 14 | substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING 17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, 19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 21 | */ 22 | 23 | 24 | #ifndef cwpack_defines_h 25 | #define cwpack_defines_h 26 | 27 | 28 | 29 | /************************* A L I G N M E N T ******************************/ 30 | 31 | /* 32 | * Sometime the processor demands that integer access is to an even memory address. 33 | * In that case define FORCE_ALIGNMENT 34 | */ 35 | 36 | /* #define FORCE_ALIGNMENT */ 37 | 38 | 39 | /************************* C S Y S T E M L I B R A R Y ****************/ 40 | 41 | /* 42 | * The packer uses "memcpy" to move blobs. If you dont want to load C system library 43 | * for just that, define FORCE_NO_LIBRARY and CWPack will use an internal "memcpy" 44 | */ 45 | 46 | /* #define FORCE_NO_LIBRARY */ 47 | 48 | 49 | 50 | /************************* B Y T E O R D E R ****************************/ 51 | 52 | /* 53 | * The pack/unpack routines are written in three versions: for big endian, for 54 | * little endian and insensitive to byte order. As you can get some speed gain 55 | * if the byte order is known, we try that when we can certainly detect it. 56 | * Define COMPILE_FOR_BIG_ENDIAN or COMPILE_FOR_LITTLE_ENDIAN if you know. 57 | */ 58 | 59 | #ifndef FORCE_ALIGNMENT 60 | #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && defined(__ORDER_BIG_ENDIAN__) 61 | 62 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ 63 | #define COMPILE_FOR_BIG_ENDIAN 64 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ 65 | #define COMPILE_FOR_LITTLE_ENDIAN 66 | #endif 67 | 68 | #elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN) 69 | 70 | #if __BYTE_ORDER == __BIG_ENDIAN 71 | #define COMPILE_FOR_BIG_ENDIAN 72 | #elif __BYTE_ORDER == __LITTLE_ENDIAN 73 | #define COMPILE_FOR_LITTLE_ENDIAN 74 | #endif 75 | 76 | #elif defined(__BIG_ENDIAN__) 77 | #define COMPILE_FOR_BIG_ENDIAN 78 | 79 | #elif defined(__LITTLE_ENDIAN__) 80 | #define COMPILE_FOR_LITTLE_ENDIAN 81 | 82 | #elif defined(__i386__) || defined(__x86_64__) 83 | #define COMPILE_FOR_LITTLE_ENDIAN 84 | 85 | #endif 86 | #endif 87 | 88 | //#undef COMPILE_FOR_LITTLE_ENDIAN 89 | 90 | 91 | /******************************* P A C K **********************************/ 92 | 93 | 94 | 95 | #define PACK_ERROR(error_code) \ 96 | { \ 97 | pack_context->return_code = error_code; \ 98 | return; \ 99 | } 100 | 101 | 102 | 103 | #ifdef COMPILE_FOR_BIG_ENDIAN 104 | 105 | #define cw_store16(x) *(uint16_t*)p = *(uint16_t*)&x; 106 | #define cw_store32(x) *(uint32_t*)p = *(uint32_t*)&x; 107 | #define cw_store64(x) *(uint64_t*)p = *(uint64_t*)&x; 108 | 109 | #else /* Byte order little endian or undetermined */ 110 | 111 | #ifdef COMPILE_FOR_LITTLE_ENDIAN 112 | 113 | #define cw_store16(d) \ 114 | *(uint16_t*)p = (uint16_t)((((d) >> 8) & 0x0ff) | (d) << 8) 115 | 116 | #define cw_store32(x) \ 117 | *(uint32_t*)p = \ 118 | ((uint32_t)((((uint32_t)(x)) >> 24) | \ 119 | (((uint32_t)(x) & 0x00ff0000) >> 8) | \ 120 | (((uint32_t)(x) & 0x0000ff00) << 8) | \ 121 | (((uint32_t)(x)) << 24))); \ 122 | 123 | #define cw_store64(x) \ 124 | *(uint64_t*)p = \ 125 | ((uint64_t)( \ 126 | (((((uint64_t)(x)) >> 40) | \ 127 | (((uint64_t)(x)) << 24)) & 0x0000ff000000ff00ULL) | \ 128 | (((((uint64_t)(x)) >> 24) | \ 129 | (((uint64_t)(x)) << 40)) & 0x00ff000000ff0000ULL) | \ 130 | (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \ 131 | (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \ 132 | (((uint64_t)(x)) >> 56) | \ 133 | (((uint64_t)(x)) << 56))); \ 134 | 135 | #else /* Byte order undetermined */ 136 | 137 | #define cw_store16(d) \ 138 | *p = (uint8_t)(d >> 8); \ 139 | p[1] = (uint8_t)d; 140 | 141 | #define cw_store32(d) \ 142 | *p = (uint8_t)(d >> 24); \ 143 | p[1] = (uint8_t)(d >> 16); \ 144 | p[2] = (uint8_t)(d >> 8); \ 145 | p[3] = (uint8_t)d; 146 | 147 | #define cw_store64(z) \ 148 | *p = (uint8_t)(z >> 56); \ 149 | p[1] = (uint8_t)(z >> 48); \ 150 | p[2] = (uint8_t)(z >> 40); \ 151 | p[3] = (uint8_t)(z >> 32); \ 152 | p[4] = (uint8_t)(z >> 24); \ 153 | p[5] = (uint8_t)(z >> 16); \ 154 | p[6] = (uint8_t)(z >> 8); \ 155 | p[7] = (uint8_t)z; 156 | #endif 157 | #endif 158 | 159 | 160 | 161 | #define cw_pack_reserve_space(more) \ 162 | { \ 163 | p = pack_context->current; \ 164 | uint8_t* nyp = p + more; \ 165 | if (nyp > pack_context->end) \ 166 | { \ 167 | if (!pack_context->handle_pack_overflow) \ 168 | PACK_ERROR(CWP_RC_BUFFER_OVERFLOW) \ 169 | int rc = pack_context->handle_pack_overflow (pack_context, (unsigned long)(more)); \ 170 | if (rc) \ 171 | PACK_ERROR(rc) \ 172 | p = pack_context->current; \ 173 | nyp = p + more; \ 174 | } \ 175 | pack_context->current = nyp; \ 176 | } 177 | 178 | 179 | #define tryMove0(t) \ 180 | { \ 181 | uint8_t *p; \ 182 | cw_pack_reserve_space(1) \ 183 | *p = (uint8_t)(t); \ 184 | return; \ 185 | } 186 | 187 | #define tryMove1(t,d) \ 188 | { \ 189 | uint8_t *p; \ 190 | cw_pack_reserve_space(2) \ 191 | *p++ = (uint8_t)t; \ 192 | *p = (uint8_t)d; \ 193 | return; \ 194 | } 195 | 196 | #define tryMove2(t,d) \ 197 | { \ 198 | uint8_t *p; \ 199 | cw_pack_reserve_space(3) \ 200 | *p++ = (uint8_t)t; \ 201 | cw_store16(d); \ 202 | return; \ 203 | } 204 | 205 | #define tryMove4(t,d) \ 206 | { \ 207 | uint8_t *p; \ 208 | cw_pack_reserve_space(5) \ 209 | *p++ = (uint8_t)t; \ 210 | cw_store32(d); \ 211 | return; \ 212 | } 213 | 214 | #define tryMove8(t,d) \ 215 | { \ 216 | uint8_t *p; \ 217 | cw_pack_reserve_space(9) \ 218 | *p++ = (uint8_t)t; \ 219 | cw_store64(d); \ 220 | return; \ 221 | } 222 | 223 | 224 | 225 | 226 | /******************************* U N P A C K **********************************/ 227 | 228 | 229 | 230 | #define UNPACK_ERROR(error_code) \ 231 | { \ 232 | unpack_context->item.type = CWP_NOT_AN_ITEM; \ 233 | unpack_context->return_code = error_code; \ 234 | return; \ 235 | } 236 | 237 | 238 | 239 | #ifdef COMPILE_FOR_BIG_ENDIAN 240 | 241 | #define cw_load16(ptr) tmpu16 = *(uint16_t*)ptr; 242 | #define cw_load32(ptr) tmpu32 = *(uint32_t*)ptr; 243 | #define cw_load64(ptr) tmpu64 = *(uint64_t*)ptr; 244 | 245 | #else /* Byte order little endian or undetermined */ 246 | 247 | #ifdef COMPILE_FOR_LITTLE_ENDIAN 248 | 249 | #define cw_load16(ptr) \ 250 | tmpu16 = *(uint16_t*)ptr; \ 251 | tmpu16 = (uint16_t)((tmpu16<<8) | (tmpu16>>8)) 252 | 253 | #define cw_load32(ptr) \ 254 | tmpu32 = *(uint32_t*)ptr; \ 255 | tmpu32 = (tmpu32<<24) | ((tmpu32 & 0xff00)<<8) | \ 256 | ((tmpu32 & 0xff0000)>>8) | (tmpu32>>24) 257 | 258 | #define cw_load64(ptr) \ 259 | tmpu64 = *((uint64_t*)ptr); \ 260 | tmpu64 = ( \ 261 | (((tmpu64 >> 40) | \ 262 | (tmpu64 << 24)) & 0x0000ff000000ff00ULL) | \ 263 | (((tmpu64 >> 24) | \ 264 | (tmpu64 << 40)) & 0x00ff000000ff0000ULL) | \ 265 | ((tmpu64 & 0x000000ff00000000ULL) >> 8) | \ 266 | ((tmpu64 & 0x00000000ff000000ULL) << 8) | \ 267 | (tmpu64 >> 56) | \ 268 | (tmpu64 << 56) ) 269 | 270 | #else /* Byte order undetermined */ 271 | 272 | #define cw_load16(ptr) \ 273 | tmpu16 = (uint16_t)((*ptr++) << 8); \ 274 | tmpu16 |= (uint16_t)(*ptr++) 275 | 276 | #define cw_load32(ptr) \ 277 | tmpu32 = (uint32_t)(*ptr++ << 24); \ 278 | tmpu32 |= (uint32_t)(*ptr++ << 16); \ 279 | tmpu32 |= (uint32_t)(*ptr++ << 8); \ 280 | tmpu32 |= (uint32_t)(*ptr++) 281 | 282 | #define cw_load64(ptr) \ 283 | tmpu64 = ((uint64_t)*ptr++) << 56; \ 284 | tmpu64 |= ((uint64_t)*ptr++) << 48; \ 285 | tmpu64 |= ((uint64_t)*ptr++) << 40; \ 286 | tmpu64 |= ((uint64_t)*ptr++) << 32; \ 287 | tmpu64 |= ((uint64_t)*ptr++) << 24; \ 288 | tmpu64 |= ((uint64_t)*ptr++) << 16; \ 289 | tmpu64 |= ((uint64_t)*ptr++) << 8; \ 290 | tmpu64 |= (uint64_t)*ptr++ 291 | 292 | #endif 293 | #endif 294 | 295 | 296 | 297 | #define cw_unpack_assert_space(more) \ 298 | { \ 299 | p = unpack_context->current; \ 300 | uint8_t* nyp = p + more; \ 301 | if (nyp > unpack_context->end) \ 302 | { \ 303 | if (!unpack_context->handle_unpack_underflow) \ 304 | UNPACK_ERROR(buffer_end_return_code) \ 305 | int rc = unpack_context->handle_unpack_underflow (unpack_context, (unsigned long)(more)); \ 306 | if (rc != CWP_RC_OK) \ 307 | { \ 308 | if (rc != CWP_RC_END_OF_INPUT) \ 309 | UNPACK_ERROR(rc) \ 310 | else \ 311 | UNPACK_ERROR(buffer_end_return_code) \ 312 | } \ 313 | p = unpack_context->current; \ 314 | nyp = p + more; \ 315 | } \ 316 | unpack_context->current = nyp; \ 317 | } 318 | 319 | 320 | #define cw_unpack_assert_blob(blob) \ 321 | cw_unpack_assert_space(unpack_context->item.as.blob.length); \ 322 | unpack_context->item.as.blob.start = p; \ 323 | return; 324 | 325 | 326 | #define getDDItem(typ,var,val) \ 327 | unpack_context->item.type = typ; \ 328 | unpack_context->item.as.var = val; 329 | 330 | #define getDDItem1(typ,var,cast) \ 331 | unpack_context->item.type = typ; \ 332 | cw_unpack_assert_space(1); \ 333 | unpack_context->item.as.var = (cast)*p; 334 | 335 | #define getDDItem2(typ,var,cast) \ 336 | unpack_context->item.type = typ; \ 337 | cw_unpack_assert_space(2); \ 338 | cw_load16(p); \ 339 | unpack_context->item.as.var = (cast)tmpu16; 340 | 341 | #define getDDItem4(typ,var,cast) \ 342 | unpack_context->item.type = typ; \ 343 | cw_unpack_assert_space(4); \ 344 | cw_load32(p); \ 345 | unpack_context->item.as.var = (cast)tmpu32; 346 | 347 | #define getDDItem8(typ) \ 348 | unpack_context->item.type = typ; \ 349 | cw_unpack_assert_space(8); \ 350 | cw_load64(p); \ 351 | unpack_context->item.as.u64 = tmpu64; 352 | 353 | #define getDDItemFix(len) \ 354 | cw_unpack_assert_space(1); \ 355 | unpack_context->item.type = *(int8_t*)p; \ 356 | unpack_context->item.as.ext.length = len; \ 357 | cw_unpack_assert_blob(ext); 358 | 359 | 360 | 361 | 362 | #endif /* cwpack_defines_h */ 363 | -------------------------------------------------------------------------------- /src/decode.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include "decode.h" 3 | 4 | #include 5 | #include "utf8.h" 6 | 7 | #ifdef DEBUG 8 | #define LOGD(FMT, ...) \ 9 | LOG("%.*s " FMT, \ 10 | MIN (cxt->opts->depth, 40), \ 11 | ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>!", \ 12 | ##__VA_ARGS__); 13 | #else 14 | #define LOGD(...) NULL 15 | #endif 16 | 17 | SEXP extract_sexp(cw_unpack_context *); 18 | int handle_unpack_underflow(cw_unpack_context *, unsigned long); 19 | void cw_unpack_next_or_fail(cw_unpack_context *); 20 | SEXP make_sexp_from_context(cw_unpack_context *); 21 | 22 | SEXP extract_simplified_vector(cw_unpack_context *); 23 | SEXP fill_vector(cw_unpack_context *, SEXP, uint32_t, PROTECT_INDEX, SEXP); 24 | SEXP coerce(SEXP, SEXPTYPE); 25 | 26 | SEXP extract_env(cw_unpack_context *); 27 | SEXP new_env(SEXP parent); 28 | 29 | SEXP make_charsxp_or_null(cw_unpack_context *); 30 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *); 31 | SEXP make_raw_from_context(cw_unpack_context *); 32 | SEXP make_ext_from_context(cw_unpack_context *); 33 | 34 | SEXP always_make_charsxp_from_context(cw_unpack_context *); 35 | 36 | double i64_to_double(cw_unpack_context *, int64_t); 37 | double u64_to_double(cw_unpack_context *, uint64_t); 38 | 39 | SEXPTYPE type_to_sexptype(int t); 40 | 41 | const char *decode_item_type(cwpack_item_types); 42 | 43 | /* used to make WARN_ONCE work: once per call */ 44 | static long calls = 0; 45 | 46 | void * dataptr_or_null(SEXP x) { 47 | switch(TYPEOF(x)) { 48 | case VECSXP: return VECTOR_PTR(x); 49 | case RAWSXP: return RAW(x); 50 | case LGLSXP: return LOGICAL(x); 51 | case INTSXP: return INTEGER(x); 52 | case REALSXP: return REAL(x); 53 | case CPLXSXP: return COMPLEX(x); 54 | case STRSXP: return STRING_PTR(x); 55 | default: return 0; 56 | } 57 | } 58 | 59 | 60 | 61 | SEXP _unpack_opts(SEXP dict, 62 | SEXP use_df, 63 | SEXP simplify, 64 | SEXP package, 65 | SEXP max_size, 66 | SEXP max_depth, 67 | SEXP underflow_handler) { 68 | SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(unpack_opts))); 69 | unpack_opts *opts = (unpack_opts*)(RAW(optsxp)); 70 | opts->use_df = asLogical(use_df); 71 | opts->dict = dict; 72 | opts->simplify = asLogical(simplify); 73 | opts->package = package; 74 | 75 | opts->underflow_handler = underflow_handler; 76 | opts->buf = R_NilValue; /* the buf field is not filled out 77 | here. Whoever fills it out is responsible 78 | for managing protection. */ 79 | opts->buf_index = -1; 80 | if (!R_finite(asReal(max_size))) { 81 | opts->max_pending = ULONG_MAX; 82 | LOG("max_size passed is not finite [%s: 0x%02lx]", type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size))); 83 | } else { 84 | LOG("max_size passed is finite [%s: 0x%02lx]", type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size))); 85 | opts->max_pending = asReal(max_size); /* as real since it may be > 2^32 */ 86 | } 87 | if (!R_finite(asReal(max_depth))) { 88 | opts->max_depth = UINT_MAX; 89 | } else { 90 | opts->max_depth = asInteger(max_depth); 91 | } 92 | LOG("max_size = %f -> %lu; max_depth = %f -> %lu", asReal(max_size), opts->max_pending, asReal(max_depth), opts->max_depth); 93 | 94 | /* To make sure that R does not finalize the objects I'm using, 95 | also hang the object references off of it. 96 | 97 | The decode buffer is handled in init_unpack_context. */ 98 | SEXP refs = PROTECT(CONS(dict, 99 | CONS(package, 100 | CONS(underflow_handler, 101 | R_NilValue)))); 102 | setAttrib(optsxp, 103 | install("refs"), 104 | refs); 105 | UNPROTECT(2); 106 | return optsxp; 107 | } 108 | 109 | 110 | int init_unpack_context(cw_unpack_context *cxt, 111 | SEXP optsxp, 112 | SEXP buf, 113 | unsigned long start, 114 | unsigned long end) { 115 | assert_type(optsxp, RAWSXP); 116 | ASSERT(LENGTH(optsxp) == sizeof(unpack_opts)); 117 | 118 | unpack_opts *opts = (unpack_opts *)RAW(optsxp); 119 | opts->depth = 0; 120 | opts->pending = 0; 121 | 122 | unpack_underflow_handler huu; 123 | if (opts->underflow_handler != R_NilValue) { 124 | huu = &handle_unpack_underflow; 125 | } else { 126 | huu = NULL; 127 | } 128 | 129 | int r = cw_unpack_context_init(cxt, RAW(buf) + start, end - start, huu); 130 | if (r) 131 | error("Could not initialize cw_unpack, %s", decode_return_code(r)); 132 | 133 | cxt->opts = opts; 134 | 135 | /* Here is where we start holding a reference to the 136 | buffer... needss to be released on returning to R */ 137 | if (cxt->opts->buf_index != -1) { 138 | LOGD("optsxp previously held a buffer iwth protextion index %d", 139 | cxt->opts->buf_index); 140 | } 141 | PROTECT_WITH_INDEX(cxt->opts->buf = buf, &(cxt->opts->buf_index)); 142 | LOGD("First buffer is, ( 0x%x[%d:%d][%d] )", 143 | RAW(cxt->opts->buf), 144 | cxt->start - RAW(cxt->opts->buf), 145 | cxt->end - RAW(cxt->opts->buf), 146 | cxt->current - RAW(cxt->opts->buf)); 147 | LOGD("buffer protected at index %d", cxt->opts->buf_index); 148 | 149 | return 1; 150 | } 151 | 152 | 153 | 154 | SEXP _unpack_msg(SEXP buf, SEXP optsxp) { 155 | calls++; 156 | cw_unpack_context cxt; 157 | int protections = init_unpack_context(&cxt, optsxp, buf, 0, LENGTH(buf)); 158 | LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending); 159 | SEXP out = PROTECT(extract_sexp(&cxt)); 160 | protections++; 161 | LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending); 162 | ASSERT(cxt.opts->depth == 0); 163 | ASSERT(cxt.opts->pending == 0); 164 | UNPROTECT(protections); 165 | return out; 166 | } 167 | 168 | 169 | int handle_unpack_underflow(cw_unpack_context *cxt, unsigned long x) { 170 | unsigned long need = x - ((cxt->end - cxt->current) / sizeof(*(cxt->end))); 171 | /* bounce back to R to read X bytes */ 172 | 173 | if (cxt->opts->underflow_handler != R_NilValue) { 174 | unsigned long had = 0; 175 | while(TRUE) { 176 | assert_type(cxt->opts->underflow_handler, CLOSXP); 177 | LOGD("Asking R to read %d bytes", need - had); 178 | SEXP scurrent = PROTECT(ScalarInteger(cxt->current - RAW(cxt->opts->buf))); 179 | SEXP sneed = PROTECT(ScalarInteger(need - had)); 180 | SEXP call = PROTECT(lang3(cxt->opts->underflow_handler, 181 | scurrent, 182 | sneed)); 183 | /* result should be a list( rawsxp, start, end, current ) */ 184 | SEXP result = PROTECT(eval(call, cxt->opts->package)); 185 | assert_type(result, VECSXP); 186 | ASSERT(LENGTH(result) == 4); 187 | SEXP buf = VECTOR_ELT(result, 0); 188 | assert_type(buf, RAWSXP); 189 | unsigned long start = asInteger(VECTOR_ELT(result, 1)); 190 | unsigned long end = asInteger(VECTOR_ELT(result, 2)); 191 | unsigned long current = asInteger(VECTOR_ELT(result, 3)); 192 | unsigned long have = (end - current) - (cxt->end - cxt->current); 193 | if (have >= need) { 194 | LOGD("Swapping buffers, ( 0x%x[%d:%d][%d] -> 0x%x[%d:%d][%d] )", 195 | RAW(cxt->opts->buf), 196 | cxt->start - RAW(cxt->opts->buf), 197 | cxt->end - RAW(cxt->opts->buf), 198 | cxt->current - RAW(cxt->opts->buf), 199 | RAW(buf), start, end, current); 200 | 201 | ASSERT(cxt->opts->buf_index != -1); 202 | REPROTECT(cxt->opts->buf = buf, cxt->opts->buf_index); 203 | cxt->start = RAW(buf) + start; 204 | cxt->end = RAW(buf) + end; 205 | cxt->current = RAW(buf) + current; 206 | UNPROTECT(4); 207 | return CWP_RC_OK; 208 | } else if (have > had) { 209 | LOGD("Not enough new data read, have %d, need %d", have, need); 210 | /* try again */ 211 | had = have; 212 | UNPROTECT(4); 213 | } else { 214 | LOGD("No new data read"); 215 | /* give up */ 216 | UNPROTECT(4); 217 | return CWP_RC_END_OF_INPUT; 218 | } 219 | } 220 | } else { 221 | LOGD("No underflow handler"); 222 | return CWP_RC_END_OF_INPUT; 223 | } 224 | } 225 | 226 | 227 | 228 | SEXP _unpack_msg_partial(SEXP buf, SEXP startx, SEXP endx, SEXP optsxp) { 229 | /* Note that this must be able to cope with underflow handlers. */ 230 | calls++; /* reset WARN_ONCE */ 231 | 232 | cw_unpack_context cxt; 233 | assert_type(optsxp, RAWSXP); 234 | 235 | unsigned long start = asInteger(startx); 236 | unsigned long end = asInteger(endx); 237 | int protections = init_unpack_context(&cxt, optsxp, buf, start, end); 238 | LOG("Before: buf = 0x%x[%d:%d][%d], status = '%s'", 239 | RAW(cxt.opts->buf), 240 | cxt.start - RAW(cxt.opts->buf), 241 | cxt.end - RAW(cxt.opts->buf), 242 | cxt.current - RAW(cxt.opts->buf), 243 | decode_return_code(cxt.return_code)); 244 | 245 | SEXP msg = PROTECT(extract_sexp(&cxt)); 246 | protections++; 247 | 248 | LOG("After: buf = 0x%x[%d:%d][%d], status = '%s'", 249 | RAW(cxt.opts->buf), 250 | cxt.start - RAW(cxt.opts->buf), 251 | cxt.end - RAW(cxt.opts->buf), 252 | cxt.current - RAW(cxt.opts->buf), 253 | decode_return_code(cxt.return_code)); 254 | 255 | ASSERT(cxt.opts->depth == 0); 256 | ASSERT(cxt.opts->pending == 0); 257 | 258 | SEXP out = list3( 259 | PROTECT(ScalarString(mkChar(decode_return_code(cxt.return_code)))), 260 | msg, 261 | PROTECT(ScalarInteger((cxt.current - RAW(cxt.opts->buf)) / sizeof(Rbyte)))); 262 | 263 | protections += 2; 264 | 265 | UNPROTECT(protections); 266 | return out; 267 | } 268 | 269 | 270 | SEXP extract_sexp(cw_unpack_context *cxt) { 271 | cw_unpack_next_or_fail(cxt); 272 | return make_sexp_from_context(cxt); 273 | } 274 | 275 | 276 | void cw_unpack_next_or_fail(cw_unpack_context *cxt) { 277 | cw_unpack_next(cxt); 278 | if (cxt->return_code != CWP_RC_OK) { 279 | error("%s", 280 | decode_return_code(cxt->return_code)); 281 | } 282 | } 283 | 284 | 285 | void add_pending(cw_unpack_context *cxt, unsigned long howmany) { 286 | if (cxt->opts->depth >= cxt->opts->max_depth) { 287 | error("Message too deeply nested"); 288 | } 289 | cxt->opts->pending += howmany; 290 | if (cxt->opts->pending >= cxt->opts->max_pending) { 291 | error("Pending message of %lu bytes too long, max is %lu", cxt->opts->pending, cxt->opts->max_pending); 292 | } 293 | LOGD("depth = %u, pending = %lu", cxt->opts->depth, cxt->opts->pending); 294 | } 295 | 296 | 297 | SEXP alloc_vector(cw_unpack_context *cxt, SEXPTYPE type, unsigned long len) { 298 | add_pending(cxt, len); 299 | return allocVector(type, len); 300 | } 301 | 302 | 303 | SEXP make_sexp_from_context(cw_unpack_context *cxt) { 304 | LOGD("Making sexp from a %s", decode_item_type(cxt->item.type)); 305 | cwpack_item_types x = cxt->item.type; 306 | switch (x) { 307 | case CWP_ITEM_NIL: 308 | return ScalarLogical(NA_LOGICAL); 309 | 310 | case CWP_ITEM_BOOLEAN: 311 | return ScalarLogical(cxt->item.as.boolean); 312 | 313 | case CWP_ITEM_ARRAY: 314 | return extract_simplified_vector(cxt); 315 | 316 | case CWP_ITEM_BIN: 317 | return make_raw_from_context(cxt); 318 | 319 | case CWP_ITEM_STR: { 320 | SEXP x = PROTECT(make_charsxp_or_raw_from_context(cxt)); 321 | LOGD("x is a %s", type2char(TYPEOF(x))); 322 | if (TYPEOF(x) == CHARSXP) { 323 | UNPROTECT(1); 324 | return ScalarString(x); 325 | } else { 326 | UNPROTECT(1); 327 | return x; 328 | } 329 | } 330 | 331 | case CWP_ITEM_NEGATIVE_INTEGER: 332 | case CWP_ITEM_POSITIVE_INTEGER: 333 | case CWP_ITEM_FLOAT: 334 | case CWP_ITEM_DOUBLE: { 335 | SEXP buf; 336 | PROTECT_INDEX ix; 337 | PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type_to_sexptype(cxt->item.type), 1), 338 | &ix); 339 | buf = fill_vector(cxt, buf, 1, ix, R_NilValue); 340 | UNPROTECT(1); 341 | return buf; 342 | } 343 | 344 | case CWP_ITEM_MAP: 345 | if (TYPEOF(cxt->opts->dict) == ENVSXP) { 346 | return extract_env(cxt); 347 | } else { 348 | return extract_simplified_vector(cxt); 349 | } 350 | 351 | default: 352 | if (x <= CWP_ITEM_MAX_USER_EXT 353 | && x >= CWP_ITEM_MIN_USER_EXT) { 354 | return make_ext_from_context(cxt); 355 | } else if (x <= CWP_ITEM_MAX_RESERVED_EXT 356 | && x >= CWP_ITEM_MIN_RESERVED_EXT) { 357 | return make_ext_from_context(cxt); 358 | } else { 359 | error("Unsupported item: %s", decode_item_type(cxt->item.type)); 360 | } 361 | } 362 | } 363 | 364 | 365 | SEXP extract_simplified_vector(cw_unpack_context *cxt) { 366 | int has_names = 0; 367 | switch(cxt->item.type) { 368 | case CWP_ITEM_ARRAY: 369 | has_names = 0; break; 370 | case CWP_ITEM_MAP: 371 | has_names = 1; break; 372 | default: 373 | error("Can't extract vector from a %s", decode_item_type(cxt->item.type)); 374 | } 375 | 376 | uint32_t len = cxt->item.as.array.size; 377 | LOGD("Extracting array of %d elements, simplifying", len); 378 | SEXP buf; 379 | PROTECT_INDEX ix; 380 | int protections = 0; 381 | 382 | SEXP names = R_NilValue; 383 | if(has_names) { 384 | LOGD("Allocating names[%d]", len); 385 | PROTECT(names = alloc_vector(cxt, STRSXP, len)); 386 | protections++; 387 | } 388 | 389 | /* Peek at the first item, allocate a buffer */ 390 | if (len > 0) { 391 | if(has_names) { 392 | cw_unpack_next_or_fail(cxt); 393 | SEXP name = PROTECT(always_make_charsxp_from_context(cxt)); 394 | SET_STRING_ELT(names, 0, name); 395 | UNPROTECT(1); 396 | } 397 | cw_unpack_next_or_fail(cxt); 398 | LOGD("first is a %s", decode_item_type(cxt->item.type)); 399 | 400 | SEXPTYPE type; 401 | if (cxt->opts->simplify) { 402 | type = type_to_sexptype(cxt->item.type); 403 | } else { 404 | type = VECSXP; 405 | } 406 | PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type, len), 407 | &ix); 408 | protections++; 409 | LOGD("Allocated %s vector of length %d, protection id %d", 410 | type2char(type), len, ix); 411 | 412 | buf = fill_vector(cxt, buf, len, ix, names); 413 | 414 | } else { /* len == 0 */ 415 | buf = allocVector(LGLSXP, 0); 416 | } 417 | 418 | UNPROTECT(protections); 419 | return buf; 420 | } 421 | 422 | 423 | SEXP fill_vector(cw_unpack_context *cxt, SEXP buf, uint32_t len, 424 | PROTECT_INDEX ix, SEXP names) { 425 | cxt->opts->depth++; 426 | 427 | /* when called, we have the first item "primed" for us in cxt. Also 428 | buf has been protected. */ 429 | 430 | int non_null_seen = 0; 431 | int output_data_frame = ((names != R_NilValue) 432 | && cxt->opts->use_df 433 | && cxt->item.type == CWP_ITEM_ARRAY 434 | && len > 0); 435 | long df_rows = 0; 436 | if (output_data_frame) { 437 | LOGD("Might be a data frame"); 438 | df_rows = cxt->item.as.array.size; 439 | } 440 | 441 | for (int i = 0; i < len; i++) { /* loop once per item read */ 442 | if (i > 0) { /* skip what we did when we peeked at first item */ 443 | if (names != R_NilValue) { 444 | LOGD("Reading name %d", i); 445 | cw_unpack_next_or_fail(cxt); 446 | SEXP c = PROTECT(always_make_charsxp_from_context(cxt)); 447 | SET_STRING_ELT(names, i, c); 448 | UNPROTECT(1); 449 | } 450 | 451 | LOGD("Unpacking item %d", i); 452 | cw_unpack_next_or_fail(cxt); 453 | 454 | if (output_data_frame 455 | && (cxt->item.type != CWP_ITEM_ARRAY 456 | || cxt->item.as.array.size != df_rows)) { 457 | LOGD("Not a data frame after all"); 458 | output_data_frame = 0; 459 | } 460 | } 461 | 462 | cxt->opts->pending -= (names == R_NilValue ? 1 : 2); 463 | 464 | UNPACK_VALUE: /* goto here to "try again" after coercion */ 465 | LOGD("Storing item %d, a %s, into a %s", 466 | i, decode_item_type(cxt->item.type), type2char(TYPEOF(buf))); 467 | 468 | switch (TYPEOF(buf)) { 469 | 470 | case LGLSXP: 471 | 472 | switch(cxt->item.type) { 473 | 474 | case CWP_ITEM_NIL: 475 | LOGICAL(buf)[i] = NA_LOGICAL; 476 | break; 477 | 478 | case CWP_ITEM_BOOLEAN: 479 | non_null_seen = 1; 480 | LOGICAL(buf)[i] = (cxt->item.as.boolean ? TRUE : FALSE); 481 | break; 482 | 483 | default: 484 | if (non_null_seen) { 485 | REPROTECT(buf = coerce(buf, VECSXP), ix); 486 | } else { 487 | non_null_seen = 1; 488 | REPROTECT(buf = coerce(buf, type_to_sexptype(cxt->item.type)), ix); 489 | } 490 | goto UNPACK_VALUE; 491 | } 492 | break; 493 | 494 | 495 | case INTSXP: 496 | 497 | switch(cxt->item.type) { 498 | 499 | case CWP_ITEM_NIL: 500 | INTEGER(buf)[i] = NA_INTEGER; 501 | break; 502 | 503 | 504 | case CWP_ITEM_NEGATIVE_INTEGER: 505 | if (cxt->item.as.i64 == NA_INTEGER 506 | || cxt->item.as.i64 < INT_MIN) { 507 | REPROTECT(buf = coerce(buf, REALSXP), ix); 508 | goto UNPACK_VALUE; 509 | } else { 510 | INTEGER(buf)[i] = cxt->item.as.i64; 511 | } 512 | break; 513 | 514 | case CWP_ITEM_POSITIVE_INTEGER: 515 | if (cxt->item.as.u64 > INT_MAX) { 516 | REPROTECT(buf = coerce(buf, REALSXP), ix); 517 | goto UNPACK_VALUE; 518 | } else { 519 | INTEGER(buf)[i] = cxt->item.as.u64; 520 | } 521 | break; 522 | 523 | case CWP_ITEM_FLOAT: 524 | case CWP_ITEM_DOUBLE: 525 | REPROTECT(buf = coerce(buf, REALSXP), ix); 526 | goto UNPACK_VALUE; 527 | 528 | default: 529 | REPROTECT(buf = coerce(buf, VECSXP), ix); 530 | goto UNPACK_VALUE; 531 | } 532 | 533 | break; 534 | 535 | 536 | case REALSXP: 537 | switch(cxt->item.type) { 538 | 539 | case CWP_ITEM_NIL: 540 | REAL(buf)[i] = NA_REAL; 541 | break; 542 | 543 | case CWP_ITEM_NEGATIVE_INTEGER: 544 | REAL(buf)[i] = i64_to_double(cxt, cxt->item.as.i64); 545 | break; 546 | 547 | case CWP_ITEM_POSITIVE_INTEGER: 548 | REAL(buf)[i] = u64_to_double(cxt, cxt->item.as.u64); 549 | break; 550 | 551 | case CWP_ITEM_FLOAT: 552 | REAL(buf)[i] = cxt->item.as.real; 553 | break; 554 | 555 | case CWP_ITEM_DOUBLE: 556 | REAL(buf)[i] = cxt->item.as.long_real; 557 | break; 558 | 559 | default: 560 | REPROTECT(buf = coerce(buf, VECSXP), ix); 561 | goto UNPACK_VALUE; 562 | } 563 | 564 | break; 565 | 566 | 567 | case STRSXP: 568 | switch(cxt->item.type) { 569 | 570 | case CWP_ITEM_NIL: 571 | SET_STRING_ELT(buf, i, NA_STRING); 572 | break; 573 | 574 | case CWP_ITEM_STR: 575 | { 576 | SEXP s = PROTECT(make_charsxp_or_null(cxt)); 577 | if (s == R_NilValue) { 578 | REPROTECT(buf = coerce(buf, VECSXP), ix); 579 | UNPROTECT(1); 580 | goto UNPACK_VALUE; 581 | } else { 582 | SET_STRING_ELT(buf, i, s); 583 | } 584 | UNPROTECT(1); 585 | } 586 | break; 587 | 588 | default: 589 | REPROTECT(buf = coerce(buf, VECSXP), ix); 590 | goto UNPACK_VALUE; 591 | } 592 | 593 | break; 594 | 595 | 596 | case VECSXP: 597 | SET_VECTOR_ELT(buf, i, make_sexp_from_context(cxt)); 598 | 599 | break; 600 | 601 | 602 | default: 603 | error("Don't know how to fill out a %s (this shouldn't happen)", 604 | type2char(TYPEOF(buf))); 605 | 606 | } /* switch(TYPEOF(buf)) */ 607 | 608 | } /* for(i = ... */ 609 | 610 | if (names != R_NilValue) { 611 | setAttrib(buf, R_NamesSymbol, names); 612 | } 613 | 614 | if (output_data_frame) { 615 | LOGD("Making a data frame"); 616 | SEXP nrows = PROTECT(ScalarInteger(df_rows)); 617 | SEXP call = PROTECT(lang2(install(".set_row_names"), nrows)); 618 | SEXP rn = PROTECT(eval(call, R_BaseEnv)); 619 | setAttrib(buf, R_RowNamesSymbol, rn); 620 | setAttrib(buf, R_ClassSymbol, ScalarString(mkChar("data.frame"))); 621 | UNPROTECT(3); 622 | } 623 | 624 | LOGD("Returning a %s", type2char(TYPEOF(buf))); 625 | cxt->opts->depth--; 626 | return buf; 627 | } /* SEXP fill_vector(... */ 628 | 629 | 630 | SEXP coerce(SEXP buf, SEXPTYPE type) { 631 | LOG("Coercing %s to %s", type2char(TYPEOF(buf)), type2char(type)); 632 | return coerceVector(buf, type); 633 | } 634 | 635 | 636 | SEXP extract_env(cw_unpack_context *cxt) { 637 | ASSERT(cxt->item.type == CWP_ITEM_MAP); 638 | ASSERT(TYPEOF(cxt->opts->dict) == ENVSXP); 639 | int nkeys = cxt->item.as.map.size; 640 | add_pending(cxt, 2*nkeys); 641 | SEXP env = PROTECT(new_env(cxt->opts->dict)); 642 | for (int i = 0; i < nkeys; i++) { 643 | cw_unpack_next_or_fail(cxt); 644 | cxt->opts->pending--; 645 | SEXP key = PROTECT(always_make_charsxp_from_context(cxt)); 646 | SEXP value = PROTECT(extract_sexp(cxt)); 647 | cxt->opts->pending--; 648 | if (LENGTH(key) == 0) { 649 | WARN_ONCE("Item with empty name was discarded"); 650 | } else { 651 | SEXP sym = PROTECT(installChar(key)); 652 | 653 | if (sym == R_MissingArg || sym == R_DotsSymbol || DDVAL(sym)) { 654 | WARN_ONCE("Item with key `%s` was discarded", CHAR(PRINTNAME(sym))); 655 | } else { 656 | LOGD("storing variable `%s` into a %s", 657 | CHAR(PRINTNAME(sym)), type2char(TYPEOF(env))); 658 | defineVar(sym, value, env); 659 | } 660 | UNPROTECT(1); 661 | } 662 | UNPROTECT(2); 663 | } 664 | UNPROTECT(1); 665 | return env; 666 | } 667 | 668 | 669 | SEXP new_env(SEXP parent) { 670 | ASSERT(TYPEOF(parent) == ENVSXP); 671 | SEXP call = PROTECT(lang3(install("new.env"), ScalarLogical(TRUE), parent)); 672 | SEXP x = eval(call, R_BaseEnv); 673 | UNPROTECT(1); 674 | return x; 675 | } 676 | 677 | 678 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *cxt) { 679 | ASSERT(cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN); 680 | SEXP c = PROTECT(make_charsxp_or_null(cxt)); 681 | if (c == R_NilValue) { 682 | UNPROTECT(1); 683 | return make_raw_from_context(cxt); 684 | } else { 685 | UNPROTECT(1); 686 | return c; 687 | } 688 | } 689 | 690 | 691 | SEXP make_raw_from_context(cw_unpack_context *cxt) { 692 | uint32_t len = cxt->item.as.str.length; 693 | SEXP out = PROTECT(allocVector(RAWSXP, len)); 694 | memcpy(RAW(out), cxt->item.as.str.start, len * sizeof(uint8_t)); 695 | UNPROTECT(1); 696 | return out; 697 | } 698 | 699 | 700 | SEXP make_ext_from_context(cw_unpack_context *cxt) { 701 | char typename[32]; 702 | WARN_ONCE("Extension type %d decoded as raw", cxt->item.type); 703 | snprintf(typename, sizeof(typename)/sizeof(char), "ext%d", cxt->item.type); 704 | SEXP out = PROTECT(make_raw_from_context(cxt)); 705 | setAttrib(out, R_ClassSymbol, ScalarString(mkChar(typename))); 706 | UNPROTECT(1); 707 | return out; 708 | } 709 | 710 | 711 | SEXP make_charsxp_or_null(cw_unpack_context *cxt) { 712 | if (cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN) { 713 | const char *buf = cxt->item.as.str.start; 714 | int len = cxt->item.as.str.length; 715 | int non_ascii = 0; 716 | 717 | for (int i = 0; i < len; i++) { 718 | if (buf[i] == 0) { 719 | WARN_ONCE("Embedded null in string, returning raw instead."); 720 | return R_NilValue; 721 | } 722 | if (buf[i] & 0x80) { 723 | non_ascii = 1; 724 | } 725 | } 726 | 727 | if (non_ascii) { 728 | /* maybe overly paranoid but I didn't see mkCharLenCE doing such 729 | verification */ 730 | if (!verify_utf8(buf, len)) { 731 | WARN_ONCE("String is not valid UTF-8, returning raw instead."); 732 | return R_NilValue; 733 | } 734 | } 735 | 736 | return mkCharLenCE(buf, len, CE_UTF8); 737 | } else if (cxt->item.type == CWP_ITEM_NIL) { 738 | return NA_STRING; 739 | } else { 740 | return R_NilValue; 741 | } 742 | } 743 | 744 | 745 | SEXP always_make_charsxp_from_context(cw_unpack_context *cxt) { 746 | SEXP x = PROTECT(make_charsxp_or_null(cxt)); 747 | if (x == R_NilValue) { 748 | WARN_ONCE("Non-string used as key was coerced to string"); 749 | SEXP item = PROTECT(make_sexp_from_context(cxt)); 750 | SEXP call = PROTECT(lang2(install("repr"), item)); 751 | SEXP convert = PROTECT(eval(call, cxt->opts->package)); 752 | LOG("repr returned a %s", type2char(TYPEOF(convert))); 753 | SEXP chr = STRING_ELT(convert, 0); 754 | UNPROTECT(4); 755 | return chr; 756 | } else { 757 | UNPROTECT(1); 758 | return x; 759 | } 760 | } 761 | 762 | 763 | double i64_to_double(cw_unpack_context *cxt, int64_t x) { 764 | double xx = x; 765 | if ((int64_t)xx != x) { 766 | WARN_ONCE("Cast of integer %" PRId64 " to double loses precision", x); 767 | }; 768 | return xx; 769 | } 770 | 771 | 772 | double u64_to_double(cw_unpack_context *cxt, uint64_t x) { 773 | double xx = x; 774 | if ((uint64_t)xx != x) { 775 | WARN_ONCE("Cast of integer %" PRIu64 " to double loses precision", x); 776 | } 777 | return xx; 778 | } 779 | 780 | 781 | SEXPTYPE type_to_sexptype(int t) { 782 | switch(t) { 783 | case CWP_ITEM_NIL: 784 | case CWP_ITEM_BOOLEAN: return LGLSXP; 785 | case CWP_ITEM_POSITIVE_INTEGER: 786 | case CWP_ITEM_NEGATIVE_INTEGER: return INTSXP; 787 | case CWP_ITEM_FLOAT: 788 | case CWP_ITEM_DOUBLE: return REALSXP; 789 | case CWP_ITEM_STR: return STRSXP; 790 | default: 791 | { 792 | if (t <= CWP_ITEM_MAX_USER_EXT 793 | && t >= CWP_ITEM_MIN_USER_EXT) { 794 | return RAWSXP; 795 | } else if (t <= CWP_ITEM_MAX_RESERVED_EXT 796 | && t >= CWP_ITEM_MIN_RESERVED_EXT) { 797 | return RAWSXP; 798 | } else { 799 | return VECSXP; 800 | } 801 | } 802 | } 803 | } 804 | 805 | 806 | const char *decode_item_type(cwpack_item_types x) { 807 | switch(x) { 808 | case CWP_ITEM_NIL: return "nil"; 809 | case CWP_ITEM_BOOLEAN: return "boolean"; 810 | case CWP_ITEM_POSITIVE_INTEGER: return "positive integer"; 811 | case CWP_ITEM_NEGATIVE_INTEGER: return "negative integer"; 812 | case CWP_ITEM_FLOAT: return "float"; 813 | case CWP_ITEM_DOUBLE: return "double"; 814 | case CWP_ITEM_STR: return "string"; 815 | case CWP_ITEM_BIN: return "binary"; 816 | case CWP_ITEM_ARRAY: return "array"; 817 | case CWP_ITEM_MAP: return "map"; 818 | case CWP_NOT_AN_ITEM: return "not an item"; 819 | default: 820 | if (x <= CWP_ITEM_MAX_USER_EXT 821 | && x >= CWP_ITEM_MIN_USER_EXT) { 822 | return "CWP_ITEM_USER_EXT(n)"; 823 | } else if (x <= CWP_ITEM_MAX_RESERVED_EXT 824 | && x >= CWP_ITEM_MIN_RESERVED_EXT) { 825 | return "CWP_ITEM_RESERVED_EXT(n)"; 826 | } else { 827 | return "???"; 828 | } 829 | } 830 | } 831 | -------------------------------------------------------------------------------- /src/decode.h: -------------------------------------------------------------------------------- 1 | #ifndef DECODE_H 2 | #define DECODE_H 3 | 4 | #include "vadr.h" 5 | #include "cwpack.h" 6 | 7 | SEXP extract_sexp(cw_unpack_context *); 8 | int init_unpack_context(cw_unpack_context *, SEXP, SEXP, unsigned long, unsigned long); 9 | 10 | SEXP _unpack_msg(SEXP, SEXP); 11 | SEXP _unpack_msg_partial(SEXP, SEXP, SEXP, SEXP); 12 | SEXP _unpack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 13 | 14 | #endif 15 | -------------------------------------------------------------------------------- /src/encode.c: -------------------------------------------------------------------------------- 1 | #include "cwpack.h" 2 | #include "vadr.h" 3 | 4 | int handle_overflow(cw_pack_context *, unsigned long); 5 | SEXP copy_to_new_raw(SEXP, unsigned long, unsigned long); 6 | 7 | void pack_sexp(cw_pack_context *, SEXP); 8 | 9 | void pack_singleton(cw_pack_context *, SEXP); 10 | void pack_vector(cw_pack_context*, SEXP); 11 | void pack_named_vector(cw_pack_context*, SEXP, SEXP); 12 | 13 | void pack_logical(cw_pack_context *, int); 14 | void pack_integer(cw_pack_context *, int); 15 | void pack_real(cw_pack_context *, double); 16 | void pack_string(cw_pack_context *, SEXP); 17 | void pack_raw(cw_pack_context *, SEXP); 18 | 19 | static long calls = 0; 20 | 21 | SEXP _pack_opts(SEXP compatible, 22 | SEXP as_is, 23 | SEXP use_dict, 24 | SEXP max_size, 25 | SEXP buf_size, 26 | SEXP package) { 27 | SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(pack_opts))); 28 | pack_opts *opts = (pack_opts*)RAW(optsxp); 29 | opts->buf = R_NilValue; 30 | opts->buf_index = -1; 31 | opts->as_is = asLogical(as_is); 32 | opts->use_dict = asLogical(use_dict); 33 | opts->buf_size = asInteger(buf_size); 34 | double max_sized = asInteger(max_size); 35 | if (isinf(max_sized)) { 36 | opts->max_size = LONG_MAX; 37 | } else { 38 | opts->max_size = max_sized; 39 | } 40 | opts->compatible = asLogical(compatible); 41 | 42 | opts->package = package; 43 | /* also hang onto explicit refs for the SEXP values*/ 44 | SEXP pkg = PROTECT(CONS(package, R_NilValue)); 45 | setAttrib(optsxp, install("refs"), pkg); 46 | UNPROTECT(2); 47 | return optsxp; 48 | } 49 | 50 | 51 | int init_pack_context(cw_pack_context *cxt, SEXP opts) { 52 | ASSERT(TYPEOF(opts) == RAWSXP && LENGTH(opts) == sizeof(pack_opts)); 53 | cxt->opts = (pack_opts *) RAW(opts); 54 | 55 | /* allocate a buffer and protect it. This is unprotected by caller */ 56 | PROTECT_WITH_INDEX(cxt->opts->buf = allocVector(RAWSXP, cxt->opts->buf_size), 57 | &cxt->opts->buf_index); 58 | cw_pack_context_init(cxt, 59 | RAW(cxt->opts->buf), 60 | LENGTH(cxt->opts->buf), 61 | &handle_overflow); 62 | cw_pack_set_compatibility(cxt, cxt->opts->compatible); 63 | return 1; /* return protections */ 64 | } 65 | 66 | 67 | SEXP _pack_msg(SEXP input, SEXP opts) { 68 | calls++; 69 | cw_pack_context cxt; 70 | int protections = init_pack_context(&cxt, opts); 71 | 72 | pack_sexp(&cxt, input); 73 | 74 | if (cxt.return_code != CWP_RC_OK) { 75 | error("%s", decode_return_code(cxt.return_code)); 76 | } 77 | unsigned long newlength = (cxt.current - cxt.start) / sizeof(Rbyte); 78 | SETLENGTH(cxt.opts->buf, newlength); 79 | UNPROTECT(protections); 80 | return cxt.opts->buf; 81 | } 82 | 83 | 84 | int handle_overflow(cw_pack_context *cxt, unsigned long more) { 85 | /* allocate a vector twice as big, copy data into the new vector, 86 | and update context. */ 87 | unsigned long newlen = LENGTH(cxt->opts->buf); 88 | unsigned long req = LENGTH(cxt->opts->buf) + more; 89 | 90 | if (req > cxt->opts->max_size) return CWP_RC_BUFFER_OVERFLOW; 91 | 92 | while (newlen < req) newlen *= 2; 93 | if (newlen > cxt->opts->max_size) { 94 | newlen = cxt->opts->max_size; 95 | } 96 | 97 | LOG("%d / %d bytes used, need %d more, resizing to %d\n", 98 | cxt->current - cxt->start, 99 | LENGTH(cxt->opts->buf), 100 | more, 101 | newlen); 102 | 103 | REPROTECT(cxt->opts->buf = 104 | copy_to_new_raw(cxt->opts->buf, 105 | newlen, 106 | LENGTH(cxt->opts->buf)), 107 | cxt->opts->buf_index); 108 | 109 | // update the context structure to point to the new buf 110 | cxt->current = cxt->current - cxt->start + RAW(cxt->opts->buf); 111 | cxt->end = RAW(cxt->opts->buf) + LENGTH(cxt->opts->buf) * sizeof(Rbyte); 112 | cxt->start = RAW(cxt->opts->buf); 113 | return 0; 114 | } 115 | 116 | 117 | SEXP copy_to_new_raw(SEXP from, unsigned long new_len, unsigned long copy_len) { 118 | assert_type(from, RAWSXP); 119 | SEXP to = PROTECT(allocVector(RAWSXP, new_len)); 120 | memcpy(RAW(to), RAW(from), MIN(new_len, copy_len) * sizeof(Rbyte)); 121 | UNPROTECT(1); 122 | return to; 123 | } 124 | 125 | 126 | int containsString(SEXP cl, const char *ch) { 127 | assert_type(cl, STRSXP); 128 | SEXP cmp = PROTECT(mkChar(ch)); 129 | int found = 0; 130 | for (int i = 0; i < LENGTH(cl); i++) { 131 | if (STRING_ELT(cl, i) == cmp) { 132 | found = 1; break; 133 | } 134 | } 135 | UNPROTECT(1); 136 | return found; 137 | } 138 | 139 | 140 | void pack_sexp(cw_pack_context* cxt, SEXP dat) { 141 | int as_is_sto = cxt->opts->as_is; 142 | int unp = 0; 143 | 144 | /* check for asIs, classes */ 145 | SEXP cl = getAttrib(dat, R_ClassSymbol); 146 | if ((cl) != R_NilValue) { 147 | if (containsString(cl, "AsIs")) { 148 | cxt->opts->as_is = TRUE; 149 | } else { 150 | /* Preprocess (unless in the middle of an AsIs) */ 151 | LOG("Preprocessing a %s of class '%s'\n", 152 | type2char(TYPEOF(cxt->opts->package)), 153 | CHAR(STRING_ELT(cl, 0))); 154 | SEXP call = PROTECT(lang2(install("prepack"), dat)); 155 | dat = PROTECT(eval(call, cxt->opts->package)); 156 | unp += 2; 157 | /* Check if the preprocessor gave us an AsIs, but don't 158 | preprocess again */ 159 | cl = getAttrib(dat, R_ClassSymbol); 160 | if (cl != R_NilValue && containsString(cl, "AsIs")) { 161 | cxt->opts->as_is = TRUE; 162 | LOG("Preprocessor returned an AsIs!"); 163 | } 164 | } 165 | } 166 | 167 | if (isVector(dat)) { 168 | SEXP names = getAttrib(dat, R_NamesSymbol); 169 | if (names != R_NilValue && cxt->opts->use_dict) { 170 | PROTECT(names); unp += 1; 171 | pack_named_vector(cxt, dat, names); 172 | } else if (LENGTH(dat) == 1 && !cxt->opts->as_is) { 173 | pack_singleton(cxt, dat); 174 | } else { 175 | pack_vector(cxt, dat); 176 | } 177 | } else { 178 | switch (TYPEOF(dat)) { 179 | 180 | case NILSXP: 181 | cw_pack_nil(cxt); break; 182 | 183 | case ENVSXP: 184 | { 185 | SEXP args = PROTECT(lang4(install("as.list.environment"), 186 | dat, 187 | ScalarLogical(1), 188 | ScalarLogical(1))); 189 | SEXP list = PROTECT(eval(args, R_BaseEnv)); 190 | unp += 2; 191 | SEXP names = getAttrib(list, R_NamesSymbol); 192 | pack_named_vector(cxt, list, names); 193 | break; 194 | } 195 | 196 | default: 197 | cxt->opts->buf = NULL; 198 | error("can't pack a %s", type2char(TYPEOF(dat))); 199 | } 200 | } 201 | UNPROTECT(unp); 202 | cxt->opts->as_is = as_is_sto; 203 | } 204 | 205 | 206 | void pack_singleton(cw_pack_context *cxt, SEXP dat) { 207 | switch (TYPEOF(dat)) { 208 | case LGLSXP: 209 | pack_logical(cxt, LOGICAL(dat)[0]); 210 | break; 211 | 212 | case INTSXP: 213 | pack_integer(cxt, INTEGER(dat)[0]); 214 | break; 215 | 216 | case REALSXP: 217 | pack_real(cxt, REAL(dat)[0]); 218 | break; 219 | 220 | case STRSXP: 221 | pack_string(cxt, STRING_ELT(dat, 0)); 222 | break; 223 | 224 | case RAWSXP: 225 | pack_raw(cxt, dat); 226 | break; 227 | 228 | case VECSXP: 229 | pack_vector(cxt, dat); 230 | break; 231 | 232 | default: 233 | cxt->opts->buf = NULL; 234 | error("can't pack a singleton %s", type2char(TYPEOF(dat))); 235 | } 236 | } 237 | 238 | // generic for-loop macro 239 | #define PACK_VECTOR(CXT, X, ACCESSOR, STORE) { \ 240 | int len = LENGTH(X); \ 241 | cw_pack_array_size(cxt, len); \ 242 | for (int i = 0; i < len; i++) { \ 243 | STORE(CXT, ACCESSOR(X, i)); \ 244 | } \ 245 | } 246 | 247 | void pack_vector(cw_pack_context *cxt, SEXP x) { 248 | ASSERT(isVector(x)); 249 | 250 | switch(TYPEOF(x)) { 251 | 252 | case RAWSXP: 253 | pack_raw(cxt, x); 254 | break; 255 | 256 | case LGLSXP: 257 | PACK_VECTOR(cxt, x, LOGICAL_ELT, pack_logical); 258 | break; 259 | 260 | case INTSXP: 261 | PACK_VECTOR(cxt, x, INTEGER_ELT, pack_integer); 262 | break; 263 | 264 | case REALSXP: 265 | PACK_VECTOR(cxt, x, REAL_ELT, pack_real); 266 | break; 267 | 268 | case VECSXP: 269 | PACK_VECTOR(cxt, x, VECTOR_ELT, pack_sexp); 270 | break; 271 | 272 | case STRSXP: 273 | PACK_VECTOR(cxt, x, STRING_ELT, pack_string); 274 | break; 275 | 276 | default: 277 | cxt->opts->buf = NULL; 278 | error("Don't know how to pack a %s vector", type2char(TYPEOF(x))); 279 | } 280 | } 281 | 282 | // generic for-loop macro 283 | #define PACK_NAMED_VECTOR(CXT, X, NAMES, ACCESSOR, STORE) { \ 284 | int len = LENGTH(X); \ 285 | cw_pack_map_size(cxt, len); \ 286 | for (int i = 0; i < len; i++) { \ 287 | pack_string(CXT, STRING_ELT(NAMES, i)); \ 288 | STORE(CXT, ACCESSOR(X, i)); \ 289 | } \ 290 | } 291 | 292 | void pack_named_vector(cw_pack_context *cxt, SEXP x, SEXP names) { 293 | ASSERT(isVector(x)); 294 | 295 | switch(TYPEOF(x)) { 296 | 297 | case RAWSXP: 298 | WARN_ONCE("Names discarded from raw object"); 299 | pack_raw(cxt, x); 300 | break; 301 | 302 | case LGLSXP: 303 | PACK_NAMED_VECTOR(cxt, x, names, LOGICAL_ELT, pack_logical); 304 | break; 305 | 306 | case INTSXP: 307 | PACK_NAMED_VECTOR(cxt, x, names, INTEGER_ELT, pack_integer); 308 | break; 309 | 310 | case REALSXP: 311 | PACK_NAMED_VECTOR(cxt, x, names, REAL_ELT, pack_real); 312 | break; 313 | 314 | case VECSXP: 315 | PACK_NAMED_VECTOR(cxt, x, names, VECTOR_ELT, pack_sexp); 316 | break; 317 | 318 | case STRSXP: 319 | PACK_NAMED_VECTOR(cxt, x, names, STRING_ELT, pack_string); 320 | break; 321 | 322 | default: 323 | cxt->opts->buf = NULL; 324 | error("Don't know how to pack a %s vector", type2char(TYPEOF(x))); 325 | } 326 | } 327 | 328 | 329 | void pack_logical(cw_pack_context *cxt, int x) { 330 | if (x == NA_LOGICAL) { 331 | cw_pack_nil(cxt); 332 | } else if (x) { 333 | cw_pack_true(cxt); 334 | } else { 335 | cw_pack_false(cxt); 336 | } 337 | } 338 | 339 | void pack_integer(cw_pack_context *cxt, int x) { 340 | if (x == NA_INTEGER) { 341 | cw_pack_nil(cxt); 342 | } else { 343 | cw_pack_signed(cxt, x); 344 | } 345 | } 346 | 347 | void pack_real(cw_pack_context *cxt, double x) { 348 | /* save bytes by packing integer if possible. */ 349 | if (ISNA(x)) { 350 | cw_pack_nil(cxt); 351 | } else if (ceil(x) == x) { 352 | if (x >= 0 && x <= UINT32_MAX) { 353 | cw_pack_unsigned(cxt, x); 354 | } else if (x < 0 && x >= INT32_MIN) { 355 | cw_pack_signed(cxt, x); 356 | } else { 357 | /* NaN's end up here. */ 358 | cw_pack_real(cxt, x); 359 | } 360 | } else { 361 | cw_pack_real(cxt, x); 362 | } 363 | } 364 | 365 | void pack_string(cw_pack_context *cxt, SEXP x) { 366 | assert_type(x, CHARSXP); 367 | if (x == NA_STRING) { 368 | cw_pack_nil(cxt); 369 | } else { 370 | if (getCharCE(x) != CE_UTF8) { 371 | /* reEnc allocates temp memory */ 372 | void *vmax = vmaxget(); 373 | const char *newbuf = reEnc(CHAR(x), getCharCE(x), CE_UTF8, 1); 374 | x = PROTECT(mkCharCE(newbuf, CE_UTF8)); 375 | int len = R_nchar(x, Bytes, 0, 0, ""); 376 | cw_pack_str(cxt, CHAR(x), len); 377 | UNPROTECT(1); 378 | /* free temp memory */ 379 | vmaxset(vmax); 380 | } else { 381 | int len = R_nchar(x, Bytes, 0, 0, ""); 382 | cw_pack_str(cxt, CHAR(x), len); 383 | } 384 | } 385 | } 386 | 387 | void pack_raw(cw_pack_context *cxt, SEXP x) { 388 | assert_type(x, RAWSXP); 389 | cw_pack_bin(cxt, RAW(x), LENGTH(x)); 390 | } 391 | -------------------------------------------------------------------------------- /src/encode.h: -------------------------------------------------------------------------------- 1 | #ifndef ENCODE_H 2 | #define ENCODE_H 3 | 4 | #include "vadr.h" 5 | #include "cwpack.h" 6 | 7 | SEXP _pack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP); 8 | SEXP _pack_msg(SEXP, SEXP); 9 | 10 | void pack_sexp(cw_pack_context *, SEXP); 11 | int init_pack_context(cw_pack_context *, SEXP); 12 | 13 | #endif 14 | 15 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | #include 2 | #include 3 | #include // for NULL 4 | #include 5 | #include "encode.h" 6 | #include "decode.h" 7 | 8 | /* FIXME: 9 | Check these declarations against the C/Fortran source code. 10 | */ 11 | 12 | static const R_CallMethodDef CallEntries[] = { 13 | {"_pack_msg", (DL_FUNC) &_pack_msg, 2}, 14 | {"_pack_opts", (DL_FUNC) &_pack_opts, 6}, 15 | {"_unpack_msg", (DL_FUNC) &_unpack_msg, 2}, 16 | {"_unpack_msg_partial", (DL_FUNC) &_unpack_msg_partial, 4}, 17 | {"_unpack_opts", (DL_FUNC) &_unpack_opts, 7}, 18 | {NULL, NULL, 0} 19 | }; 20 | 21 | void R_init_msgpack(DllInfo *dll) 22 | { 23 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 24 | R_useDynamicSymbols(dll, FALSE); 25 | } 26 | -------------------------------------------------------------------------------- /src/opts.h: -------------------------------------------------------------------------------- 1 | #ifndef OPTS_H 2 | #define OPTS_H 3 | 4 | #include 5 | #include 6 | 7 | /* To hand options down from R code into the callbacks, I attach this 8 | options structure to the CWpack state structures. 9 | 10 | These structures exist at R level as the contents of some RAWSXP. 11 | This seems safe enough. But part of these structures are also 12 | pointers to SEXPs. I cover for this by also holding the SEXP 13 | pointers as an attrubute. But this feels tacky. 14 | */ 15 | 16 | typedef struct unpack_opts { 17 | SEXP dict; 18 | int use_df; 19 | int simplify; 20 | SEXP package; 21 | unsigned int max_depth; 22 | unsigned long max_pending; 23 | 24 | /* state variables used in unpack */ 25 | SEXP buf; 26 | PROTECT_INDEX buf_index; 27 | unsigned int depth; 28 | unsigned long pending; 29 | SEXP underflow_handler; 30 | 31 | } unpack_opts; 32 | 33 | 34 | typedef struct pack_opts { 35 | int as_is; 36 | int compatible; 37 | int use_dict; 38 | long max_size; 39 | long buf_size; 40 | 41 | /* state used by pack / write */ 42 | SEXP buf; 43 | PROTECT_INDEX buf_index; 44 | SEXP package; 45 | 46 | /* state used by write */ 47 | SEXP conn; 48 | 49 | } pack_opts; 50 | 51 | #endif 52 | -------------------------------------------------------------------------------- /src/utf8.c: -------------------------------------------------------------------------------- 1 | #include "utf8.h" 2 | #include "vadr.h" 3 | 4 | /* Copyright (c) 2008-2009 Bjoern Hoehrmann 5 | See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */ 6 | 7 | #define UTF8_ACCEPT 0 8 | #define UTF8_REJECT 1 9 | 10 | static const uint8_t utf8d[] = { 11 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 12 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 13 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 14 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 15 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 16 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 17 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 18 | 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef 19 | 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff 20 | 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 21 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 22 | 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 23 | 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 24 | 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 25 | }; 26 | 27 | uint32_t 28 | decode_utf8(uint32_t* state, uint32_t* codep, uint32_t byte) { 29 | uint32_t type = utf8d[byte]; 30 | 31 | *codep = (*state != UTF8_ACCEPT) ? 32 | (byte & 0x3fu) | (*codep << 6) : 33 | (0xff >> type) & (byte); 34 | 35 | *state = utf8d[256 + *state*16 + type]; 36 | return *state; 37 | } 38 | 39 | /* LICENSE for above code: */ 40 | 41 | /* Permission is hereby granted, free of charge, to any person 42 | obtaining a copy of this software and associated documentation 43 | files (the "Software"), to deal in the Software without 44 | restriction, including without limitation the rights to use, copy, 45 | modify, merge, publish, distribute, sublicense, and/or sell copies 46 | of the Software, and to permit persons to whom the Software is 47 | furnished to do so, subject to the following conditions: */ 48 | 49 | /* The above copyright notice and this permission notice shall be 50 | included in all copies or substantial portions of the Software. */ 51 | 52 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 53 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 54 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 55 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 56 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 57 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 58 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 59 | SOFTWARE. */ 60 | 61 | 62 | int verify_utf8(const char *x, uint32_t len) { 63 | uint32_t codepoint = 0; 64 | uint32_t state = 0; 65 | 66 | for (uint32_t i = 0; i < len; i++) { 67 | decode_utf8(&state, &codepoint, (unsigned char) x[i]); 68 | } 69 | 70 | return state == UTF8_ACCEPT; 71 | } 72 | -------------------------------------------------------------------------------- /src/utf8.h: -------------------------------------------------------------------------------- 1 | #ifndef UTF8_H 2 | #define UTF8_H 3 | 4 | #include 5 | 6 | int verify_utf8(const char *, uint32_t len); 7 | 8 | #endif 9 | -------------------------------------------------------------------------------- /src/vadr.c: -------------------------------------------------------------------------------- 1 | #include "vadr.h" 2 | #include "cwpack.h" 3 | #include 4 | 5 | void assert_type5(SEXP x, SEXPTYPE type, const char *where, 6 | const char *file, int line) { 7 | if (TYPEOF(x) != type) { 8 | error("Expected %s in %s, got %s (%s:%d)", 9 | type2char(type), where, type2char(TYPEOF(x)), file, line); 10 | } 11 | } 12 | 13 | const char *decode_return_code(int x) { 14 | switch(x) { 15 | case CWP_RC_OK: return "ok"; 16 | case CWP_RC_END_OF_INPUT: return "end of input"; 17 | case CWP_RC_BUFFER_OVERFLOW: return "buffer overflow"; 18 | case CWP_RC_BUFFER_UNDERFLOW: return "buffer underflow"; 19 | case CWP_RC_MALFORMED_INPUT: return "malformed input"; 20 | case CWP_RC_WRONG_BYTE_ORDER: return "wrong byte order"; 21 | case CWP_RC_ERROR_IN_HANDLER: return "error in handler"; 22 | case CWP_RC_ILLEGAL_CALL: return "illegal call"; 23 | case CWP_RC_MALLOC_ERROR: return "malloc error"; 24 | case CWP_RC_STOPPED: return "stopped"; 25 | default: return "unknown error"; 26 | } 27 | } 28 | 29 | -------------------------------------------------------------------------------- /src/vadr.h: -------------------------------------------------------------------------------- 1 | #ifndef _VADR_H 2 | #define _VADR_H 3 | 4 | #include 5 | #include 6 | 7 | #define MIN(x,y) ((x) < (y) ? (x) : (y)) 8 | #define MAX(x,y) ((x) > (y) ? (x) : (y)) 9 | #define ASSERT(X) {if (! (X)) { error("%s: expected that (%s) @%s:%d\n", __func__, #X, __FILE__, __LINE__); }} 10 | 11 | #define LOGICAL_ELT(O, I) (LOGICAL(O)[I]) 12 | #define INTEGER_ELT(O, I) (INTEGER(O)[I]) 13 | #define REAL_ELT(O, I) (REAL(O)[I]) 14 | 15 | #define WARN_ONCE(...) { \ 16 | static long last_warned = 0; \ 17 | if (last_warned < calls) { \ 18 | last_warned = calls; \ 19 | warning(__VA_ARGS__); \ 20 | } \ 21 | } 22 | 23 | #undef DEBUG 24 | 25 | #ifdef DEBUG 26 | #define LOG(FMT, ...) Rprintf("%s: " FMT " @%s:%d\n", \ 27 | __func__, ##__VA_ARGS__, __FILE__, __LINE__) 28 | #else 29 | #define LOG(...) NULL 30 | #endif 31 | 32 | #define assert_type(x, type) assert_type3(x, type, __func__) 33 | #define assert_type3(x, type, where) assert_type5(x, type, where, __FILE__, __LINE__) 34 | void assert_type5(SEXP x, SEXPTYPE type, const char *where, const char *file, int line); 35 | const char *decode_return_code(int); 36 | 37 | #endif 38 | 39 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(msgpack) 3 | 4 | test_check("msgpack") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-static.R: -------------------------------------------------------------------------------- 1 | context("msgpackr static API") 2 | 3 | `%is%` <- expect_equal 4 | 5 | 6 | roundtrip <- function(start) { 7 | bin <- packMsg(start) 8 | end <- unpackMsg(bin) 9 | expect_equal(start, end) 10 | bin 11 | } 12 | 13 | 14 | pack_rt <- function (start, cmp) { 15 | bin <- packMsg(start) 16 | expect_equal(bin, cmp) 17 | end <- unpackMsg(bin) 18 | expect_equivalent(start, end) 19 | } 20 | 21 | test_that("pack singletons", { 22 | #null 23 | pack_rt(NA, as.raw(0xc0)) 24 | packMsg(NULL) %is% as.raw(0xc0) 25 | 26 | #logical 27 | pack_rt(FALSE, as.raw(0xc2)) 28 | pack_rt(TRUE, as.raw(0xc3)) 29 | 30 | #small ints 31 | pack_rt(12L, as.raw(0x0c)) 32 | pack_rt(-4L, as.raw(0xfc)) 33 | 34 | #32 bit ints 35 | pack_rt(2147483647L, as.raw(c(0xCE, 0x7f, 0xff, 0xff, 0xff))) 36 | 37 | # cwpack will use 32 bit float if precision is preserved. For example, Inf: 38 | pack_rt(Inf, as.raw(c(0xca, 0x7f, 0x80, 0x00, 0x00))) 39 | 40 | # and a float64: 41 | x <- 1.7976931348623157e308 # .Machine$double.xmax 42 | # 0 11111111110 1111111111111111111111111111111111111111111111111111 43 | pack_rt(x, as.raw(c(0xCB, 0x7F, 0xEF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF))) 44 | 45 | # character 46 | pack_rt("hello", 47 | as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f))) 48 | 49 | # raw bytes 50 | pack_rt(as.raw(0xab), 51 | as.raw(c(0xc4, 0x01, 0xab))) 52 | 53 | #NAs and NULL all collapse to nil 54 | packMsg(NA_character_) %is% as.raw(0xc0) 55 | packMsg(NA_real_) %is% as.raw(0xc0) 56 | packMsg(NA_integer_) %is% as.raw(0xc0) 57 | packMsg(NULL) %is% as.raw(0xc0) 58 | }) 59 | 60 | test_that("unpack large ints to float", { 61 | bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01)) 62 | negint = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff)) 63 | not_na = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x80, 0x00, 0x00, 0x00)) 64 | bigfloat = 9007199254740993 #read as float loses precision 65 | negfloat = -2147483649 66 | not_na_float = -2147483648 67 | expect_warning(unpackMsg(bigint) %is% bigfloat, "precision") 68 | unpackMsg(negint) %is% negfloat 69 | unpackMsg(not_na) %is% not_na_float 70 | # 71 | # and all in a vector... 72 | expect_warning( 73 | unpackMsg(c(as.raw(0x93), bigint, negint, not_na)) %is% 74 | c(bigfloat, negfloat, not_na_float), 75 | "precision") 76 | }) 77 | 78 | test_that("nice errors from unpack", { 79 | expect_error(unpackMsg(as.raw(c(0x92, 0xc0))), 80 | "end of input") 81 | }) 82 | 83 | 84 | test_that("Pack raws", { 85 | roundtrip(as.raw(c(0xab, 0xbc, 0x00))) 86 | }) 87 | 88 | 89 | test_that("Pack lists", { 90 | roundtrip(list(1, "what")) 91 | roundtrip(list("a", list("b", 4))) 92 | }) 93 | 94 | 95 | test_that("unpack simplified vectors", { 96 | roundtrip(c(FALSE, NA)) #bool 97 | roundtrip(list(FALSE, 3L)) #list; don't coerce logicals (that aren't all NA) 98 | roundtrip(c(1L, NA)) #integer 99 | roundtrip(c(1L, NA, 1.0)) #real 100 | roundtrip(c("hello", NA)) #string 101 | roundtrip(list(1L, 2L, "hi")) #list, don't coerce to char 102 | roundtrip(list(c(1,2), c("hi", "bye"))) #list 103 | }) 104 | 105 | 106 | test_that("unpack simplified vectors starting with NA", { 107 | roundtrip(c(NA, FALSE, TRUE)) 108 | roundtrip(c(NA, 1L, 2L)) 109 | roundtrip(c(NA, exp(0), pi)) 110 | roundtrip(c(NA, "hi", "bye")) 111 | }) 112 | 113 | 114 | test_that("pack zero length vectors", { 115 | roundtrip(logical(0)) 116 | }) 117 | 118 | 119 | test_that("packing overflow handler works", { 120 | expect_true(length(packMsg(1:10000)) > 1000) 121 | }) 122 | 123 | test_that("extension mechanism", { 124 | obj <- c(1, 2, 3) 125 | class(obj) <- c("reverse") 126 | assign(envir = globalenv(), "prepack.reverse", function(x) rev(x)) 127 | unpackMsg(packMsg(obj)) %is% c(3, 2, 1) 128 | }) 129 | 130 | 131 | test_that("recursive use of msgpack works", { 132 | assign(envir = globalenv(), "prepack.blob", function(x) packMsg(unclass(x))) 133 | obj <- "hello" 134 | class(obj) <- "blob" 135 | typeof(unpackMsg(packMsg(obj))) %is% "raw" 136 | }) 137 | 138 | 139 | test_that("Max buffer size", { 140 | packMsg(300:400, max_size=306, buf_size=10) 141 | expect_error(packMsg(300:401, max_size=306, buf_size = 10), "overflow") 142 | }) 143 | 144 | 145 | test_that("NA and NaN are distinct doubles,", { 146 | roundtrip(c(NA, NaN)) 147 | }) 148 | 149 | 150 | test_that("compatibility mode", { 151 | packMsg(as.raw(c(1, 2, 3))) %is% as.raw(c(0xc4, 0x03, 0x01, 0x02, 0x03)) 152 | packMsg(as.raw(c(1, 2, 3)), compatible=TRUE) %is% as.raw(c(0xa3, 0x01, 0x02, 0x03)) 153 | }) 154 | 155 | 156 | test_that("UnpackMsg: detect bad strings, warn, and return raw", { 157 | 158 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x00, 0x62, 0x63))), 159 | as.raw(c(0x00, 0x62, 0x63))), 160 | "nul") 161 | 162 | #and check for malformed UTF8 163 | #3 byte sequence with last continuation byte missing 164 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa2, 0x30, 0x80))), 165 | as.raw(c(0x30, 0x80))), 166 | "UTF") 167 | 168 | #2 bytes of 3 byte sequence followed by space 169 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x30, 0x80, 0x20))), 170 | as.raw(c(0x30, 0x80, 0x20))), 171 | "UTF") 172 | 173 | # illegal byte 174 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa1, 0xff))), 175 | as.raw(c(0xff))), 176 | "UTF") 177 | #also for malformed UTF8? 178 | 179 | }) 180 | 181 | test_that("always emit strings in UTF8,", { 182 | x <- "fa\xE7ile" 183 | Encoding(x) <- "latin1" 184 | packMsg(x) %is% as.raw(c(0xa7, 0x66, 0x61, 0xc3, 0xa7, 0x69, 0x6c, 0x65)) 185 | }) 186 | 187 | test_that("use ints for integral floats under 32 bits", { 188 | packMsg(1) %is% packMsg(1L) 189 | length(packMsg(2^32)) %is% 5 190 | length(packMsg(2^32+1)) %is% 9 191 | length(packMsg(-2^31)) %is% 5 192 | length(packMsg(-2^31-1)) %is% 9 193 | }) 194 | 195 | 196 | test_that("as_is uses arrays even for singletons", { 197 | length(packMsg(1)) %is% 1 198 | length(packMsg(1, as_is=TRUE)) %is% 2 199 | length(packMsg(list(1, 2, 3))) %is% 4 200 | length(packMsg(list(1, 2, 3), as_is = TRUE)) %is% 7 201 | unpackMsg(packMsg(list(1, 2, 3))) %is% c(1, 2, 3) 202 | unpackMsg(packMsg(list(1, 2, 3), as_is = TRUE)) %is% list(1, 2, 3) 203 | length(packMsg(I(1), as_is=FALSE)) %is% 2 204 | }) 205 | 206 | 207 | test_that( "single row data frames also pack with asIs", { 208 | expect_true( length(packMsg(data.frame( a=1, b=2))) 209 | > length(packMsg(list(a=1, b=2)))) 210 | }) 211 | 212 | 213 | test_that("pack named vectors into dicts", { 214 | unpackMsg(packMsg(list(a=1, b=NULL))) %is% c(a=1, b=NA) 215 | unpackMsg(packMsg(list(a=1, b=NULL), use_dict=FALSE)) %is% c(1, NA) 216 | }) 217 | 218 | 219 | test_that("Unpack dicts into envs", { 220 | unpackMsg(packMsg(list2env(list(a=1, b=2)))) %is% c(a=1, b=2) 221 | x <- new.env() 222 | e <- unpackMsg(packMsg(list(a = 1, b = NA)), parent = x) 223 | typeof(e) %is% "environment" 224 | as.list(e) %is% list(a = 1, b = NA) 225 | parent.env(e) %is% x 226 | unpackMsg(packMsg(emptyenv()), parent=emptyenv()) 227 | }) 228 | 229 | 230 | test_that("non-string dict keys", { 231 | d = as.raw(c(0x82, 0xa1, 0x61, 0x01, 0x02, 0x02)) 232 | expect_warning(unpackMsg(d) %is% c(a=1, `2`=2), "string") 233 | d2 = as.raw(c(as.raw(c(0x82, 0xa1, 0x61, 0x92, 0x01, 0x04, 0x92, 0x02, 0x04, 0x00)))) 234 | expect_warning(unpackMsg(d2) %is% list(a=c(1, 4), `c(2, 4)` = 0), "string") 235 | }) 236 | 237 | 238 | test_that("pack envs into sorted dicts", { 239 | e <- list2env(list(c=3, b=1, d=4, a=2)) 240 | unpackMsg(packMsg(e)) %is% c(a=2, b=1, c=3, d=4) 241 | }) 242 | 243 | 244 | 245 | test_that("Unpack dicts into envs", { 246 | x <- unpackMsg(packMsg(c(a=2, b=1, c=3, d=4)), parent=environment()) 247 | expect_equal(as.list.environment(x, sorted = TRUE), 248 | list(a=2, b=1, c=3, d=4)) 249 | }) 250 | 251 | 252 | test_that("warn bad var names and discard", { 253 | b <- packMsg(c(a=1, 3, b=4)) 254 | expect_warning(e <- unpackMsg(b, parent=environment()), "empty") 255 | as.list.environment(e, all.names=TRUE, sorted=TRUE) %is% list(a=1, b=4) 256 | }) 257 | 258 | 259 | test_that("NA names, dots names...", { 260 | x <- list(2, "four", rep(1,6), c(), list()) 261 | n <- c("two", NA, "...", "..5", "") 262 | names(x) <- n 263 | names(unpackMsg(packMsg(x))) %is% n 264 | expect_warning(e <- unpackMsg(packMsg(x), parent=environment())) 265 | ls(e, all.names=TRUE) %is% c("NA", "two") 266 | }) 267 | 268 | 269 | test_that("detect data frames", { 270 | expect_false(is.data.frame(unpackMsg(packMsg(list(a=1, b=2))))) 271 | expect_true(is.data.frame(unpackMsg(packMsg(list(a=1, b=2), as_is=TRUE)))) 272 | expect_true(is.data.frame(unpackMsg(packMsg(list(a=numeric(0)))))) 273 | expect_false(is.data.frame(unpackMsg(packMsg(list(a=c(1, 2, 3), b=c(2, 3)))))) 274 | }) 275 | 276 | 277 | test_that("Raw with names", { 278 | expect_warning(packMsg(structure(as.raw(c(1,2)), names=c("a", "b")))) 279 | }) 280 | 281 | 282 | test_that("unpackMsg refusing to simplify", { 283 | unpackMsg(packMsg(list(1, 2))) %is% c(1L, 2L) 284 | unpackMsg(packMsg(list(1, 2)), simplify=FALSE) %is% list(1L, 2L) 285 | }) 286 | 287 | 288 | test_that("Homepage example", { 289 | pack_rt(list(compact=TRUE, schema=0), 290 | c(as.raw(c(0x82, 0xa7)), 291 | charToRaw("compact"), 292 | as.raw(c(0xc3, 0xa6)), 293 | charToRaw("schema"), 294 | as.raw(00))) 295 | }) 296 | 297 | 298 | test_that("warnings trigger once per message", { 299 | bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01)) 300 | length(capture_warnings(unpackMsg(c(as.raw(0x92), bigint, bigint)))) %is% 1 301 | }) 302 | 303 | 304 | test_that("limit pending size", { 305 | # the idea is that you may get a message like, 306 | # which if not defended against, will make 307 | # you malloc up all your memory with a small message. So we need to 308 | # limit the number of items we'va allocated, as well. E.G. each 309 | # array we are in the middle of, treat as a promise for at least one 310 | # byte per item in the message. 311 | bad <- packMsg(c(1:5, list(1:10), 6:10)) 312 | unpackMsg(bad, max_size = 16) 313 | expect_error(unpackMsg(bad, max_size=15), "long") 314 | }) 315 | 316 | 317 | test_that("prevent stack overflows", { 318 | # another attack may be to try to overflow the stack with an indefinitely 319 | # nested array, e.g. 0x919191919191919191..... 320 | bad <- packMsg(list(list(list(1), list(list(2, list(3)))))) 321 | expect_error(unpackMsg(bad, max_depth=4), "nest") 322 | unpackMsg(bad, max_depth=5) 323 | }) 324 | 325 | 326 | test_that("extension types unpacked as raw with a class attr", { 327 | x <- as.raw(c(0xd4, 0xfe, 0xdd)) 328 | expect_warning(d <- unpackMsg(x), "raw") 329 | expect_equal(d, structure(as.raw(0xdd), class="ext-2")) 330 | 331 | x <- as.raw(c(0xd4, 0x7f, 0xdd)) 332 | expect_warning(d <- unpackMsg(x), "raw") 333 | expect_equal(d, structure(as.raw(0xdd), class="ext127")) 334 | }) 335 | -------------------------------------------------------------------------------- /tests/testthat/test-streams.R: -------------------------------------------------------------------------------- 1 | context("unpack from streams") 2 | #most of these tests to be implemented using a fifo connection. 3 | 4 | `%is%` <- expect_equal 5 | 6 | msgs <- as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f, 7 | 0xa3, 0x61, 0x6e, 0x64, 8 | 0xa7, 0x67, 0x6f, 0x6f)) 9 | 10 | two_ends <- function(f, read_size = 2^16, max_buf = NA) { 11 | fl <- file("", open="w+b", blocking=FALSE) 12 | on.exit(close(fl), add=TRUE) 13 | f(fl, msgConnection(fl)) 14 | } 15 | 16 | test_that("consume N messages and return remaining data", { 17 | expect_equal(unpackMsgs(as.raw(1:10)), 18 | list(msgs = as.list(1:10), 19 | remaining = raw(0), 20 | status = "end of input", 21 | bytes_read = 10)) 22 | 23 | expect_equal(unpackMsgs(msgs), 24 | list(msgs = list("hello", "and"), 25 | remaining = as.raw(c(0xa7, 0x67, 0x6f, 0x6f)), 26 | status = "buffer underflow", 27 | bytes_read = 10)) 28 | 29 | expect_equal(unpackMsgs(as.raw(c(0xa7, 0x6f, 0x6f))), 30 | list(msgs = list(), 31 | remaining = as.raw(c(0xa7, 0x6f, 0x6f)), 32 | status = "buffer underflow", 33 | bytes_read = 0)) 34 | 35 | expect_equal(unpackMsgs(as.raw(c(1:10)), 3), 36 | list(msgs = as.list(1:3), 37 | remaining = as.raw(4:10), 38 | status = "ok", 39 | bytes_read = 3)) 40 | }) 41 | 42 | test_that("packMsgs round trip", { 43 | unpackMsgs(packMsgs(c(list(1:10), 1:10)))$msgs %is% 44 | c(list(1:10), as.list(1:10)) 45 | }) 46 | 47 | test_that("Errors raised in parsing are caught", { 48 | unpackMsgs(as.raw(c(0xc1)))$status %is% "malformed input" 49 | }) 50 | 51 | test_that("read from a connection (blocking)", { 52 | 53 | conn <- msgConnection(rawConnection(msgs, open="r")) 54 | readMsg(conn) %is% "hello" 55 | readMsg(conn) %is% "and" 56 | expect_error(readMsg(conn)) 57 | close(conn) 58 | 59 | conn <- msgConnection(rawConnection(msgs, open="r")) 60 | readMsgs(conn) %is% list("hello", "and") 61 | status(conn) %is% "buffer underflow" 62 | close(conn) 63 | }) 64 | 65 | test_that("write to a connection", { 66 | conn <- rawConnection(raw(), open="w") 67 | writeMsg(1:10, conn) 68 | writeMsgs(1:10, conn) 69 | expect_equal( 70 | unpackMsgs(rawConnectionValue(conn))$msgs, 71 | c(list(1:10), as.list(1:10))) 72 | close(conn) 73 | }) 74 | 75 | test_that("write to and read from connections", { 76 | con <- msgConnection(rawBuffer(raw(0))) 77 | writeMsgs(1:10, con) 78 | readMsgs(con) %is% as.list(1:10) 79 | close(con) 80 | two_ends(function(A, B) { 81 | writeMsgs(1:10, A) 82 | flush(A) 83 | readMsgs(B) 84 | }) %is% as.list(1:10) 85 | }) 86 | 87 | test_that("read non-blocking with complete message", { 88 | test <- packMsgs(list("hello", "and")) 89 | conn <- msgConnection( 90 | rawConnection(packMsgs(list("hello", "and", "world")), open="r"), 91 | read_size = length(test)) 92 | readMsgs(conn) %is% list("hello", "and", "world") 93 | close(conn) 94 | }) 95 | 96 | test_that("read non-blocking with incomplete message", { 97 | two_ends(function(endA, endB) { 98 | writeMsg(1, endA) 99 | partial <- packMsgs(list("here is a partial message", 2)) 100 | writeBin(partial[1:10], endA) 101 | flush(endA) 102 | readMsgs(endB) %is% list(1) 103 | status(endB) %is% "buffer underflow" 104 | partial(endB) %is% partial[1:10] 105 | writeBin(partial[11:length(partial)], endA) 106 | flush(endA) 107 | Sys.sleep(0.1) 108 | readMsgs(endB) %is% list("here is a partial message", 2) 109 | }) 110 | }) 111 | 112 | test_that("underflow at incomplete message (1-process)", { 113 | buf <- rawBuffer() 114 | partial <- packMsgs(list(1, "here is a partial message", 2)) 115 | writeRaw(partial[1:10], buf) 116 | con <- msgConnection(buf) 117 | readMsgs(con) %is% list(1) 118 | writeRaw(partial[11:length(partial)], buf) 119 | readMsgs(con) %is% list("here is a partial message", 2) 120 | readMsgs(con) %is% list() 121 | close(con) 122 | }) 123 | 124 | test_that("read non-blocking with array breaking over chunks", { 125 | ##this should at least trigger the underflow handler, no? 126 | partial <- packMsgs(list("hello", 1:2)) 127 | full <- packMsgs(list("hello", 1:10)) 128 | conn <- rawConnection(full, open="r") 129 | conn <- msgConnection(conn, read_size = length(partial)) 130 | readMsgs(conn) %is% list("hello", 1:10) 131 | close(conn) 132 | }) 133 | 134 | test_that("rawBuffer", { 135 | x <- rawBuffer(as.raw(1:5)) 136 | tryCatch({ 137 | readRaw(x, 3) %is% as.raw(1:3) 138 | writeRaw(as.raw(6:10), x) 139 | readRaw(x, 5) %is% as.raw(4:8) 140 | readRaw(x, 10) %is% as.raw(9:10) 141 | readRaw(x, 10) %is% raw(0) 142 | writeRaw(as.raw(11:25), x) 143 | readRaw(x, 100) %is% as.raw(11:25) 144 | writeRaw(as.raw(26:27), x) 145 | readRaw(x, 2) %is% as.raw(26:27) 146 | readRaw(x, 0) %is% raw(0) 147 | readRaw(x, 10) %is% raw(0) 148 | }, finally = { 149 | close(x) 150 | }) 151 | }) 152 | 153 | test_that("read non-blocking and underflow handling when variously interrupted", 154 | { 155 | orig <- list("hello", c("hello", "world"), list("hello", "world", c(1, 2, 3))) 156 | packed <- packMsgs(orig) 157 | 158 | cut <- 28 159 | for (cut in 1:(length(packed) - 1)) { 160 | firstChunk <- packed[1:cut] 161 | secondChunk <- packed[ (cut+1) : (length(packed)) ] 162 | con <- msgConnection(rawBuffer(firstChunk)) 163 | expect_error(read1 <- readMsgs(con), NA, info = paste0("at cut ", cut)) 164 | writeRaw(secondChunk, con) 165 | expect_error(read2 <- readMsgs(con), NA, info = paste0("at cut ", cut)) 166 | expect_equal(c(read1, read2), orig, info = paste0("at cut ", cut)) 167 | close(con) 168 | } 169 | }) 170 | 171 | test_that("assembling an array > read_size", { 172 | mess <- (1:25) + rep(0, 1000) 173 | c <- msgConnection(rawBuffer(raw(0)), read_size=32) 174 | writeMsg(mess, c) 175 | readMsgs(c) %is% list(mess) 176 | close(c) 177 | }) 178 | 179 | test_that("resume from interrupt when message >> read_size", 180 | { 181 | mess <- (1:25) + rep(0, 1000) 182 | con <- msgConnection(rawBuffer(raw(0)), read_size=32) 183 | bin <- packMsg(mess) 184 | writeRaw(bin[1:100], con) 185 | readMsgs(con) %is% list() 186 | writeRaw(bin[101:200], con) 187 | readMsgs(con) %is% list() 188 | writeRaw(bin[201:length(bin)], con) 189 | readMsgs(con) %is% list(mess) 190 | close(con) 191 | }) 192 | 193 | test_that("Assembling a string >> read size", { 194 | ## strings are individual messages that stretch over potentially many reads. 195 | mess <- paste0(letters[sample(26, 1000, replace=TRUE)], collapse="") 196 | con <- msgConnection(rawBuffer(raw(0)), read_size=32) 197 | writeMsgs(list(mess), con) 198 | readMsgs(con) %is% list(mess) 199 | expect_equal(seek(con, rw="r"), 1003) 200 | close(con) 201 | }) 202 | 203 | test_that("seek method", { 204 | con = msgConnection(rawConnection(msgs, open="r")) 205 | readMsgs(con) 206 | seek(con) %is% 10 207 | close(con) 208 | 209 | con = msgConnection(rawConnection(msgs, open="w")) 210 | writeMsg("hello", con) 211 | seek(con) %is% 6 212 | close(con) 213 | 214 | con = msgConnection(rawBuffer()) 215 | writeMsg("hello", con) 216 | expect_error(seek(con)) 217 | seek(con, rw = "w") %is% 6 218 | seek(con, rw = "r") %is% 0 219 | readMsg(con) 220 | seek(con, rw = "r") %is% 6 221 | close(con) 222 | }) 223 | 224 | `%@%` <- function(x, name) { 225 | attr(x, as.character(substitute(name))) 226 | } 227 | 228 | test_that("large blob to force GC?", { 229 | data <- sample(as.raw(0:255), 0x2000000, TRUE) 230 | con <- msgConnection(rawConnection(raw(0), open="wb")) 231 | packet <- packMsg(data) 232 | writeMsg(data, con) 233 | bytes <- rawConnectionValue(con) 234 | close(con) 235 | con2 <- msgConnection(rawConnection(bytes, open="rb")) 236 | as.read <<- readMsg(con2) 237 | close(con2) 238 | expect_identical(as.read, data) 239 | }) 240 | 241 | test_that("smallish blob under gctorture", { 242 | data <- sample(as.raw(0:255), 0x30000, TRUE) 243 | con <- msgConnection(rawConnection(raw(0), open="wb")) 244 | packet <- packMsg(data) 245 | writeMsg(data, con) 246 | bytes <- rawConnectionValue(con) 247 | close(con) 248 | con2 <- msgConnection(rawConnection(bytes, open="rb")) 249 | as.read <- NULL 250 | local({ 251 | gctorture(TRUE) 252 | on.exit(gctorture(FALSE)) 253 | # getting Error: RAW() can only be applied to a 'raw', not a 'NULL' 254 | # I think perhaps because the buffer gets collected. 255 | # I should hold an external pointer to the buffer, then? 256 | # gctorture triggers an early error "2" so yeah. 257 | # any way to debug GC, though? 258 | as.read <<- readMsg(con2) 259 | }) 260 | close(con2) 261 | expect_identical(as.read, data) 262 | }) 263 | 264 | 265 | # I'd like to have some tests with reading/writing to a separate 266 | # process. I tried with parallel/mcfork, makeForkCluster, but forking 267 | # seems to cause weirdness with nonblocking. (however this was before 268 | # I sorted out partial reads.) Subprocess package seems to not have 269 | # that prob. but the subprocess that is created doesn't have my 270 | # functions loaded. 271 | 272 | print.raw <- function(x, max.print = getOption("max.print"), ...) { 273 | # Display raw vectors as hex dumps. Debugging use, not exported. 274 | con <- pipe("hexdump -C", "wb") 275 | if (length(x) > max.print) { 276 | writeBin(x[1:max.print], con) 277 | close(con) 278 | cat(paste0("[ reached getOption(\"max.print\") -- omitted ", 279 | length(x) - max.print, 280 | " entries ] \n")) 281 | } else { 282 | writeBin(x, con[1:max.print]) 283 | close(con) 284 | } 285 | } 286 | -------------------------------------------------------------------------------- /vignettes/comparison.html.asis: -------------------------------------------------------------------------------- 1 | %\VignetteIndexEntry{Which Data Format is Fastest Data Format?} 2 | %\VignetteEngine{R.rsp::asis} 3 | %\VignetteKeyword{HTML} 4 | %\VignetteKeyword{vignette} 5 | %\VignetteKeyword{package} 6 | --------------------------------------------------------------------------------