├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS
├── R
    ├── conn.R
    ├── decode.R
    └── encode.R
├── README.Rmd
├── README.md
├── gh
    ├── space.svg
    └── time.svg
├── inst
    ├── benchmarking.R
    ├── benchmarks.RData
    └── comparison.Rmd
├── man
    ├── msgConnection.Rd
    ├── packMsg.Rd
    └── unpackMsg.Rd
├── src
    ├── cwpack.c
    ├── cwpack.h
    ├── cwpack_defines.h
    ├── decode.c
    ├── decode.h
    ├── encode.c
    ├── encode.h
    ├── init.c
    ├── opts.h
    ├── utf8.c
    ├── utf8.h
    ├── vadr.c
    └── vadr.h
├── tests
    ├── testthat.R
    └── testthat
    │   ├── test-static.R
    │   └── test-streams.R
└── vignettes
    ├── comparison.html
    └── comparison.html.asis


/.Rbuildignore:
--------------------------------------------------------------------------------
 1 | ^Meta$
 2 | ^doc$
 3 | ^.*\.Rproj$
 4 | ^\.Rproj\.user$
 5 | ^vignettes/comparison_cache$
 6 | ^vignettes/comparison_files$
 7 | ^README.*
 8 | .git
 9 | .travis.yml
10 | gh
11 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | Meta
 2 | .Rproj.user
 3 | .Rhistory
 4 | .RData
 5 | *.so
 6 | *.o
 7 | *.bc
 8 | README.html
 9 | src/symbols.rds
10 | 


--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
 1 | language: r
 2 | cache: packages
 3 | r:
 4 |   - oldrel
 5 |   - release
 6 |   - devel
 7 | r_github_packages:
 8 |   - r-lib/covr
 9 | 
10 | after_success:
11 |   - Rscript -e 'covr::codecov()'


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: msgpack
 2 | Title: A Compact, High Speed Data Format
 3 | Version: 1.0.1
 4 | Authors@R: c(person("Peter", "Meilstrup", email="peter.meilstrup@gmail.com", role=c("aut", "cre", "cph")),
 5 |              person("Claes", "Wihlborg", role=c("ctr", "cph")),
 6 |              person("Björn", "Höhrmann", role=c("ctr", "cph")))
 7 | Description: A fast C-based encoder and streaming decoder for the
 8 |              'messagepack' data format. 'Messagepack' is similar in structure
 9 |              to 'JSON' but uses a more compact binary encoding. Based on the
10 |              'CWPack' C library.
11 | Depends:
12 |     R (>= 3.3.0)
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | LazyData: true
16 | Suggests:
17 |     testthat, R.rsp
18 | VignetteBuilder: R.rsp
19 | RoxygenNote: 6.1.0
20 | Roxygen: list(markdown=TRUE)
21 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2017-2018
2 | COPYRIGHT HOLDER: Peter Meilstrup
3 | 
4 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
 1 | # Generated by roxygen2: do not edit by hand
 2 | 
 3 | S3method(close,msgConnection)
 4 | S3method(partial,msgConnection)
 5 | S3method(prepack,data.frame)
 6 | S3method(prepack,default)
 7 | S3method(readMsg,msgConnection)
 8 | S3method(readMsgs,msgConnection)
 9 | S3method(seek,msgConnection)
10 | S3method(status,msgConnection)
11 | S3method(summary,msgConnection)
12 | S3method(writeMsg,connection)
13 | S3method(writeMsg,msgConnection)
14 | S3method(writeMsgs,connection)
15 | S3method(writeMsgs,msgConnection)
16 | export(msgConnection)
17 | export(packMsg)
18 | export(packMsgs)
19 | export(partial)
20 | export(prepack)
21 | export(readMsg)
22 | export(readMsgs)
23 | export(status)
24 | export(unpackMsg)
25 | export(unpackMsgs)
26 | export(writeMsg)
27 | export(writeMsgs)
28 | useDynLib(msgpack, .registration = TRUE)
29 | 


--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
 1 | msgpack 1.0.1
 2 | ==========
 3 | 
 4 | Version 1.0.1
 5 | 
 6 |     * Updated an example which prevented `R CMD check` from passing on Windows.
 7 |     * Fixed a protection issue flagged by `rchk`.
 8 | 
 9 | msgpack 1.0
10 | ==========
11 | 
12 | Initial release.
13 | 


--------------------------------------------------------------------------------
/R/conn.R:
--------------------------------------------------------------------------------
  1 | #' Read and write msgpack formatted messages over R connections.
  2 | #'
  3 | #' A `msgConnection` object decodes msgpack messages from an
  4 | #' underlying R raw connection.
  5 | #'
  6 | #' @param con A [connection] object open in binary mode.
  7 | #' @param max_size The largest partial message to store, in
  8 | #'   bytes. `NA` means do not enforce a limit.
  9 | #' @param read_size How many bytes to read at a time.
 10 | #' @param ... Unpacking options (see [unpackMsg]).
 11 | #' @return `msgConnection()` returns an object of class
 12 | #'   `msgConnection`.
 13 | #'
 14 | #' @examples
 15 | #' out <- rawConnection(raw(0), open="wb")
 16 | #' apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row
 17 | #' length(rawConnectionValue(out))
 18 | #' inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb"))
 19 | #' close(out)
 20 | #' readMsg(inn)
 21 | #' readMsgs(inn, 3)
 22 | #' length(readMsgs(inn))
 23 | #' close(inn)
 24 | #' @export
 25 | msgConnection <- function(con, read_size=2^16, max_size=NA, ...) {
 26 |   partial <- raw(0)
 27 |   status <- "ok"
 28 |   bread <- 0
 29 |   bwrite <- 0
 30 | 
 31 |   reader <- function(desired) {
 32 |     # ignore "desired" and just read non-blockingly.
 33 |     readRaw(con, read_size)
 34 |   }
 35 | 
 36 |   readMsgs <- function(n=NA, read_size = parent.env(environment())$read_size) {
 37 |     if (is.na(n)) n <- .Machine$integer.max
 38 |     msgs_bytes <- unpackMsgs(partial, n, max_size = max_size, reader = reader)
 39 |     partial <<- msgs_bytes$remaining
 40 |     status <<- msgs_bytes$status
 41 |     bread <<- bread + msgs_bytes$bytes_read
 42 |     msgs_bytes$msgs
 43 |   }
 44 | 
 45 |   doClose <- function(...) {
 46 |     close(con, ...)
 47 |   }
 48 | 
 49 |   #msgConnection object is just the orig object with this
 50 |   #environment dangled off it.
 51 |   structure(addClass(con, "msgConnection"), reader = environment())
 52 | }
 53 | 
 54 | #' @export
 55 | summary.msgConnection <- function(object, ...) {
 56 |   s <- NextMethod("summary")
 57 |   c(s, list(status = status(object)))
 58 | }
 59 | 
 60 | 
 61 | #' @rdname msgConnection
 62 | #' @export
 63 | close.msgConnection <- function(con, ...) {
 64 |     attr(con, "reader")$doClose(...)
 65 | }
 66 | 
 67 | catenator <- function(val=c()) {
 68 |   # An in-memory FIFO type object.
 69 |   #tracemem(val)
 70 |   start <- 0
 71 |   end <- length(val)
 72 |   function(x, action="store", ..., opts) {
 73 | 
 74 |     switch(action,
 75 |            store = {
 76 |              lx <- length(x)
 77 |              l <- length(val)
 78 |              if (lx > 0) {
 79 |                #check for overflow
 80 |                if (end + lx > l && start > 0) {
 81 |                  # rotate back to start
 82 |                  if (start > 0 && end != start) {
 83 |                    val[1:(end-start)] <- val[(start+1):end]
 84 |                  }
 85 |                  end <<- end - start
 86 |                  start <<- 0
 87 |                }
 88 |                if (end + lx > l) {
 89 |                  # double array length
 90 |                  length(val) <<- max(end + lx, 2 * l);
 91 |                }
 92 | 
 93 |                #inject new values
 94 |                val[ (end + 1):(end + lx) ] <<- x
 95 |                end <<- end + lx
 96 |              }
 97 |              dbg("lx", lx, "start", start, "end", end, "\n")
 98 |              x
 99 |            },
100 | 
101 |            read = {
102 |              if (end > start) {
103 |                val[(start+1):end]
104 |              } else val[c()]
105 |            },
106 | 
107 |            buf = {
108 |              val
109 |            },
110 | 
111 |            start = start,
112 | 
113 |            length = end - start,
114 | 
115 |            end = end,
116 | 
117 |            contents = {
118 |              list(val, start, end)
119 |            },
120 | 
121 |            reset = {
122 |              val <<- x
123 |              start <<- 0
124 |              end <<- length(x)
125 |            },
126 | 
127 |            drop = {
128 |              if (x <= end - start && x >= 0) {
129 |                start <<- start + x
130 |              } else {
131 |                stop("have ", end - start, ", can't drop ",  x)
132 |              }
133 |            })
134 |   }
135 | }
136 | 
137 | lister <- function(val = list()) {
138 |   n <- length(val)
139 |   function(x, action="store") {
140 |     switch(action,
141 |            store = {
142 |              if (n > length(val))
143 |                length(val) <<- max(1, 2 * length(val))
144 |              n <<- n + 1
145 |              val[[n]] <<- x
146 |            },
147 |            read = {
148 |              length(val) <<- n
149 |              val
150 |            },
151 |            length = {
152 |              n
153 |            },
154 |            clear = {
155 |              n <<- 0
156 |              length(val) <<- 0
157 |            }
158 |            )
159 |   }
160 | }
161 | 
162 | addClass <- function(x, classes) structure(x, class = c(classes, class(x)))
163 | 
164 | #' @return `partial(con)` returns any data that has been read ahead of
165 | #'   the last decoded message.
166 | #' @rdname msgConnection
167 | #' @export
168 | partial <- function(con) UseMethod("partial")
169 | 
170 | #' @rdname msgConnection
171 | #' @export
172 | partial.msgConnection <- function(con) {
173 |   attr(con, "reader")$partial
174 | }
175 | 
176 | #' @rdname msgConnection
177 | #' @export
178 | #' @param n The maximum number of messages to read. A value of NA
179 | #'     means to parse all available messages until end of input.
180 | #' @return `readMsgs(con, n)` returns a list of up to `n` decoded messages.
181 | readMsgs <- function(con, n = NA, ...) {
182 |   UseMethod("readMsgs")
183 | }
184 | 
185 | #' @export
186 | readMsgs.msgConnection <- function(con, n = NA, ...) {
187 |   attr(con, "reader")$readMsgs(n, ...)
188 | }
189 | 
190 | #' @rdname msgConnection
191 | #' @return `status(con)` returns the status of msgpack decoding on the
192 | #'   connection. A value of `"ok"` indicates all requested messages
193 | #'   were read, `"buffer underflow"` for a non-blocking connection
194 | #'   indicates that only part of a message has been received, and
195 | #'   `"end of input"` means the last available message has been read.
196 | #'   Other values indicate errors encountered in decoding, which will
197 | #'   effectively halt reading.
198 | #' @export
199 | status <- function(con) UseMethod("status")
200 | 
201 | #' @rdname msgConnection
202 | #' @export
203 | status.msgConnection <- function(con) {
204 |   attr(con, "reader")$status
205 | }
206 | 
207 | 
208 | #' @rdname msgConnection
209 | #' @return `seek(con)` returns the number of bytes that have been
210 | #'   successfully read or written, depending on the mode of the
211 | #'   connection. (Repositioning is not supported.)
212 | #' @param rw See [seek()].
213 | #' @export
214 | seek.msgConnection <- function(con, rw = summary(con)$mode, ...) {
215 |   rw <- pmatch(rw, c("read", "write"), 0L)
216 |   switch(rw,
217 |          attr(con, "reader")$bread,
218 |          attr(con, "reader")$bwrite,
219 |          )
220 | }
221 | 
222 | #' `readMsg(con)` reads exactly one message from a
223 | #'   msgConnection, or throws an error.
224 | #'
225 | #' @rdname msgConnection
226 | #' @return `readMsg(con)` returns one decoded message.
227 | #' @export
228 | readMsg <- function(con, ...) {
229 |   UseMethod("readMsg", con)
230 | }
231 | 
232 | #' @export
233 | readMsg.msgConnection <- function(con, ...) {
234 |   x <- readMsgs(con, 1, ...)
235 |   if (length(x) < 1) {
236 |     stop(status(con))
237 |   }
238 |   x[[1]]
239 | }
240 | 
241 | #' `writeMsg(x, con)` writes a single message to a msgConnection.
242 | #'
243 | #' @rdname msgConnection
244 | #' @param obj An R object.
245 | #' @export
246 | writeMsg <- function(obj, con, ...) {
247 |   UseMethod("writeMsg", con)
248 | }
249 | 
250 | #' @export
251 | writeMsg.connection <- function(obj, con, ...) {
252 |   writeMsgs(list(obj), con, ...)
253 | }
254 | 
255 | #' @export
256 | writeMsg.msgConnection <- function(obj, con, ...) {
257 |   writeMsgs(list(obj), con, ...)
258 | }
259 | 
260 | #' `writeMsgs(l, conn)` writes a list of
261 | #'   messages to a connection. That is, `writeMsg(1:10, conn)` writes one
262 | #'   message containing an array, while `writeMsgs(1:10, conn)` writes
263 | #'   ten consecutive messages each containing one integer.
264 | #'
265 | #' `writeMsg` will work with any R connection in raw mode, but reading
266 | #' requires a msgConnection object.
267 | #'
268 | #' Because msgpack messages have unpredictable length, the decoder
269 | #' reads ahead in chunks, then finds the boundaries between messages.
270 | #' Therefore when reading over a socket or a fifo it is best to use a
271 | #' nonblocking connection, and it will not work to mix readMsg and
272 | #' readBin on the same connection.
273 | #'
274 | #' If you are reading data from a not completely trusted source you
275 | #' should specify options `max_size` and `max_depth` (see
276 | #' [unpackOpts]). Without it, some deeply nested or cleverly designed
277 | #' messages can cause a stack overflow or out-of-memory error.  With
278 | #' these options set, you will get an R exception instead.
279 | 
280 | #' @rdname msgConnection
281 | #' @param objs A list of R objects.
282 | #' @export
283 | writeMsgs <- function(objs, con, ...) {
284 |   UseMethod("writeMsgs", con)
285 | }
286 | 
287 | #' @export
288 | writeMsgs.connection <- function(objs, con, ...) {
289 |   writeRaw(packMsgs(objs, ...), con)
290 | }
291 | 
292 | #' @export
293 | writeMsgs.msgConnection <- function(objs, con, ...) {
294 |   buf <- packMsgs(objs, ...)
295 |   result <- writeRaw(buf, con)
296 |   attr(con, "reader")$bwrite <- attr(con, "reader")$bwrite + length(buf)
297 |   invisible(result)
298 | }
299 | 
300 | ## To support test harness, we use "readRaw" and "writeRaw"
301 | ## internally, instead of "readBin" which is not an S3 method
302 | readRaw <- function(con, n, ...) {
303 |   UseMethod("readRaw")
304 | }
305 | 
306 | writeRaw <- function(object, con, ...) {
307 |   UseMethod("writeRaw", con)
308 | }
309 | 
310 | readRaw.connection <- function(con, n) {
311 |   # I get errors thrown (sometimes) when reading at the end of
312 |   # a nonblocking fifo.
313 |   tryCatch({
314 |     readBin(con, 'raw', n)
315 |   },
316 |   error = function(e) {
317 |     # warning("Ignoring ", e)
318 |     raw(0)
319 |   })
320 | }
321 | 
322 | writeRaw.connection <- function(object, con, ...) {
323 |   writeBin(object, con, ...)
324 | }
325 | 
326 | 
327 | ## An inefficient double ended byte buffer for test harness purposes
328 | rawBuffer <- function(object = raw(0)) {
329 |     open <- "r"
330 |     bytes <- length(object)
331 |     buf <- rawConnection(object, open = "r")
332 |     object <- NULL
333 | 
334 |     write <- function(object) {
335 |       switch(open,
336 |              "w" = {
337 |                writeBin(object, buf)
338 |              },
339 |              "r" = {
340 |                data <- readBin(buf, 'raw', bytes - seek(buf))
341 |                close(buf)
342 |                buf <<- rawConnection(data,
343 |                                      open = "w")
344 |                open <<- "w"
345 |                writeBin(data, buf)
346 |                write(object)
347 |              }
348 |       )
349 |     }
350 | 
351 |     read <- function(n) {
352 |       switch(open,
353 |              "r" = {
354 |                readBin(buf, 'raw', n)
355 |              },
356 |              "w" = {
357 |                ##convert a write buffer into a read buffer
358 |                val <- rawConnectionValue(buf)
359 |                close(buf)
360 |                buf <<- rawConnection(val, open = "r")
361 |                bytes <<- length(val)
362 |                open <<- "r"
363 |                read(n)
364 |              }
365 |       )
366 |     }
367 | 
368 |     doClose <- function(n) {
369 |       close(buf)
370 |       buf <- NULL
371 |     }
372 | 
373 |     structure(list(environment()), class = "rawBuffer")
374 | }
375 | 
376 | writeRaw.rawBuffer <- function(object, con) {
377 |     con[[1]]$write(object)
378 | }
379 | 
380 | readRaw.rawBuffer <- function(con, n) {
381 |     con[[1]]$read(n)
382 | }
383 | 
384 | close.rawBuffer <- function(con) {
385 |     con[[1]]$doClose()
386 | }
387 | 


--------------------------------------------------------------------------------
/R/decode.R:
--------------------------------------------------------------------------------
  1 | #' Decode msgpack messages.
  2 | #'
  3 | #' `unpackMsg` converts a raw array containing one message in msgpack
  4 | #' format into the corresponding R data structure.
  5 | #'
  6 | #' @param x A [raw()] object, perhaps read from a file or socket.
  7 | #' @param ... Options passed to [unpackOpts].
  8 | #' @return `unpackMsg(x)` returns one decoded message (which might be
  9 | #'   shorter than the input raw), or throws an error.
 10 | #' @useDynLib msgpack, .registration = TRUE
 11 | #' @examples
 12 | #' msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3,
 13 | #'                 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00))
 14 | #' unpackMsg(msg)
 15 | #' @export
 16 | unpackMsg <- function(x, ...) {
 17 |   .Call("_unpack_msg", x, unpackOpts(...))
 18 | }
 19 | 
 20 | dbg <- alist
 21 | #dbg <- cat
 22 | 
 23 | #' `unpackMsgs` extracts a number of msgpack messages from a raw object.
 24 | #'
 25 | #' @param n How many messages to read. An "NA" here means to read as
 26 | #'   much as possible.
 27 | #' @param reader For implementing connections; a function that takes
 28 | #'   no arguments and returns a raw containing more data.
 29 | #' @return `unpackMsgs(r, n)` returns a list `X` with four elements:
 30 | #'   * `X$msgs` is a list of the messages unpacked.
 31 | #'   * `X$remaining` is data remaining to be parsed.
 32 | #'   * `X$status` is a status message, typically "ok", "end of input",
 33 | #'     or "buffer underflow".
 34 | #'   * `x$bytes_read` the number of bytes consumed.
 35 | #'
 36 | #' @examples
 37 | #' x <- packMsgs(list("one", "two", "three"))
 38 | #' unpackMsgs(x, 2)
 39 | #' @rdname unpackMsg
 40 | #' @export
 41 | unpackMsgs <- function(x, n = NA, reader = NULL, ...) {
 42 |   if (is.na(n)) {
 43 |     n <- .Machine$integer.max
 44 |   }
 45 | 
 46 |   bread <- 0                            # bytes read
 47 |   saveMessage <- lister()               # messages read so far
 48 |   saveData <- catenator(x)              # bytes pending
 49 |   offset <- 0                           # position within buffer
 50 | 
 51 |   status <- "ok"
 52 | 
 53 |   readMore <- function(current, desired) {
 54 |     dbg("readMore:", "current =", current, "desired =", desired, "\n")
 55 |     start <- saveData(action="start")
 56 |     saveData(reader(desired))
 57 |     new_start <- saveData(action="start")
 58 |     current <- new_start - start + current
 59 |     result <- c(saveData(action="contents"), current)
 60 |     # cat("after read: "); print(saveData(action="contents"))
 61 |     result
 62 |   }
 63 | 
 64 |   opts = unpackOpts(...,
 65 |                     underflow_handler = if (is.null(reader)) NULL else readMore)
 66 | 
 67 |   # cat("buffer: "); print(saveData(action="contents"))
 68 |   tryCatch(
 69 |     while(saveMessage(action="length") < n) {
 70 |       last_start <- saveData(action="start")
 71 |       result <- .Call("_unpack_msg_partial",
 72 |                       saveData(action="buf"),
 73 |                       offset,
 74 |                       saveData(action="end"),
 75 |                       opts)
 76 |       ## result is a pairlist( message, status, new_offset )
 77 |       # cat("unpack result: "); print(result)
 78 |       status <- result[[1]]
 79 |       if (status == "ok") {
 80 |         # got a good message,
 81 |         saveMessage(result[[2]])
 82 |         message_length <- ((result[[3]] - offset) +
 83 |                            (saveData(action="start") - last_start))
 84 |         bread <- bread + message_length
 85 |         saveData(message_length, action="drop")
 86 |         offset <- result[[3]]
 87 |         # dbg("after 1 msg, buffer: "); print(saveData(action="contents"))
 88 |       } else {
 89 |         stop(status)
 90 |       }
 91 |     },
 92 |     error = function(e) {
 93 |       dbg("Stopping with exception: ", e$message, "in", deparse(e$call), "\n")
 94 |       status <<- e$message
 95 |     }
 96 |   )
 97 |   # dbg("after failure, buffer: "); print(saveData(action="contents"))
 98 | 
 99 |   list(msgs = saveMessage(action="read"),
100 |        remaining = saveData(action="read"),
101 |        status = status,
102 |        bytes_read = bread)
103 | }
104 | 
105 | #' [unpackOpts()] interprets is passed to `...` in [unpackMsgs()],
106 | #' [unpackMsg()], and [msgConnection()]. It is not exported.
107 | #'
108 | #' @param parent When an environment is given, (such as [emptyenv()]),
109 | #'   unpack msgpack dicts into environment objects, with the given
110 | #'   value as parent. This option overrides `use_df=TRUE`. Otherwise,
111 | #'   unpack dicts into named vectors / lists.
112 | #' @param df When `TRUE`, convert msgpack dicts, whose elements are
113 | #'   all arrays having the same length, into [data.frame()]s.
114 | #' @param simplify If `TRUE`, simplify msgpack lists into primitive
115 | #'   vectors.
116 | #' @param max_size The maximum length of message to decode.
117 | #' @param max_depth The maximum degree of nesting to support.
118 | #' @param underflow_handler Used internally.
119 | #'
120 | #' @details
121 | #' The msgpack format does not have typed arrays, so all msgpack
122 | #' arrays are effectively lists from the R perspective. However, if an
123 | #' array containing compatibly typed elements is read, `unpack` will
124 | #' return a logical, integer, real or string vector as
125 | #' appropriate. This behavior is disabled with `simplify=FALSE`.  The
126 | #' coercion used is more conservative than R's coercion: Integer
127 | #' values may be converted to real, but boolean values will not be
128 | #' cast to numeric, nor any types to string. If conversion from a
129 | #' large integer to real loses precision, a warning is printed.
130 | #'
131 | #' Msgpack also does not distinguish between `NA` and `NULL`. All nils
132 | #' will be decoded as NA.
133 | #'
134 | #' Strings are assumed to be UTF-8 encoded. If a msgpack string does
135 | #' not appear to be valid UTF-8, a warning is printed and a raw object
136 | #' is produced instead.
137 | #'
138 | #' Msgpack allows any type to be the key of a dict, but R only
139 | #' supports strings. If a non-string appears as key in a msgpack dict,
140 | #' it will be converted to string with [deparse()].
141 | #'
142 | #' Extension types will be decoded as raw objects with a class like
143 | #' `"ext120"` and a warning.
144 | #'
145 | #' @rdname unpackMsg
146 | unpackOpts <- function(parent = NULL,
147 |                        df = TRUE,
148 |                        simplify = TRUE,
149 |                        max_size = NA,
150 |                        max_depth = NA,
151 |                        underflow_handler = NULL) {
152 |   .Call("_unpack_opts",
153 |         parent,
154 |         df,
155 |         simplify,
156 |         parent.env(environment()),
157 |         max_size,
158 |         max_depth,
159 |         underflow_handler);
160 | }
161 | 
162 | # Come up with a name when a non-string is used as key.
163 | repr <- function(x) {
164 |   paste0(deparse(x, control = c("keepNA")), collapse="")
165 | }
166 | 


--------------------------------------------------------------------------------
/R/encode.R:
--------------------------------------------------------------------------------
 1 | #' Convert R objects to msgpack format.
 2 | #'
 3 | #' @param x An R object, which can be null, a vector, list,
 4 | #'   environment, raw, or any combinations thereof.
 5 | #' @param ... Options passed to [packOpts()]
 6 | #' @return An object of class "raw".
 7 | #' @details
 8 | #' Strings are re-encoded to UTF-8 if necessary. Real numbers
 9 | #' taking integral values may be emitted as integers to save space.
10 | #'
11 | #' Normally an R vector of length 1 will be unboxed, e.g. packMsg(1)
12 | #' will make a msgpack integer, but packMsg(c(1,2)) will make a
13 | #' msgpack list. To prevent this and produce a list of length 1 in the
14 | #' first case, specify `as_is = TRUE`. Objects of class `AsIs` or
15 | #' `data.frame` will always be encoded as-is.
16 | #'
17 | #' A hook for pre-processing R objects before packing is supported, by
18 | #' giving the object an S3 [class] and implementing a method
19 | #' `prepack`. For instance, `prepack.data.frame(x)` simply adds the
20 | #' `"AsIs"` class to `x`.
21 | #'
22 | #' Environment objects are written out with the keys in sorted order,
23 | #' but named vectors are written in the order which the entries
24 | #' appear.
25 | #'
26 | #' Object attributes other than `names` and `class` are ignored.
27 | #'
28 | #' @examples
29 | #' packMsg( list(compact=TRUE, schema=0) )
30 | #' @export
31 | packMsg <- function(x, ...)  {
32 |   .Call("_pack_msg", x, packOpts(...))
33 | }
34 | 
35 | #' @param xs a list of objects to pack.
36 | #' @rdname packMsg
37 | #' @examples
38 | #' x <- packMsgs(list("one", "two", "three"))
39 | #' unpackMsgs(x, 2)
40 | #' @export
41 | packMsgs <- function(xs, ...) {
42 |  opts <- packOpts(...)
43 |  unlist(lapply(xs, function(xx) .Call("_pack_msg", xx, opts)))
44 | }
45 | 
46 | #' [packOpts()] interprets the `...` argument of packMsg and
47 | #' packMsgs. it is not exported.
48 | #'
49 | #' @param compatible If TRUE, emitted bytes conform to version 1.0 of
50 | #'   msgpack encoding. This means that msgpack strings are used for
51 | #'   raw objects.
52 | #' @param as_is If TRUE, singletons (R primitive vectors of length 1
53 | #'   having no names attribute) are encoded as msgpack arrays of
54 | #'   length 1. Otherwise singletons are simplified to msgpack scalars.
55 | #' @param use_dict If TRUE, vectors having a "names" attribute are
56 | #'   encoded as dicts. If false, they are encoded as arrays and the
57 | #'   names are discarded.
58 | #' @param max_size The largest buffer that will be allocated.
59 | #' @param buf_size The initial amount of memory, in bytes, to allocate
60 | #'   for packing each message. This will be dynamically grown if a
61 | #'   larger message is passed, so there is little reason to change
62 | #'   this.
63 | #' @rdname packMsg
64 | packOpts <- function(compatible = FALSE,
65 |                     as_is = FALSE,
66 |                     use_dict = TRUE,
67 |                     max_size = NA,
68 |                     buf_size = 512) {
69 |   .Call("_pack_opts",
70 |         compatible,
71 |         as_is,
72 |         use_dict,
73 |         max_size,
74 |         buf_size,
75 |         parent.env(environment()))
76 | }
77 | 
78 | #' @rdname packMsg
79 | #' @export
80 | prepack <- function(x) UseMethod("prepack")
81 | 
82 | #' @rdname packMsg
83 | #' @export
84 | prepack.default <- function(x) unclass(x)
85 | 
86 | #' @rdname packMsg
87 | #' @export
88 | prepack.data.frame <- function(x) I(unclass(x))
89 | 


--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
 1 | ---
 2 | title: "msgpack for R"
 3 | output: github_document
 4 | author: Peter Meilstrup
 5 | ---
 6 | 
 7 | [![CRAN version badge](http://www.r-pkg.org/badges/version/msgpack)](https://cran.r-project.org/package=msgpack)
 8 | [![Travis build status](http://travis-ci.org/crowding/msgpack-r.svg?branch=master)](https://travis-ci.org/crowding/msgpack-r)
 9 | [![Code coverage](https://codecov.io/gh/crowding/msgpack-r/branch/master/graph/badge.svg)](https://codecov.io/gh/crowding/msgpack-r)
10 | 
11 | 
12 | This is a high speed [msgpack](https://msgpack.org) encoder
13 | and decoder for R, based on the [CWPack](https://github.com/clwi/CWPack) C
14 | implementation.
15 | 
16 | `msgpack` is a binary data format with data structures similar to
17 | `JSON` and a compact binary encoding. It can be a drop-in replacement
18 | for `JSON` in most applications. It is designed to be fast to parse
19 | and compact to transmit and store.
20 | 
21 | ```{R echo=FALSE}
22 | knitr::opts_chunk$set(collapse=TRUE)
23 | ```
24 | 
25 | ## Installation
26 | 
27 | From CRAN:
28 | 
29 | ```{R eval=FALSE}
30 | install.packages("msgpack")
31 | ```
32 | 
33 | From Github:
34 | 
35 | ```{R eval=FALSE}
36 | library(devtools)
37 | install_github("crowding/msgpack-r")
38 | ```
39 | 
40 | ## Usage
41 | 
42 | ```{R}
43 | library(msgpack)
44 | x <- packMsg( list(compact=TRUE, schema=0) )
45 | x
46 | dput(unpackMsg( x ))
47 | ```
48 | 
49 | ### Connections / Streaming
50 | 
51 | Write messages one or several at a time:
52 | 
53 | ```{r}
54 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc
55 | writeMsg("one", conOut)
56 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut)
57 | ```
58 | 
59 | Use a `msgConnection` object to read messages one or several at a time:
60 | 
61 | ```{r}
62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r"))
63 | dput(readMsgs(conIn, 2))
64 | dput(readMsg(conIn))
65 | dput(readMsgs(conIn))
66 | ```
67 | 
68 | ### Performance
69 | 
70 | Msgpack is fast and compact. See the [benchmarking vignette](inst/doc/comparison.html).
71 | 
72 | ![Plot of time taken to transmit dataset, vs size of dataset, for each encoder under four conditions.](gh/space.svg)
73 | 
74 | ![Comparison of space used by each encoder to encode a test dataset.](gh/time.svg)
75 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | msgpack for R
 2 | ================
 3 | Peter Meilstrup
 4 | 
 5 | [![CRAN version
 6 | badge](http://www.r-pkg.org/badges/version/msgpack)](https://cran.r-project.org/package=msgpack)
 7 | [![Travis build
 8 | status](http://travis-ci.org/crowding/msgpack-r.svg?branch=master)](https://travis-ci.org/crowding/msgpack-r)
 9 | [![Code
10 | coverage](https://codecov.io/gh/crowding/msgpack-r/branch/master/graph/badge.svg)](https://codecov.io/gh/crowding/msgpack-r)
11 | 
12 | This is a high speed [msgpack](https://msgpack.org) encoder and decoder
13 | for R, based on the [CWPack](https://github.com/clwi/CWPack) C
14 | implementation.
15 | 
16 | `msgpack` is a binary data format with data structures similar to `JSON`
17 | and a compact binary encoding. It can be a drop-in replacement for
18 | `JSON` in most applications. It is designed to be fast to parse and
19 | compact to transmit and store.
20 | 
21 | ## Installation
22 | 
23 | From CRAN:
24 | 
25 | ``` r
26 | install.packages("msgpack")
27 | ```
28 | 
29 | From Github:
30 | 
31 | ``` r
32 | library(devtools)
33 | install_github("crowding/msgpack-r")
34 | ```
35 | 
36 | ## Usage
37 | 
38 | ``` r
39 | library(msgpack)
40 | x <- packMsg( list(compact=TRUE, schema=0) )
41 | x
42 | ##  [1] 82 a7 63 6f 6d 70 61 63 74 c3 a6 73 63 68 65 6d 61 00
43 | dput(unpackMsg( x ))
44 | ## structure(list(compact = TRUE, schema = 0L), .Names = c("compact", 
45 | ## "schema"))
46 | ```
47 | 
48 | ### Connections / Streaming
49 | 
50 | Write messages one or several at a time:
51 | 
52 | ``` r
53 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc
54 | writeMsg("one", conOut)
55 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut)
56 | ```
57 | 
58 | Use a `msgConnection` object to read messages one or several at a
59 | time:
60 | 
61 | ``` r
62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r"))
63 | dput(readMsgs(conIn, 2))
64 | ## list("one", 2L)
65 | dput(readMsg(conIn))
66 | ## structure(TRUE, .Names = "buckle")
67 | dput(readMsgs(conIn))
68 | ## list(structure(c("my", "shoe"), .Names = c("owner", "type")))
69 | ```
70 | 
71 | ### Performance
72 | 
73 | Msgpack is fast and compact. See the [benchmarking
74 | vignette](inst/doc/comparison.html).
75 | 
76 | ![Plot of time taken to transmit dataset, vs size of dataset, for each
77 | encoder under four conditions.](gh/space.svg)
78 | 
79 | ![Comparison of space used by each encoder to encode a test
80 | dataset.](gh/time.svg)
81 | 


--------------------------------------------------------------------------------
/gh/space.svg:
--------------------------------------------------------------------------------
 1 | <?xml version='1.0' encoding='UTF-8' ?>
 2 | <svg xmlns='http://www.w3.org/2000/svg' xmlns:xlink='http://www.w3.org/1999/xlink' viewBox='0 0 432.00 288.00'>
 3 | <defs>
 4 |   <style type='text/css'><![CDATA[
 5 |     line, polyline, path, rect, circle {
 6 |       fill: none;
 7 |       stroke: #000000;
 8 |       stroke-linecap: round;
 9 |       stroke-linejoin: round;
10 |       stroke-miterlimit: 10.00;
11 |     }
12 |   ]]></style>
13 | </defs>
14 | <rect width='100%' height='100%' style='stroke: none; fill: #FFFFFF;'/>
15 | <rect x='0.00' y='0.00' width='432.00' height='288.00' style='stroke-width: 1.07; stroke: #FFFFFF; fill: #FFFFFF;' />
16 | <defs>
17 |   <clipPath id='cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg=='>
18 |     <rect x='38.45' y='24.14' width='388.07' height='255.64' />
19 |   </clipPath>
20 | </defs>
21 | <rect x='38.45' y='24.14' width='388.07' height='255.64' style='stroke-width: 1.07; stroke: none; fill: #EBEBEB;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
22 | <polyline points='38.45,239.64 426.52,239.64 ' style='stroke-width: 0.53; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
23 | <polyline points='38.45,182.60 426.52,182.60 ' style='stroke-width: 0.53; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
24 | <polyline points='38.45,125.57 426.52,125.57 ' style='stroke-width: 0.53; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
25 | <polyline points='38.45,68.53 426.52,68.53 ' style='stroke-width: 0.53; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
26 | <polyline points='38.45,268.16 426.52,268.16 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
27 | <polyline points='38.45,211.12 426.52,211.12 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
28 | <polyline points='38.45,154.09 426.52,154.09 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
29 | <polyline points='38.45,97.05 426.52,97.05 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
30 | <polyline points='38.45,40.01 426.52,40.01 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
31 | <polyline points='63.76,279.78 63.76,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
32 | <polyline points='105.94,279.78 105.94,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
33 | <polyline points='148.12,279.78 148.12,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
34 | <polyline points='190.30,279.78 190.30,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
35 | <polyline points='232.48,279.78 232.48,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
36 | <polyline points='274.67,279.78 274.67,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
37 | <polyline points='316.85,279.78 316.85,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
38 | <polyline points='359.03,279.78 359.03,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
39 | <polyline points='401.21,279.78 401.21,24.14 ' style='stroke-width: 1.07; stroke: #FFFFFF; stroke-linecap: butt;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
40 | <rect x='44.77' y='226.01' width='37.96' height='42.15' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #619CFF;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
41 | <rect x='86.96' y='196.24' width='37.96' height='71.92' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #DB72FB;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
42 | <rect x='129.14' y='189.36' width='37.96' height='78.80' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #00B9E3;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
43 | <rect x='171.32' y='170.75' width='37.96' height='97.41' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #00C19F;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
44 | <rect x='213.50' y='164.71' width='37.96' height='103.45' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #D39200;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
45 | <rect x='255.68' y='162.34' width='37.96' height='105.82' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #FF61C3;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
46 | <rect x='297.87' y='122.93' width='37.96' height='145.23' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #93AA00;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
47 | <rect x='340.05' y='101.14' width='37.96' height='167.02' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #F8766D;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
48 | <rect x='382.23' y='35.76' width='37.96' height='232.40' style='stroke-width: 1.07; stroke: none; stroke-linecap: butt; fill: #00BA38;' clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)' />
49 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='39.75' y='223.18' style='font-size: 12.00px; font-family: Arial;' textLength='48.02px' lengthAdjust='spacingAndGlyphs'>msgpack</text></g>
50 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='92.93' y='193.40' style='font-size: 12.00px; font-family: Arial;' textLength='26.01px' lengthAdjust='spacingAndGlyphs'>rjson</text></g>
51 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='139.12' y='186.52' style='font-size: 12.00px; font-family: Arial;' textLength='18.00px' lengthAdjust='spacingAndGlyphs'>csv</text></g>
52 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='178.62' y='167.92' style='font-size: 12.00px; font-family: Arial;' textLength='23.36px' lengthAdjust='spacingAndGlyphs'>dput</text></g>
53 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='205.81' y='161.88' style='font-size: 12.00px; font-family: Arial;' textLength='53.34px' lengthAdjust='spacingAndGlyphs'>RJSONIO</text></g>
54 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='252.66' y='153.29' style='font-size: 12.00px; font-family: Arial;' textLength='44.02px' lengthAdjust='spacingAndGlyphs'>serialize</text></g>
55 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='304.18' y='120.10' style='font-size: 12.00px; font-family: Arial;' textLength='25.34px' lengthAdjust='spacingAndGlyphs'>yaml</text></g>
56 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='330.69' y='98.30' style='font-size: 12.00px; font-family: Arial;' textLength='56.68px' lengthAdjust='spacingAndGlyphs'>msgpackR</text></g>
57 | <g clip-path='url(#cpMzguNDQ2OXw0MjYuNTIxfDI3OS43ODF8MjQuMTQzNg==)'><text x='382.53' y='32.93' style='font-size: 12.00px; font-family: Arial;' textLength='37.35px' lengthAdjust='spacingAndGlyphs'>jsonlite</text></g>
58 | <defs>
59 |   <clipPath id='cpMHw0MzJ8Mjg4fDA='>
60 |     <rect x='0.00' y='0.00' width='432.00' height='288.00' />
61 |   </clipPath>
62 | </defs>
63 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='28.62' y='271.31' style='font-size: 8.80px; fill: #4D4D4D; font-family: Arial;' textLength='4.89px' lengthAdjust='spacingAndGlyphs'>0</text></g>
64 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='23.73' y='214.27' style='font-size: 8.80px; fill: #4D4D4D; font-family: Arial;' textLength='9.79px' lengthAdjust='spacingAndGlyphs'>25</text></g>
65 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='23.73' y='157.24' style='font-size: 8.80px; fill: #4D4D4D; font-family: Arial;' textLength='9.79px' lengthAdjust='spacingAndGlyphs'>50</text></g>
66 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='23.73' y='100.20' style='font-size: 8.80px; fill: #4D4D4D; font-family: Arial;' textLength='9.79px' lengthAdjust='spacingAndGlyphs'>75</text></g>
67 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='18.83' y='43.16' style='font-size: 8.80px; fill: #4D4D4D; font-family: Arial;' textLength='14.68px' lengthAdjust='spacingAndGlyphs'>100</text></g>
68 | <polyline points='35.71,268.16 38.45,268.16 ' style='stroke-width: 1.07; stroke: #333333; stroke-linecap: butt;' clip-path='url(#cpMHw0MzJ8Mjg4fDA=)' />
69 | <polyline points='35.71,211.12 38.45,211.12 ' style='stroke-width: 1.07; stroke: #333333; stroke-linecap: butt;' clip-path='url(#cpMHw0MzJ8Mjg4fDA=)' />
70 | <polyline points='35.71,154.09 38.45,154.09 ' style='stroke-width: 1.07; stroke: #333333; stroke-linecap: butt;' clip-path='url(#cpMHw0MzJ8Mjg4fDA=)' />
71 | <polyline points='35.71,97.05 38.45,97.05 ' style='stroke-width: 1.07; stroke: #333333; stroke-linecap: butt;' clip-path='url(#cpMHw0MzJ8Mjg4fDA=)' />
72 | <polyline points='35.71,40.01 38.45,40.01 ' style='stroke-width: 1.07; stroke: #333333; stroke-linecap: butt;' clip-path='url(#cpMHw0MzJ8Mjg4fDA=)' />
73 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text transform='translate(13.35,161.43) rotate(-90)' style='font-size: 11.00px; font-family: Arial;' textLength='18.94px' lengthAdjust='spacingAndGlyphs'>MiB</text></g>
74 | <g clip-path='url(#cpMHw0MzJ8Mjg4fDA=)'><text x='38.45' y='14.93' style='font-size: 13.20px; font-family: Arial;' textLength='308.90px' lengthAdjust='spacingAndGlyphs'>Space used to encode nycflights13 (shorter is better)</text></g>
75 | </svg>
76 | 


--------------------------------------------------------------------------------
/inst/benchmarking.R:
--------------------------------------------------------------------------------
  1 | ## ----- setup
  2 | ensureExists <- function(packages){
  3 |   installed <- library()$results[,1]
  4 |   needs.install <- setdiff(packages, installed)
  5 |   if(length(needs.install) > 0) {
  6 |     install.packages(needs.install)
  7 |   }
  8 | }
  9 | 
 10 | if (exists("clust") && !is.null(clust)) {
 11 |   stopCluster(clust)
 12 |   clust <- NULL
 13 | }
 14 | 
 15 | ensureExists(c(
 16 |   "knitr",
 17 |   "rmarkdown",
 18 |   "purrr",
 19 |   "dplyr",
 20 |   "magrittr",
 21 |   "tibble",
 22 |   "printr",
 23 |   "ggplot2",
 24 |   "nycflights13",
 25 |   "parallel",
 26 |   "ssh.utils",
 27 |   "tools",
 28 |   "msgpack",
 29 |   "jsonlite",
 30 |   "msgpackR",
 31 |   "rjson",
 32 |   "RJSONIO",
 33 |   "yaml",
 34 |   "broom"
 35 | ))
 36 | 
 37 | library(knitr)
 38 | library(rmarkdown)
 39 | library(purrr)
 40 | library(dplyr)
 41 | library(magrittr)
 42 | library(tibble)
 43 | library(printr)
 44 | library(ggplot2)
 45 | 
 46 | ## ---- dataset
 47 | 
 48 | library(nycflights13)
 49 | dataset <- as.list(as.environment("package:nycflights13"))
 50 | 
 51 | subsample <- function(dataset, rate) {
 52 |   lapply(dataset, function(df) {
 53 |     df[1:(1 + round(nrow(df) * rate)), ]
 54 |   })
 55 | }
 56 | 
 57 | onerow <- subsample(dataset, 0)
 58 | 
 59 | ## ---- definitions
 60 | 
 61 | remoteHost <- "paradoxus.local"
 62 | port <- 42170
 63 | 
 64 | bufferBytes <- function(f) function(con, chunk = 65536L) {
 65 |   buf <- rawConnection(raw(0), open="w")
 66 |   repeat {
 67 |     result <- readBin(con = con, what = "raw", n = chunk)
 68 |     if (length(result) == 0) break
 69 |     writeBin(result, buf)
 70 |   }
 71 |   f(rawConnectionValue(buf))
 72 | }
 73 | 
 74 | bufferBytes2 <- function(f) function(con, chunk = 65536L) {
 75 |   buf <- msgpack:::catenator(raw(0)) # this is faster
 76 |   repeat {
 77 |     result <- readBin(con = con, what = "raw", n = chunk)
 78 |     if (length(result) == 0) break
 79 |     buf(result)
 80 |   }
 81 |   f(buf(action="read"))
 82 | }
 83 | 
 84 | bufferRawConn <- function(f) function(con, chunk = 65536L) {
 85 |   buf <- msgpack:::catenator(raw(0)) # this is faster
 86 |   repeat {
 87 |     result <- readBin(con = con, what = "raw", n = chunk)
 88 |     if (length(result) == 0) break
 89 |     buf(result)
 90 |   }
 91 |   f(rawConnection(buf(action="read"), open="r"))
 92 | }
 93 | 
 94 | 
 95 | bufferLines <- function(reader) function(con) {
 96 |   lines <- msgpack:::catenator(character(0))
 97 |   repeat {
 98 |     l <- readLines(con)
 99 |     if (length(l) == 0) break()
100 |     lines(l)
101 |   }
102 |   buf <- textConnection(lines(action="read"))
103 |   on.exit(close(buf))
104 |   reader(buf)
105 | }
106 | 
107 | forked <- function(ifParent, ifChild, catch=TRUE) {
108 |   #   Fork the R process. In the parent process call ifParent(), in
109 |   #   the child process call ifChild(). Return a list of return value of
110 |   #   both functions, or error messages.
111 |   other <- parallel:::mcfork()
112 |   if (inherits(other, "masterProcess")) {
113 |     # we are child process
114 |     if (catch) {
115 |       tryCatch({
116 |         result <- ifChild(x)
117 |         parallel:::mcexit(0, send = result)
118 |       },
119 |       error =
120 |         function(e) {
121 |           parallel:::mcexit(1, send = list(error=e, calls=sys.calls()))
122 |         }
123 |       )
124 |     } else {
125 |       result <- ifChild(x)
126 |       parallel:::mcexit(0, send = result)
127 |     }
128 |   } else {
129 |     # we are master process
130 |     if (catch) {
131 |       mine <- tryCatch(ifParent(x),
132 |                        error = if (catch) function(e) list(error=e, calls = sys.calls()) else NULL)
133 |     } else {
134 |       mine <- ifParent(x)
135 |     }
136 |     theirs <- tryCatch(
137 |       unserialize(parallel:::readChild(other)),
138 |       error = if(catch) function(e) list(error=e, calls=sys.calls()) else NULL)
139 |     child_pids <- vapply(parallel:::children(), function(x) x$pid, 0)
140 |     if (other$pid %in% child_pids) {
141 |       warning("Killing child process ", deparse(other))
142 |       parallel:::mckill(other, tools::SIGTERM)
143 |     }
144 |     c(theirs, mine)
145 |   }
146 | }
147 | 
148 | return_result <- FALSE
149 | # Function accepts a set of sys.time readings and computes timings
150 | times <- function(start.write, end.write,
151 |                   start.read, end.read,
152 |                   bytes, result,
153 |                   start.parent, end.parent, ...) {
154 |   c(list(extra=list(list(...))),
155 |     if (missing(end.read) || missing(end.write))
156 |       list()
157 |     else
158 |       c(read.user = end.read[["user.self"]] - start.read[["user.self"]],
159 |         read.sys = end.read[["sys.self"]] - start.read[["sys.self"]],
160 |         read.elapsed = end.read[["elapsed"]] - start.read[["elapsed"]],
161 |         write.user = end.write[["user.self"]] - start.write[["user.self"]],
162 |         write.sys = end.write[["sys.self"]] - start.write[["sys.self"]],
163 |         write.elapsed = end.write[["elapsed"]] - start.write[["elapsed"]],
164 |         total.user =
165 |           (   end.read[["user.self"]] + end.write[["user.self"]]
166 |             - start.write[["user.self"]] - start.read[["user.self"]]),
167 |         total.sys =
168 |           (   end.read[["sys.self"]] + end.write[["sys.self"]]
169 |             - start.write[["sys.self"]] - start.read[["sys.self"]]),
170 |         total.elapsed = max(end.read[["elapsed"]] - start.read[["elapsed"]],
171 |                             end.write[["elapsed"]] - start.write[["elapsed"]])),
172 |     bytes = if (missing(bytes)) list() else bytes,
173 |     result = if (missing(result) || !return_result) list() else list(result),
174 |     parent = if (missing(start.parent)) list()
175 |              else c(user = (end.parent[["user.self"]] - start.parent[["user.self"]]),
176 |                     sys = (end.parent[["sys.self"]] - start.parent[["sys.self"]]),
177 |                     elapsed = (end.parent[["elapsed"]] - start.parent[["elapsed"]])))
178 | }
179 | 
180 | bytes <- function(x) {
181 |   switch(mode(x),
182 |          raw = {
183 |            length(x)
184 |          },
185 |          character = {
186 |            length(x) + sum(nchar(x))
187 |          })
188 | }
189 | 
190 | timeConvert <- function(data,
191 |                         from = unserialize,
192 |                         to = function(data) serialize(data,NULL),
193 |                         wrap = identity, ...) {
194 |   force(data)
195 |   start.write <- proc.time()
196 |   enc <- to(data)
197 |   end.write <- proc.time()
198 |   as.read <- from(enc)
199 |   end.read <- proc.time()
200 |   bytes <- bytes(enc)
201 |   times(start.write, end.write,
202 |         end.write, end.read,
203 |         bytes(enc), as.read)
204 | }
205 | 
206 | 
207 | timeConnection <- function(..., raw = TRUE) {
208 |   if (raw) {
209 |     timeRawConnection(...)
210 |   } else {
211 |     timeTextConnection(...)
212 |   }
213 | }
214 | 
215 | timeRawConnection <- function(data,
216 |                               reader = unserialize,
217 |                               writer = serialize,
218 |                               wrap = identity, ...) {
219 |   force(data)
220 |   conn <- wrap(rawConnection(raw(0), open="wb"))
221 |   on.exit(close(conn), add=TRUE)
222 |   start.write <- proc.time()
223 |   writer(data, conn)
224 |   end.write <- proc.time()
225 |   bytes <- rawConnectionValue(conn)
226 |   conn2 <- wrap(rawConnection(bytes, open="rb"))
227 |   on.exit(close(conn2), add=TRUE)
228 |   start.read <- proc.time()
229 |   as.read <- reader(conn2)
230 |   end.read <- proc.time()
231 |   times(start.write, end.write,
232 |         end.write, end.read,
233 |         length(bytes), as.read)
234 | }
235 | 
236 | timeTextConnection <- function(data,
237 |                                reader = function(x) source(x, TRUE),
238 |                                writer = function(x, conn) dump("x", conn),
239 |                                wrap = identity, ...) {
240 |   force(data)
241 |   theText <- character(0)
242 |   conn <- textConnection(NULL, open="w")
243 |   on.exit(close(conn), add=TRUE)
244 |   start.write <- proc.time()
245 |   writer(data, conn)
246 |   end.write <- proc.time()
247 |   theText <- textConnectionValue(conn)
248 |   nbytes <- bytes(theText)
249 |   conn2 <- wrap(textConnection(theText, open="r"))
250 |   on.exit(close(conn2), add=TRUE)
251 |   start.read <- proc.time()
252 |   as.read <- reader(conn2)
253 |   end.read <- proc.time()
254 |   times(start.write, end.write,
255 |         end.write, end.read,
256 |         nbytes, as.read)
257 | }
258 | 
259 | timeFileIO <- function(data,
260 |                        reader = unserialize,
261 |                        writer = serialize,
262 |                        raw = TRUE,
263 |                        wrap = identity, ...) {
264 |   force(data)
265 |   fnam <- tempfile()
266 |   on.exit(unlink(fnam, force=TRUE))
267 |   con <- file(fnam, open=paste0("w", if(raw) "b" else ""), raw=raw)
268 |   start.write <- proc.time()
269 |   writer(data, con)
270 |   nbytes <- seek(con)
271 |   close(con)
272 |   end.write <- proc.time()
273 |   con <- wrap(file(fnam, open=paste0("r", if(raw) "b" else ""), raw=raw))
274 |   on.exit(close(con), add=TRUE)
275 |   start.read <- proc.time()
276 |   as.read <- reader(con)
277 |   end.read <- proc.time()
278 |   times(start.write, end.write,
279 |         start.read, end.read,
280 |         nbytes, as.read)
281 | }
282 | 
283 | timeSocketTransfer <- function(data,
284 |                                reader = unserialize,
285 |                                writer = serialize,
286 |                                wrap = identity,
287 |                                raw = TRUE,
288 |                                catch = FALSE, ...) {
289 |   force(data)
290 |   doRead <- function(other) {
291 |     conn <- wrap(socketConnection(port = port,
292 |                                   server = TRUE,
293 |                                   blocking = TRUE,
294 |                                   open = paste0("r", if(raw) "b" else "")))
295 |     on.exit(close(conn))
296 |     start.read <- proc.time()
297 |     as.read <- reader(conn)
298 |     end.read <- proc.time()
299 |     list(start.read = start.read,
300 |          end.read = end.read)
301 |   }
302 |   doWrite <- function(other) {
303 |     Sys.sleep(0.5)
304 |     conn <- wrap(socketConnection(port = port, server = FALSE,
305 |                                   blocking = TRUE,
306 |                                   open = paste0("w", if(raw) "b" else "")))
307 |     start.write <- proc.time()
308 |     on.exit(close(conn))
309 |     writer(data, conn)
310 |     flush(conn)
311 |     end.write <- proc.time()
312 |     list(start.write = start.write,
313 |          end.write = end.write, bytes=NA)
314 |   }
315 |   start.parent <- proc.time()
316 |   results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch)
317 |   end.parent <- proc.time()
318 | 
319 |   do.call(times, c(results,
320 |                    list(start.parent = start.parent, end.parent = end.parent)),
321 |           quote=TRUE)
322 | }
323 | 
324 | timeFifoTransfer <- function(data,
325 |                              reader = unserialize,
326 |                              writer = serialize,
327 |                              wrap = identity,
328 |                              catch = FALSE, ...) {
329 |   force(data)
330 |   fnam <- tempfile()
331 |   on.exit(unlink(fnam, force = TRUE))
332 |   system(paste("mkfifo", fnam))
333 | 
334 |   doRead <- function(other) {
335 |     Sys.sleep(0.5)
336 |     conn <- wrap(fifo(fnam, open = "rb", blocking = TRUE))
337 |     on.exit({
338 |       close(conn)
339 |     })
340 |     start.read <- proc.time()
341 |     as.read <- reader(conn)
342 |     end.read <- proc.time()
343 |     list(start.read = start.read,
344 |          end.read = end.read)
345 |   }
346 |   doWrite <- function(other) {
347 |     conn <- fifo(fnam, open = "wb", blocking = TRUE)
348 |     on.exit({
349 |       close(conn)
350 |     })
351 |     start.write <- proc.time()
352 |     writer(data, conn)
353 |     flush(conn)
354 |     end.write <- proc.time()
355 |     list(start.write = start.write,
356 |          end.write = end.write,
357 |          bytes = NA)
358 |   }
359 | 
360 |   start.parent <- proc.time()
361 |   results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch)
362 |   end.parent <- proc.time()
363 | 
364 |   do.call(times, c(results,
365 |                    list(start.parent = start.parent,
366 |                         end.parent = end.parent)),
367 |           quote=TRUE)
368 | }
369 | 
370 | clust <<- NULL
371 | startRemote <- function() {
372 |   message("starting remote")
373 |   ssh.utils::run.remote("killall R", remoteHost)
374 |   clust <- makePSOCKcluster(nnodes=1, c(remoteHost), rscript = "Rscript")
375 |   rtmp <- clusterCall(clust, tempdir)
376 |   message("rtmp is ", rtmp)
377 |   message("getwd is ", getwd())
378 | 
379 |   ssh.utils::cp.remote("", "../inst/benchmarking.R",
380 |                        remoteHost, paste0(rtmp, "/benchmarking.R"))
381 |   clusterCall(clust, options, repos = getOption("repos"))
382 |   parallel::clusterCall(clust, source, paste0(rtmp, "/benchmarking.R"))
383 |   clust <<- clust
384 |   message("started remote")
385 | }
386 | 
387 | doRemoteWrite <- function(data, host, port, wrap, raw, writer) {
388 |   tryCatch({
389 |     Sys.sleep(1)
390 |     conn <- wrap(socketConnection(host = host,
391 |                                   port = port,
392 |                                   server = FALSE,
393 |                                   blocking = TRUE,
394 |                                   open = paste0("w", if(raw) "b" else "")))
395 |     start.write <- proc.time()
396 |     on.exit(close(conn))
397 |     writer(data, conn)
398 |     flush(conn)
399 |     end.write <- proc.time()
400 |     list(start.write = start.write,
401 |          end.write = end.write, bytes=NA)
402 |   },
403 |   error = function(e) list(error=e, calls=sys.calls()))
404 | }
405 | 
406 | doRemoteRead <- function(port, wrap, raw, reader) {
407 |   # place logging statements in here...
408 | 
409 |   tryCatch({
410 |     sock <- socketConnection(port = port,
411 |                              server = TRUE,
412 |                              blocking = TRUE,
413 |                              open = paste0("r", if(raw) "b" else ""),
414 |                              timeout = 60)
415 |     conn <- wrap(sock)
416 |     on.exit(close(conn))
417 |     start.read <- proc.time()
418 |     as.read <- reader(conn)
419 |     end.read <- proc.time()
420 |     list(start.read = start.read,
421 |          end.read = end.read)
422 |   },
423 |   error = function(e) list(error=e, calls=sys.calls())
424 |   )
425 | }
426 | 
427 | timeRemoteTransfer <- function(data,
428 |                                reader = bufferBytes(unserialize),
429 |                                writer = serialize,
430 |                                wrap = identity,
431 |                                raw = TRUE,
432 |                                ...) {
433 |   force(data)
434 |   if (is.null(clust)) {
435 |     startRemote()
436 |   }
437 | 
438 |   parallel:::sendCall(clust[[1]],
439 |                       doRemoteRead,
440 |                       list(port, wrap, raw, reader))
441 |   local_results <- tryCatch(
442 |     doRemoteWrite(data, remoteHost, port, wrap, raw, writer),
443 |     error = function(e) list(error = e, calls = sys.calls()))
444 |   remote_results <- parallel:::recvResult(clust[[1]])
445 |   do.call(times, c(local_results, remote_results), quote=TRUE)
446 | }
447 | 
448 | timeCurve <- function(dataset,
449 |                       method,
450 |                       timeout = 60,
451 |                       start = 0.01,
452 |                       max = 1, ...
453 |                       ) {
454 |   results <- data_frame()
455 |   current <- start
456 |   if (missing(method)) {
457 |     message("method missing???")
458 |     return(data.frame())
459 |   }
460 |   while (current <= max) {
461 |     message(paste0("size = ", current))
462 |     data <- subsample(dataset, current)
463 |     result <- method(data, ...)
464 |     results <- bind_rows(results, as_tibble(c(size = current, result)))
465 |     if ("total.elapsed" %in% names(result)) {
466 |       if (result$total.elapsed > timeout) break() else NULL
467 |     } else  {
468 |       break()
469 |     }
470 |     if (current == max) break()
471 |     current = min(current * sqrt(2), max)
472 |   }
473 |   results
474 | }
475 | 
476 | arg_df <- function(tests) (tests
477 |   # produces an arg data frame: the labels in columns and the argument
478 |   # data structure in column "args"
479 |   %>% map(names)
480 |   %>% cross_df()
481 |   %>% pmap_dfr(function(...) {
482 |     labels <- list(...)
483 |     arglist <- pmap(list(labels, names(labels)),
484 |                     function(label, column) list(tests[[column]][[label]]))
485 |     c(labels, args = list(list(arglist)))
486 |   })
487 |   %>% mutate(args = (args # unlist the args twice
488 |     %>% map(.
489 |             %>% unlist(recursive = FALSE, use.names = FALSE)
490 |             %>% unlist(recursive = FALSE, use.names = TRUE))))
491 | )
492 | 
493 | `%*%` <- intersect
494 | 
495 | run_tests <- function(arg_df) {
496 |   pmap_dfr(arg_df, function(..., args) {
497 |     labels <- list(...)
498 |     message(paste0(collapse = "\n", deparse(labels, control=c())))
499 |     # what I want is a bind syntax like
500 |     ## bind({
501 |     ##   list(strategy = ?fun, ??args) <- args
502 |     ##   fun(!!args)
503 |     ## })
504 |     fun <- args$strategy
505 |     args <- args[names(args) != "strategy"]
506 |     the_env <- list2env(args, parent = environment())
507 | 
508 |     # this dance is to avoid calling do.call with a giant unnamed dataset
509 |     # (which causes R to spin several minutes on writing a traceback)
510 |     call <- as.call(c(quote(fun),
511 |                       structure(map(names(args), as.name),
512 |                                 names = names(args))))
513 |     results <- eval(call, the_env)
514 |     as_data_frame(c(labels, results))
515 |   })
516 | }
517 | 
518 | store <- function(data, dataset) {
519 |   # key on all character columns the two tables hav in common
520 |   existing.keys <- names(benchmarks)[map_lgl(benchmarks, is.character)]
521 |   new.keys <- names(data)[map_lgl(data, is.character)]
522 |   keys <- intersect(existing.keys, new.keys)
523 |   message(paste("Replacing on keys:", paste0("\"", keys, "\"", collapse=", ")))
524 |   if (length(keys) > 0) {
525 |     dataset <- (dataset          # update benchmarks
526 |       %>% anti_join(data, by=keys)
527 |       %>% bind_rows(data)
528 |     )
529 |   } else {
530 |     dataset <- data
531 |   }
532 |   dataset
533 | }
534 | 
535 | common.options <- list(
536 |   strategy = list(
537 |     timeCurve = list(strategy = timeCurve, timeout = 10, start = 0.001)),
538 |   dataset = list(
539 |     nycflights13 = list(data = dataset)))
540 | 
541 | conversion.methods <- list(
542 |   method = list(
543 |     convert = list(method = timeConvert))
544 | )
545 | 
546 | synchronous.methods <- list(
547 |   method = list(
548 |     conn = list(method = timeConnection),
549 |     file = list(method = timeFileIO)
550 | ))
551 | 
552 | concurrent.methods <- list(
553 |   method = list(
554 |     remote = list(method = timeRemoteTransfer),
555 |     socket = list(method = timeSocketTransfer),
556 |     fifo = list(method = timeFifoTransfer))
557 | )
558 | 
559 | connection.methods <- list(
560 |   method = c(concurrent.methods$method,
561 |              synchronous.methods$method)
562 | )
563 | 
564 | convert.common.options <- c(
565 |   common.options,
566 |   conversion.methods
567 | )
568 | 
569 | raw.common.options <- c(
570 |   common.options,
571 |   conversion.methods,
572 |   raw = list('TRUE' = list(raw = TRUE))
573 | )
574 | 
575 | text.common.options <- c(
576 |   common.options,
577 |   conversion.methods,
578 |   raw = list('FALSE' = list(raw = FALSE))
579 | )
580 | 
581 | all.common.options <- c(
582 |   common.options,
583 |   list(
584 |     method = c(connection.methods$method,
585 |                conversion.methods$method))
586 | )
587 | 
588 | buffer_read_options <- function(reader = identity,
589 |                               raw = FALSE,
590 |                               buffer = (if (raw) bufferBytes2 else bufferLines)) {
591 |   c(common.options,
592 |     list(raw = structure(list(list(raw = raw)), names = as.character(raw))),
593 |     list(method = list(
594 |       conn = list(method = timeConnection,
595 |                   reader = reader),
596 |       file = list(method = timeFileIO,
597 |                   reader = reader),
598 |       fifo = list(method = timeFifoTransfer,
599 |                   reader = buffer(reader)),
600 |       socket = list(method = timeSocketTransfer,
601 |                     reader = buffer(reader)),
602 |       remote = list(method = timeRemoteTransfer,
603 |                     reader = buffer(reader))
604 |     ))
605 |     )
606 | }
607 | 
608 | combine_opts <- function(x, ...) {
609 |   for (l in list(...)) {
610 |     for (n in names(l)) {
611 |       if (n %in% names(x)) {
612 |         x[[n]] <- c(x[[n]], l[[n]])
613 |       } else {
614 |         x[[n]] <- c(x[[n]], l[[n]])
615 |       }
616 |     }
617 |   }
618 |   x;
619 | }
620 | 
621 | `%but%` <- function(l, r) {
622 |   l[names(r)] <- r;
623 |   l
624 | }
625 | 
626 | showTimings <- function(timings, label, ...) {
627 |   stats <- with(timings, c(
628 |     "total.elapsed (s)" = total.elapsed,
629 |     "read.cpu (s)" = read.user + read.sys,
630 |     "read.elapsed (s)" = read.elapsed,
631 |     "write.cpu (s)" = write.user + write.sys,
632 |     "write.elapsed (s)" = write.elapsed,
633 |     "data size (MB)" = bytes / 0x100000
634 |   ))
635 |   kable(stats, col.names = label, digits=2)
636 | }
637 | 
638 | ## list(x = ?first, ??rest) -> list()
639 | 
640 | ## f <- function(a, call) bind({
641 | ##   list(?first, ??rest) <- callList
642 | ## })
643 | 
644 | ## Local Variables:
645 | ## ess-r-package-info: ("msgpack" . "~/msgpack/")
646 | ## End:
647 | 


--------------------------------------------------------------------------------
/inst/benchmarks.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crowding/msgpack-r/c77554d7c5fe0e10bb46d5284d451337813734e8/inst/benchmarks.RData


--------------------------------------------------------------------------------
/inst/comparison.Rmd:
--------------------------------------------------------------------------------
  1 | ---
  2 | title: "Which Data Format is Fastest Data Format?"
  3 | output:
  4 |   "rmarkdown::html_vignette":
  5 |     fig_width: 6
  6 |     fig_height: 4
  7 |     toc: true
  8 |     df_print: tibble
  9 |     self_contained: true
 10 |     dev: "svglite"
 11 | vignette: >
 12 |   %\VignetteIndexEntry{Which Data Format is Fastest Data Format\?}
 13 |   %\VignetteEngine{R.rsp::asis}
 14 |   %\VignetteKeyword{IO}
 15 |   %\VignetteKeyword{file}
 16 |   %\VignetteKeyword{benchmark}
 17 |   %\VignetteKeyword{connection}
 18 |   %\VignetteKeyword{interface}
 19 |   %\VignetteKeyword{msgpack}
 20 |   %\VignetteKeyword{vignette}
 21 |   %\VignetteKeyword{package}
 22 | ---
 23 | 
 24 | It's good to get data into and out of R! More compatibility, more
 25 | data, more better. But which is fastest?
 26 | 
 27 | Of course, speed might not be your only criterion. Different data
 28 | formats have different capabilities and purposes:
 29 | 
 30 | * `serialize()` is your best best for getting R objects out of and
 31 |   back into R the way you had them, but doesn't do much for you when
 32 |   communicating with other systems.
 33 | * `dump()` is your next best bet, while being a somewhat
 34 |   human-readable text format.
 35 | * Neither of above are good for exchanging data with other entities
 36 |   you don't trust.
 37 | * *JSON* is widely used on the web, but because it is based on
 38 |   Javascript data structures, it doesn't seamlessly represent R
 39 |   objects. R attributes like `dims` and `class` don't have equivalents
 40 |   in Javascript. Different packages also take different approaches in
 41 |   representing JSON objects in R, and vice versa.
 42 | * *CSV* is ubiquitous and can be read by most anything but only represents
 43 |   tabular data. Its data types are ambiguous.
 44 | * *Msgpack* has a data model compatible with JSON but is a binary
 45 |   format. For web applications, replacing JSON with msgpack is an easy
 46 |   way to save bandwidth and CPU usage. I'm considering msgpack as a
 47 |   convenient wire format for embedded devices.
 48 | 
 49 | 
 50 | ```{R setup, cache=FALSE, echo=FALSE, message=FALSE}
 51 | library(knitr)
 52 | opts_chunk$set(cache = TRUE, autodep = TRUE, message = FALSE, warning = FALSE)
 53 | message("cache path is '", opts_chunk$get("cache.path"), "'")
 54 | ```
 55 | 
 56 | All that said.... Which is fastest? Which is most efficient? Under
 57 | which set of arbitrarily chosen tests? I'll give more details below,
 58 | but first I'll show the results. In both graphs, lower is better.
 59 | 
 60 | ```{R, fig_width=6, fig_height=6, fig.asp = 1, cache=FALSE}
 61 | load_cache("timings","timing.plot")
 62 | ```
 63 | 
 64 | On the horizontal axis is the normalized size of the object being
 65 | transmitted. On the vertical is the time taken to transmit the test
 66 | dataset. There are four scenarios corresponding to how the dataset is
 67 | transmitted or stored. All tests are using the default options
 68 | for each respective library.
 69 | 
 70 | This following graph shows the amount of storage used to encode the
 71 | test dataset.
 72 | 
 73 | ```{R, cache=FALSE}
 74 | load_cache("datasize","datasize.plot")
 75 | ```
 76 | 
 77 | # Benchmarking Data Export / Import
 78 | 
 79 | This document is included as a static vignette, because running
 80 | benchmark code would not be kind to package builders.
 81 | 
 82 | I'll try picking a reference dataset and timing how long it takes to
 83 | encode and decode it. I'll start with the dataset `nycflights13`,
 84 | consisting of five data frames, since it's large enough to take a
 85 | measurable time to process.
 86 | 
 87 | The file `inst/benchmarking.R` contains code for test generation and
 88 | timing.
 89 | 
 90 | ```{R cache = FALSE, results = "hide"}
 91 | source(system.file("benchmarking.R", package = "msgpack"))
 92 | ``` 
 93 | 
 94 | I will test each encoder under the following scenarios:
 95 | 
 96 | * `convert` is for just converting an R object to an R raw or
 97 |   character object and back, in memory.
 98 | * `conn` also does an in-memory conversion but with R `textConnection`
 99 |       or `rawConnection` objects. (The `rawConnection` objects seem to
100 |       work faster, even with text formats.)
101 | * `file` writes to a temp file and then reads it back.
102 | * `fifo` forks the R process and writes from one while reading
103 |       from the other over a UNIX named pipe.
104 | * `tcp` forks the R process and writes from one to the other using a
105 |       TCP socket.
106 | * `remote` connects to a R process on a remote machine, and sends
107 |   data from the local to the remote over a TCP socket.
108 | 
109 | Each test will record the OS-reported CPU time, as well as elapsed
110 | clock time for reading, writing and total.
111 | 
112 | For `fifo`, `tcp`, and `remote` tests, the reading and writing may
113 | happen concurrently, so that `total.elapsed < read.elapsed +
114 | write.elapsed`. On the other hand, some packages cannot cope with
115 | reading from asynchronous connections (more on this in the
116 | details). In these cases I need to make sure all data is read into a
117 | buffer before calling the decoder.
118 | 
119 | These packages turn out to vary in performance by a couple orders of
120 | magnitude. So the test harness has to dynamically vary tne input size
121 | so as to not take forever to run. It starts with a small subsample
122 | of the dataset and increases its size until the encode and decode
123 | takes 10 seconds (or the entire dataset is transmitted). (See function
124 | `testCurve` in `inst/benchmarking.R`.).
125 | 
126 | On to the per-package benchmarks.
127 | 
128 | ## R serialization
129 | 
130 | `serialize` and `unserialize` produce faithful replications of R
131 | objects including R-specific structures like closures, environments,
132 | and attributes. But generally only R code can read it. It is useful
133 | for communicating between R processes.
134 | 
135 | For instance:
136 | 
137 | ```{R}
138 | unserialize.inmem <- timeConvert(dataset,
139 |                                  unserialize,
140 |                                  function(data) serialize(data, NULL))
141 | showTimings(unserialize.inmem, "R serialization (in memory)")
142 | ```
143 | 
144 | This is reasonably speedy, too.
145 | 
146 | However, I run into a problem if I try to transfer too large an object over
147 | a fifo or socket and read it with `unserialize().`
148 | 
149 | ```{R, error=TRUE}
150 | unserialize.bad <- timeFifoTransfer(dataset, unserialize, serialize, catch=FALSE)
151 | ```
152 | 
153 | This appears to be because `unserialize` doesn't operate concurrently,
154 | in the sense that it doesn't recover from finding the end of the line
155 | having only read part of a message. Meanwhile, on my machine `fifo()`
156 | and `socketConnection()` do not seem to block even if `blocking =
157 | TRUE` is set. They always wait for at least one byte, but may return
158 | fewer than requested. So `unserialize` does not work easily with socket
159 | connections.
160 | 
161 | One workaround is to exhaustively read the connection before handing
162 | off the data to `unserialize`. I handle that off screen in
163 | `bufferBytes` in `inst/benchmarking.R`.
164 | 
165 | ```{R}
166 | unserialize.socket <- timeSocketTransfer(dataset,
167 |                                          bufferBytes(unserialize),
168 |                                          serialize)
169 | 
170 | showTimings(unserialize.socket,
171 |             c("R serialization (over TCP, same host, blocking read)"))
172 | ```
173 | 
174 | But this strategy only works for transmitting one object per
175 | connection, and for one connection at a time.
176 | 
177 | Another way you can do it is to wrap R serialization with
178 | `msgpack::msgConnection`. This only adds a few bytes to each message,
179 | and msgpack will handle assembling complete messages. This also allows
180 | you to send several values per connection, or poll several connections
181 | connections until one of them returns a decoded message.
182 | 
183 | ```{R}
184 | unserialize.wrapped_socket <- timeSocketTransfer(dataset,
185 |                                                  unserialize,
186 |                                                  serialize,
187 |                                                  wrap = msgpack::msgConnection)
188 | ```
189 | 
190 | ```{R}
191 | showTimings(unserialize.wrapped_socket,
192 |             "R serialization over msgpack over TCP (same host)")
193 | ```
194 | 
195 | Surprisingly, this actually works faster. `R
196 | unserialize.wrapped_socket$total.elapsed <
197 | unserialize.socket$total.elapsed || stop()), include=FALSE`. One
198 | hypothesis that might account for this is that there are fewer write
199 | syscalls if the object is serialized into memory before sending, and
200 | fewer read syscalls if the object is prepended with its length before
201 | reading.
202 | 
203 | Now I'll start collecting benchmarks systematically. Since the methods
204 | we will explore have such a wide variance of performance, we will test
205 | them with successively larger datasets, until they exceed a
206 | timeout. The `serialize.spec` data structure annotated below specifies
207 | how to test R serialization and how to label the results. See the code
208 | file `inst/benchmarking.R` for more details.
209 | 
210 | ```{R serialization}
211 | # A spec is a named list of test factors.
212 | # A test factor is a named list of arguments that will be passed to the
213 | # test function at each factor level.
214 | # (Actually go look at the simpler test spec for msgpack before trying to
215 | # grok this)
216 | serialize.spec <- combine_opts( # options of the same name concatenated
217 |   list(
218 |     method = list( # the factor "method"
219 |       convert = list(method = timeConvert)), # has a level "convert"
220 |                                              # that passes timeConvert
221 |                                              # to the "method"
222 |                                              # argument of the test
223 |                                              # function
224 |     encoder = list( # the factor "encoder"
225 |       serialize = list( # has a level named "serialize"
226 |         writer = serialize, # with these arguments specifying, reader, writer, to, from
227 |         from = unserialize,
228 |         to = function(data) serialize(data, NULL)))),
229 |   # Combined with these common options (that include connection-based
230 |   # test methods
231 |   buffer_read_options(reader = unserialize, raw = TRUE)
232 | )
233 | 
234 | # arg_df takes the above spec and takes its outer product, generating
235 | # a data frame with case labels and arguments used
236 | serialize.calls <- arg_df(serialize.spec)
237 | ```
238 | 
239 | ```{R}
240 | serialize.timings <- run_tests(serialize.calls)
241 | ```
242 | 
243 | ```{R include=FALSE, results="hide"}
244 | benchmarks <- data.frame()
245 | ```
246 | 
247 | ```{R include=FALSE, results="hide"}
248 | benchmarks <- store(serialize.timings, benchmarks)
249 | ```
250 | 
251 | ```{R}
252 | test.plot <- (ggplot(
253 |   filter(benchmarks, encoder=="serialize"))
254 |   + aes(x = size, y = total.elapsed, color = method)
255 |   + geom_line()
256 |   + scale_x_continuous(limits = c(0, 1), name = "Fraction of nycflights13 transmitted")
257 |   + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)")
258 | )
259 | test.plot + labs(title="R serialization performance")
260 | ```
261 | 
262 | Here the horizontal axis is the size of the test dataset, and the
263 | vertical axis is the time taken to transmit and receive.
264 | 
265 | R's implementation of fifo connections seem to be consistently slower
266 | than other methods on the test system (OS X). The next slowest, in the
267 | case of R serialization, is the test case where data is transmitted
268 | over a gigabit link (if it matters, the reading computer is an Ubuntu
269 | box slower than the writing computer.)
270 | 
271 | ## Dput/source
272 | 
273 | The `dput` and `deparse` functions render R data objects to an ASCII
274 | connection in R-like syntax.  The idea is that the text output "looks
275 | like" the code it takes to construct the object, to the extent that
276 | the mechanism for reading objects back in is to `eval` or `source` the
277 | text. (Hopefully one does not do this with untrusted data. A better
278 | technique may be to evaluate the data in a limited environment that
279 | just contains the needed constructors like `structure`, `list` and `c`
280 | etc.)
281 | 
282 | ```{R}
283 | ## Annoyingly, the behavior of "dump" and "dput" depend on this global option.
284 | options(deparse.max.lines = NULL)
285 | 
286 | #the output mimics the input
287 | dput(control=c(),
288 |      list(1, "2", verb=quote(buckle), my=c("s", "h", "o", "e")))
289 | ```
290 | 
291 | (Note how `dput` fails on transmitting language objects; if we try to
292 | eval the above we will try to evaluate "buckle" instead of getting
293 | just the name object. `as.name("buckle")`.)
294 | 
295 | Performance-wise, `dput` and `source` should not be used for large
296 | datasets, because they display an O(n^2) characteristic in terms of
297 | the data size.
298 | 
299 | ```{R dput}
300 | options(deparse.max.lines = NULL)
301 | 
302 | dput.timings <- run_tests(arg_df(c(
303 |   all.common.options,
304 |   list(
305 |     encoder = list(
306 |       dput = list(
307 |         from = function(t) eval(parse(text=t)),
308 |         to = deparse,
309 |         reader = function(c) eval(parse(c)),
310 |         writer = function(x, c) dput(x, file=c)))))))
311 | ```
312 | 
313 | ```{R, include = FALSE, results = "hide"}
314 | benchmarks <- store(dput.timings, benchmarks)
315 | ```
316 | 
317 | ```{R}
318 | (test.plot + labs(title="dput performance")) %+% filter(benchmarks, encoder == "dput")
319 | ```
320 | 
321 | It's interesting that `deparse` (which is used for the "in-memory
322 | conversion" test method labeled `conn`) is much slower than the
323 | connection-based `dput`.
324 | 
325 | ## jsonlite
326 | 
327 | The `jsonlite` package includes a fromJSON and toJSON implementation. It also
328 | supports streaming reads and write, but only of records consisting of
329 | one data frame per message. Data frames are sent row-wise.
330 | 
331 | Since the test dataset consists of several data frames, I will send
332 | one after the other over the lifetime of one connection.
333 | 
334 | ```{R jsonlite}
335 | jsonlite_reader <- function(conn) {
336 |   append <- msgpack:::catenator()
337 |   jsonlite::stream_in(conn, verbose = FALSE, handler = function(x) append(list(x)))
338 |   append(action="read")
339 | }
340 | 
341 | jsonlite_writer <- function(l, conn) {
342 |   lapply(l, function(x) jsonlite::stream_out(x, verbose=FALSE, conn))
343 | }
344 | 
345 | jsonlite.spec <- c(
346 |   all.common.options,
347 |   list(
348 |     encoder = list(
349 |       jsonlite = list(
350 |         to = jsonlite::toJSON,
351 |         from = jsonlite::fromJSON,
352 |         reader = jsonlite_reader,
353 |         writer = jsonlite_writer,
354 |         raw = FALSE))))
355 | 
356 | jsonlite.timings <- run_tests(arg_df(jsonlite.spec))
357 | ```
358 | 
359 | ```{R echo=FALSE, results="hide"}
360 | getElapsed <- function(d) filter(d, size == 1, method=="convert")$total.elapsed[[1]]
361 | getRemote <- function(d) filter(d, size == 1, method=="remote")$total.elapsed[[1]]
362 | `%digits%` <- function(x, y) format(x, digits=y)
363 | ```
364 | jsonlite performs reasonably well, but is several times slower than
365 | serialization.
366 | 
367 | ```{R, results = "hide"}
368 | benchmarks <- store(jsonlite.timings, benchmarks)
369 | ```
370 | 
371 | ```{R}
372 | (test.plot + labs(title="jsonlite performance")) %+% filter(benchmarks, encoder == "jsonlite")
373 | ```
374 | 
375 | ## Msgpack
376 | 
377 | `msgpack` is the package this vignette is written for.
378 | 
379 | ```{R msgpack_remote}
380 | msgpack_remote.spec <- c(
381 |   common.options,
382 |   list(method = list(remote = list(method=timeRemoteTransfer)),
383 |        encoder = list(
384 |          msgpack = list(
385 |            reader = msgpack::readMsg,
386 |            writer = msgpack::writeMsg,
387 |            wrap = msgpack::msgConnection))))
388 | 
389 | msgpackR_remote.timings <- run_tests(arg_df(msgpack_remote.spec))
390 | (test.plot + labs(title="msgpack")) %+% msgpackR_remote.timings
391 | 
392 | ```
393 | 
394 | ```{R msgpack}
395 | msgpack.spec <- c(
396 |   all.common.options,
397 |   list(
398 |     encoder = list(
399 |       msgpack = list(
400 |         wrap = msgpack::msgConnection,
401 |         reader = msgpack::readMsg,
402 |         writer = msgpack::writeMsg,
403 |         to = msgpack::packMsg,
404 |         from = msgpack::unpackMsg))))
405 | ```
406 | ```{R}
407 | msgpack.timings <- run_tests(arg_df(msgpack.spec))
408 | ```
409 | 
410 | ```{R, results = "hide"}
411 | benchmarks <- store(msgpack.timings, benchmarks)
412 | ```
413 | 
414 | ```{R}
415 | (test.plot + labs(title="msgpack performance")) %+% filter(benchmarks, encoder == "msgpack")
416 | ```
417 | 
418 | Implementing the streaming-mode callbacks has helped a lot, but there
419 | is still a quadratic characteristic going on here. Need to do some
420 | profiling of memory allocation.
421 | 
422 | Interestingly, msgpack is already faster than serialize for the remote
423 | use case (modulo some network glitches affecting one or two
424 | datapoints.)
425 | 
426 | ## msgpackR
427 | 
428 | There is an older pure-R implementation of msgpack on CRAN. One quirk is that
429 | it doesn't accept `NA` in R vectors.
430 | 
431 | ```{R, error = TRUE}
432 | msgpackR::pack(c(1, 2, 3))
433 | msgpackR::pack(c(1, 2, 3, NA))
434 | ```
435 | 
436 | As a workaround I'll substitute out all the NA values in the dataset.
437 | 
438 | ```{R}
439 | dataset_mungenull <- map(dataset, map_dfc,
440 |                          function(col) ifelse(is.na(col), 9999, col))
441 | ```
442 | 
443 | ```{R msgpackR}
444 | msgpackR.spec <- c(
445 |   all.common.options %but% list( 
446 |     dataset = list(nycflights13 = list(data = dataset_mungenull)),
447 |     note = list("no NAs" = list())),
448 |   list(
449 |     encoder = list(
450 |       msgpackR = list(
451 |         from = msgpackR::unpack,
452 |         to = msgpackR::pack,
453 |         reader = bufferBytes(msgpackR::unpack),
454 |         writer = function(data, conn) writeBin(msgpackR::pack(data), conn)))))
455 | ```
456 | 
457 | ```{R, results = "hide"}
458 | msgpackR.timings <- run_tests(arg_df(msgpackR.spec))
459 | ```
460 | 
461 | ```{R, include=FALSE, results="hide"}
462 | benchmarks <- store(msgpackR.timings, benchmarks)
463 | ```
464 | 
465 | ```{R}
466 | (test.plot + labs(title="msgpackR performance")) %+% filter(benchmarks, encoder == "msgpackR")
467 | ```
468 | 
469 | Performance-wise, it is quite slow, but a deeper concern is that if I
470 | send larger objects and inspect the results it sometimes looks garbled, and there
471 | are intermittent errors like:
472 | 
473 | ```{R error=TRUE}
474 | str(subsample(dataset_mungenull, .000044) %>% (msgpackR::pack) %>% (msgpackR::unpack))
475 | ```
476 | 
477 | ## rjson
478 | 
479 | ```{R rjson}
480 | rjson.spec <- combine_opts(
481 |   list(
482 |     method = list(
483 |       convert = list(method = timeConvert)),
484 |     encoder = list(
485 |       rjson = list(
486 |         from = rjson::fromJSON,
487 |         to = rjson::toJSON,
488 |         writer = function(data, con) writeChar(rjson::toJSON(data), con)))),
489 |   buffer_read_options(reader = function(con) rjson::fromJSON(file=con),
490 |                       raw = TRUE,
491 |                       buffer = bufferRawConn))
492 | rjson.timings <- run_tests(arg_df(rjson.spec))
493 | ```
494 | 
495 | ```{R include=FALSE, echo=FALSE}
496 | inmemory.factor <- (getElapsed(rjson.timings) / getElapsed(msgpack.timings))
497 | remote.factor <- (getRemote(rjson.timings) / getRemote(msgpack.timings))
498 | ```
499 | 
500 | `rjson` is the fastest JSON implementation -- only `r inmemory.factor %digits% 2` 
501 | times slower than `msgpack`, in memory, and `r remote.factor %digits% 2`
502 | times slower across the wire. It does not
503 | support streaming reads, so we must byte-buffer to read from
504 | connections. But that turns out to be quite fast as well.
505 | 
506 | ```{R results="hide"}
507 | benchmarks <- store(rjson.timings, benchmarks)
508 | ```
509 | 
510 | ```{R}
511 | (test.plot + labs(title="rjson performance")) %+% filter(benchmarks, encoder == "rjson")
512 | ```
513 | 
514 | ## RJSONIO
515 | 
516 | I am getting the following intermittent error in RJSONIO (i.e. this
517 | document fails to render once in a while.)
518 | 
519 | ```
520 | Error in RJSONIO::readJSONStream(con) : failed to parse json at 10240
521 | ```
522 | 
523 | ```{R RJSONIO}
524 | RJSONIO.spec <- c(
525 |   all.common.options,
526 |   list(
527 |     encoder = list(
528 |       RJSONIO = list(
529 |         from = RJSONIO::fromJSON,
530 |         to = RJSONIO::toJSON,
531 |         writer = function(x, con) writeBin(RJSONIO::toJSON(x), con),
532 |         reader = RJSONIO::readJSONStream,
533 |         raw = TRUE))))
534 | 
535 | RJSONIO.timings <- run_tests(arg_df(RJSONIO.spec))
536 | ```
537 | ```{R, include = FALSE, results = "hide"}
538 | benchmarks <- store(RJSONIO.timings, benchmarks)
539 | ```
540 | 
541 | RJSONIO offers a function to do streaming reads from connection, but
542 | it has much overhead compared with in-memory conversion.
543 | 
544 | ```{R}
545 | (test.plot + labs(title="RJSONIO performance")) %+% filter(benchmarks, encoder == "RJSONIO")
546 | ```
547 | 
548 | ## YAML
549 | 
550 | YAML is kind of "JSON, but more like Markdown." Allegedly easier to
551 | read but with a more complex grammar. It's popular for config files.
552 | 
553 | ```{R}
554 | cat(yaml::as.yaml(list(compact=TRUE, schema = 0)))
555 | ```
556 | 
557 | Unfortunately, the YAML package produces a protection stack overflow
558 | when decoding too large a message.
559 | 
560 | ```{R, error = TRUE}
561 | oops <- yaml::yaml.load(yaml::as.yaml(subsample(dataset, 0.15)))
562 | ```
563 | 
564 | ```{R}
565 | yaml.timings <- run_tests(arg_df(c(
566 |   all.common.options,
567 |   list(
568 |     note = list("Max 0.14" = list(max = 0.14)),
569 |     encoder = list(
570 |       yaml = list(
571 |         reader = yaml::yaml.load_file,
572 |         writer = function(data, conn) writeChar(yaml::as.yaml(data), conn),
573 |         to = yaml::as.yaml,
574 |         from = yaml::yaml.load,
575 |         raw = TRUE))))))
576 | ```
577 | 
578 | ```{R, results = "hide"}
579 | benchmarks <- store(yaml.timings, benchmarks)
580 | ```
581 | 
582 | ```{R}
583 | (test.plot + labs(title="yaml performance")) %+% filter(benchmarks, encoder == "yaml")
584 | ```
585 | 
586 | ## write.csv
587 | 
588 | We're going here aren't we. Let's see if we can send messages with
589 | CSV. To send several CSV tables over one connection, I'll prepend to
590 | each message a header saying how many rows to read following.
591 | 
592 | ```{R}
593 | writeCsvs <- function(data, con) {
594 |   for (nm in names(data)) {
595 |     write.csv(as_tibble(list(name = nm, nrows = nrow(data[[nm]]))), con)
596 |     write.csv(data[[nm]], con, row.names = FALSE)
597 |   }
598 | }
599 | 
600 | readCsvs <- function(data, con) {
601 |   output <- list()
602 |   tryCatch(
603 |     repeat {
604 |       header <- read.csv(con2, nrows=1, stringsAsFactors = FALSE)
605 |       output[[header$name]] <- read.csv(con2, nrows=header$nrows, stringsAsFactors = FALSE)
606 |     },
607 |     error = force)
608 |   output
609 | }
610 | ```
611 | 
612 | I have just had a sinking feeling that a lot of people actually build
613 | web services that talk in CSV.
614 | 
615 | Unfortunately `read.table` can't cope with non-blocking connections, so
616 | I have to buffer the read into memory when reading from a fifo or socket.
617 | 
618 | ```{R, results="hide"}
619 | csv.timings <- run_tests(
620 |   arg_df(c(
621 |     list(encoder = list(csv = list(writer = writeCsvs))),
622 |     buffer_read_options(reader = readCsvs, raw = TRUE))))
623 | ```
624 | 
625 | ```{R, results = "hide"}
626 | benchmarks <- store(csv.timings, benchmarks)
627 | ```
628 | 
629 | ```{R}
630 | (test.plot + labs(title="R csv performance")) %+% filter(benchmarks, encoder == "csv")
631 | ```
632 | 
633 | Surprisingly fast at writing to files.
634 | 
635 | ## Comparison of Encoder Speed
636 | 
637 | ```{R results="hide", include=FALSE}
638 | ## Derive some kind of ordering from fastest to slowest (for choosing our
639 | ## color palette):
640 | library(gnm)
641 | library(broom)
642 | library(stringr)
643 | 
644 | # order the factors from high to low by ad-hoc regression model...
645 | m <- gnm(data = benchmarks,
646 |          total.elapsed ~ Mult(size, method, encoder))
647 | 
648 | getEncoder <- function(term) {
649 |   match <- str_match(term, "\\.(size|method|encoder)(\\w*)")
650 |   match <- match[,-1]
651 |   dimnames(match) <- list(NULL, list("param", "value"))
652 |   as_data_frame(match)
653 | }
654 | 
655 | coefs <- (m %>% tidy %>% bind_cols(., getEncoder(.$term)))
656 | 
657 | encoder_order <- (coefs
658 |   %>% filter(param == "encoder")
659 |   %>% arrange(desc(abs(estimate)))
660 |   %>% .$value
661 | )
662 | 
663 | method_order <- (coefs
664 |   %>% filter(param == "method")
665 |   %>% arrange(desc(abs(estimate)))
666 |   %>% .$value
667 | )
668 | ```
669 | 
670 | The most importent scenarios are conversion in memory, writing to
671 | file, writing to another process on the same host, and writing to
672 | remote host.
673 | 
674 | ```{R timings, fig_width=6, fig_height=6, fig.asp = 1}
675 | library(directlabels)
676 | timing.plot <- ( benchmarks
677 |   %>% subset(method %in% c("convert", "socket", "remote", "file"))
678 |   %>% mutate(method = c(convert="Conversion in memory", socket="TCP (same host)",
679 |                         remote="TCP (over LAN)", file = "File I/O")[method])
680 |   %>% mutate(encoder = factor(encoder, levels=encoder_order))
681 |   %>% { (ggplot(.)
682 |     + aes(y = total.elapsed, x = size, color = encoder, group = encoder)
683 |     + facet_wrap(~method)
684 |     + geom_point()
685 |     + geom_line()
686 |     + theme(aspect.ratio=1)
687 |     + scale_x_continuous(name = "Fraction of nycflights13 transmitted")
688 |     + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)")
689 |     + labs(title="Elapsed time to encode, write, read, decode, by package")
690 |     + expand_limits(x=1.5, y=30)
691 |     + geom_dl(aes(label=encoder),
692 |               debug=FALSE,
693 |               method = list(
694 |                 "last.points",
695 |                 rot=45,
696 |                 "bumpup",
697 |                 dl.trans(x=x+0.2, y=y+0.2)))
698 |     + guides(color=FALSE)
699 |   )})
700 | timing.plot
701 | ```
702 | 
703 | ## Comparison of Encoder Data Usage
704 | 
705 | ```{R datasize}
706 | # We didn't have time to test each encoder with the full
707 | # dataset, so we'll extrapolate from the largest data set tested
708 | 
709 | needed <- (benchmarks
710 |   %>% filter(!is.na(bytes))
711 |   %>% select(., encoder, method, size, bytes)
712 |   %>% group_by(encoder)
713 |   %>% filter(row_number() == 1)
714 |   %>% arrange(desc(size), .by_group=TRUE)
715 |   %>% slice(1)
716 |   %>% select(encoder, method)
717 | )
718 | 
719 | model <- ( needed
720 |            %>% inner_join(benchmarks)
721 |            %>% select(bytes, encoder, size)
722 |            %>% lm(formula = bytes ~ encoder * size))
723 | 
724 | predicted <- (needed
725 |   %>% mutate(size = 1)
726 |   %>% augment(model, newdata=.)
727 |   %>% rename(bytes = .fitted)
728 | )
729 | 
730 | datasize.plot <- (predicted
731 |   %>% mutate(encoder = factor(encoder, levels=encoder_order))
732 |   %>% ggplot
733 |   %>% +aes(x = reorder(encoder, bytes), y=bytes/2^20, fill=encoder)
734 |   %>% +geom_col()
735 |   %>% +labs(y = "MiB", x = NULL,
736 |             title = "Space used to encode nycflights13 (shorter is better)")
737 |   %>% +geom_dl(aes(label=encoder, group=encoder),
738 |                method=c(dl.trans(y=y+0.1), "top.bumpup"))
739 |   %>% +guides(fill = FALSE)
740 |   %>% +theme(axis.title.x=element_blank(),
741 |              axis.text.x=element_blank(),
742 |              axis.ticks.x=element_blank()
743 |              )
744 | )
745 | datasize.plot
746 | ```
747 | 
748 | The difference in size between different JSON and msgpack
749 | implementations could use some investigation. It may down to
750 | whitespace, sending data row-wise vs. col-wise, differences in the
751 | mapping between R and data format types, or bugs.
752 | 
753 | ```{R}
754 | save(benchmarks, file="../inst/benchmarks.RData")
755 | ```
756 | 


--------------------------------------------------------------------------------
/man/msgConnection.Rd:
--------------------------------------------------------------------------------
  1 | % Generated by roxygen2: do not edit by hand
  2 | % Please edit documentation in R/conn.R
  3 | \name{msgConnection}
  4 | \alias{msgConnection}
  5 | \alias{close.msgConnection}
  6 | \alias{partial}
  7 | \alias{partial.msgConnection}
  8 | \alias{readMsgs}
  9 | \alias{status}
 10 | \alias{status.msgConnection}
 11 | \alias{seek.msgConnection}
 12 | \alias{readMsg}
 13 | \alias{writeMsg}
 14 | \alias{writeMsgs}
 15 | \title{Read and write msgpack formatted messages over R connections.}
 16 | \usage{
 17 | msgConnection(con, read_size = 2^16, max_size = NA, ...)
 18 | 
 19 | \method{close}{msgConnection}(con, ...)
 20 | 
 21 | partial(con)
 22 | 
 23 | \method{partial}{msgConnection}(con)
 24 | 
 25 | readMsgs(con, n = NA, ...)
 26 | 
 27 | status(con)
 28 | 
 29 | \method{status}{msgConnection}(con)
 30 | 
 31 | \method{seek}{msgConnection}(con, rw = summary(con)$mode, ...)
 32 | 
 33 | readMsg(con, ...)
 34 | 
 35 | writeMsg(obj, con, ...)
 36 | 
 37 | writeMsgs(objs, con, ...)
 38 | }
 39 | \arguments{
 40 | \item{con}{A \link{connection} object open in binary mode.}
 41 | 
 42 | \item{read_size}{How many bytes to read at a time.}
 43 | 
 44 | \item{max_size}{The largest partial message to store, in
 45 | bytes. \code{NA} means do not enforce a limit.}
 46 | 
 47 | \item{...}{Unpacking options (see \link{unpackMsg}).}
 48 | 
 49 | \item{n}{The maximum number of messages to read. A value of NA
 50 | means to parse all available messages until end of input.}
 51 | 
 52 | \item{rw}{See \code{\link[=seek]{seek()}}.}
 53 | 
 54 | \item{obj}{An R object.}
 55 | 
 56 | \item{objs}{A list of R objects.}
 57 | }
 58 | \value{
 59 | \code{msgConnection()} returns an object of class
 60 | \code{msgConnection}.
 61 | 
 62 | \code{partial(con)} returns any data that has been read ahead of
 63 | the last decoded message.
 64 | 
 65 | \code{readMsgs(con, n)} returns a list of up to \code{n} decoded messages.
 66 | 
 67 | \code{status(con)} returns the status of msgpack decoding on the
 68 | connection. A value of \code{"ok"} indicates all requested messages
 69 | were read, \code{"buffer underflow"} for a non-blocking connection
 70 | indicates that only part of a message has been received, and
 71 | \code{"end of input"} means the last available message has been read.
 72 | Other values indicate errors encountered in decoding, which will
 73 | effectively halt reading.
 74 | 
 75 | \code{seek(con)} returns the number of bytes that have been
 76 | successfully read or written, depending on the mode of the
 77 | connection. (Repositioning is not supported.)
 78 | 
 79 | \code{readMsg(con)} returns one decoded message.
 80 | }
 81 | \description{
 82 | A \code{msgConnection} object decodes msgpack messages from an
 83 | underlying R raw connection.
 84 | 
 85 | \code{writeMsg} will work with any R connection in raw mode, but reading
 86 | requires a msgConnection object.
 87 | }
 88 | \details{
 89 | Because msgpack messages have unpredictable length, the decoder
 90 | reads ahead in chunks, then finds the boundaries between messages.
 91 | Therefore when reading over a socket or a fifo it is best to use a
 92 | nonblocking connection, and it will not work to mix readMsg and
 93 | readBin on the same connection.
 94 | 
 95 | If you are reading data from a not completely trusted source you
 96 | should specify options \code{max_size} and \code{max_depth} (see
 97 | \link{unpackOpts}). Without it, some deeply nested or cleverly designed
 98 | messages can cause a stack overflow or out-of-memory error.  With
 99 | these options set, you will get an R exception instead.
100 | }
101 | \examples{
102 | out <- rawConnection(raw(0), open="wb")
103 | apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row
104 | length(rawConnectionValue(out))
105 | inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb"))
106 | close(out)
107 | readMsg(inn)
108 | readMsgs(inn, 3)
109 | length(readMsgs(inn))
110 | close(inn)
111 | }
112 | 


--------------------------------------------------------------------------------
/man/packMsg.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/encode.R
 3 | \name{packMsg}
 4 | \alias{packMsg}
 5 | \alias{packMsgs}
 6 | \alias{packOpts}
 7 | \alias{prepack}
 8 | \alias{prepack.default}
 9 | \alias{prepack.data.frame}
10 | \title{Convert R objects to msgpack format.}
11 | \usage{
12 | packMsg(x, ...)
13 | 
14 | packMsgs(xs, ...)
15 | 
16 | packOpts(compatible = FALSE, as_is = FALSE, use_dict = TRUE,
17 |   max_size = NA, buf_size = 512)
18 | 
19 | prepack(x)
20 | 
21 | \method{prepack}{default}(x)
22 | 
23 | \method{prepack}{data.frame}(x)
24 | }
25 | \arguments{
26 | \item{x}{An R object, which can be null, a vector, list,
27 | environment, raw, or any combinations thereof.}
28 | 
29 | \item{...}{Options passed to \code{\link[=packOpts]{packOpts()}}}
30 | 
31 | \item{xs}{a list of objects to pack.}
32 | 
33 | \item{compatible}{If TRUE, emitted bytes conform to version 1.0 of
34 | msgpack encoding. This means that msgpack strings are used for
35 | raw objects.}
36 | 
37 | \item{as_is}{If TRUE, singletons (R primitive vectors of length 1
38 | having no names attribute) are encoded as msgpack arrays of
39 | length 1. Otherwise singletons are simplified to msgpack scalars.}
40 | 
41 | \item{use_dict}{If TRUE, vectors having a "names" attribute are
42 | encoded as dicts. If false, they are encoded as arrays and the
43 | names are discarded.}
44 | 
45 | \item{max_size}{The largest buffer that will be allocated.}
46 | 
47 | \item{buf_size}{The initial amount of memory, in bytes, to allocate
48 | for packing each message. This will be dynamically grown if a
49 | larger message is passed, so there is little reason to change
50 | this.}
51 | }
52 | \value{
53 | An object of class "raw".
54 | }
55 | \description{
56 | Convert R objects to msgpack format.
57 | 
58 | \code{\link[=packOpts]{packOpts()}} interprets the \code{...} argument of packMsg and
59 | packMsgs. it is not exported.
60 | }
61 | \details{
62 | Strings are re-encoded to UTF-8 if necessary. Real numbers
63 | taking integral values may be emitted as integers to save space.
64 | 
65 | Normally an R vector of length 1 will be unboxed, e.g. packMsg(1)
66 | will make a msgpack integer, but packMsg(c(1,2)) will make a
67 | msgpack list. To prevent this and produce a list of length 1 in the
68 | first case, specify \code{as_is = TRUE}. Objects of class \code{AsIs} or
69 | \code{data.frame} will always be encoded as-is.
70 | 
71 | A hook for pre-processing R objects before packing is supported, by
72 | giving the object an S3 \link{class} and implementing a method
73 | \code{prepack}. For instance, \code{prepack.data.frame(x)} simply adds the
74 | \code{"AsIs"} class to \code{x}.
75 | 
76 | Environment objects are written out with the keys in sorted order,
77 | but named vectors are written in the order which the entries
78 | appear.
79 | 
80 | Object attributes other than \code{names} and \code{class} are ignored.
81 | }
82 | \examples{
83 | packMsg( list(compact=TRUE, schema=0) )
84 | x <- packMsgs(list("one", "two", "three"))
85 | unpackMsgs(x, 2)
86 | }
87 | 


--------------------------------------------------------------------------------
/man/unpackMsg.Rd:
--------------------------------------------------------------------------------
 1 | % Generated by roxygen2: do not edit by hand
 2 | % Please edit documentation in R/decode.R
 3 | \name{unpackMsg}
 4 | \alias{unpackMsg}
 5 | \alias{unpackMsgs}
 6 | \alias{unpackOpts}
 7 | \title{Decode msgpack messages.}
 8 | \usage{
 9 | unpackMsg(x, ...)
10 | 
11 | unpackMsgs(x, n = NA, reader = NULL, ...)
12 | 
13 | unpackOpts(parent = NULL, df = TRUE, simplify = TRUE,
14 |   max_size = NA, max_depth = NA, underflow_handler = NULL)
15 | }
16 | \arguments{
17 | \item{x}{A \code{\link[=raw]{raw()}} object, perhaps read from a file or socket.}
18 | 
19 | \item{...}{Options passed to \link{unpackOpts}.}
20 | 
21 | \item{n}{How many messages to read. An "NA" here means to read as
22 | much as possible.}
23 | 
24 | \item{reader}{For implementing connections; a function that takes
25 | no arguments and returns a raw containing more data.}
26 | 
27 | \item{parent}{When an environment is given, (such as \code{\link[=emptyenv]{emptyenv()}}),
28 | unpack msgpack dicts into environment objects, with the given
29 | value as parent. This option overrides \code{use_df=TRUE}. Otherwise,
30 | unpack dicts into named vectors / lists.}
31 | 
32 | \item{df}{When \code{TRUE}, convert msgpack dicts, whose elements are
33 | all arrays having the same length, into \code{\link[=data.frame]{data.frame()}}s.}
34 | 
35 | \item{simplify}{If \code{TRUE}, simplify msgpack lists into primitive
36 | vectors.}
37 | 
38 | \item{max_size}{The maximum length of message to decode.}
39 | 
40 | \item{max_depth}{The maximum degree of nesting to support.}
41 | 
42 | \item{underflow_handler}{Used internally.}
43 | }
44 | \value{
45 | \code{unpackMsg(x)} returns one decoded message (which might be
46 | shorter than the input raw), or throws an error.
47 | 
48 | \code{unpackMsgs(r, n)} returns a list \code{X} with four elements:
49 | \itemize{
50 | \item \code{X$msgs} is a list of the messages unpacked.
51 | \item \code{X$remaining} is data remaining to be parsed.
52 | \item \code{X$status} is a status message, typically "ok", "end of input",
53 | or "buffer underflow".
54 | \item \code{x$bytes_read} the number of bytes consumed.
55 | }
56 | }
57 | \description{
58 | \code{unpackMsg} converts a raw array containing one message in msgpack
59 | format into the corresponding R data structure.
60 | }
61 | \details{
62 | The msgpack format does not have typed arrays, so all msgpack
63 | arrays are effectively lists from the R perspective. However, if an
64 | array containing compatibly typed elements is read, \code{unpack} will
65 | return a logical, integer, real or string vector as
66 | appropriate. This behavior is disabled with \code{simplify=FALSE}.  The
67 | coercion used is more conservative than R's coercion: Integer
68 | values may be converted to real, but boolean values will not be
69 | cast to numeric, nor any types to string. If conversion from a
70 | large integer to real loses precision, a warning is printed.
71 | 
72 | Msgpack also does not distinguish between \code{NA} and \code{NULL}. All nils
73 | will be decoded as NA.
74 | 
75 | Strings are assumed to be UTF-8 encoded. If a msgpack string does
76 | not appear to be valid UTF-8, a warning is printed and a raw object
77 | is produced instead.
78 | 
79 | Msgpack allows any type to be the key of a dict, but R only
80 | supports strings. If a non-string appears as key in a msgpack dict,
81 | it will be converted to string with \code{\link[=deparse]{deparse()}}.
82 | 
83 | Extension types will be decoded as raw objects with a class like
84 | \code{"ext120"} and a warning.
85 | }
86 | \examples{
87 | msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3,
88 |                 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00))
89 | unpackMsg(msg)
90 | x <- packMsgs(list("one", "two", "three"))
91 | unpackMsgs(x, 2)
92 | }
93 | 


--------------------------------------------------------------------------------
/src/cwpack.c:
--------------------------------------------------------------------------------
  1 | /*      CWPack - cwpack.c   */
  2 | /*
  3 |  The MIT License (MIT)
  4 |  
  5 |  Copyright (c) 2017 Claes Wihlborg
  6 |  
  7 |  Permission is hereby granted, free of charge, to any person obtaining a copy of this
  8 |  software and associated documentation files (the "Software"), to deal in the Software
  9 |  without restriction, including without limitation the rights to use, copy, modify,
 10 |  merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
 11 |  persons to whom the Software is furnished to do so, subject to the following conditions:
 12 |  
 13 |  The above copyright notice and this permission notice shall be included in all copies or
 14 |  substantial portions of the Software.
 15 |  
 16 |  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 17 |  BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 18 |  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 19 |  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | 
 23 | #include <string.h>
 24 | 
 25 | #include "cwpack.h"
 26 | #include "cwpack_defines.h"
 27 | 
 28 | 
 29 | 
 30 | /*************************   C   S Y S T E M   L I B R A R Y   ****************/
 31 | 
 32 | #ifdef FORCE_NO_LIBRARY
 33 | 
 34 | static void	*memcpy(void *dst, const void *src, size_t n)
 35 | {
 36 |     unsigned int i;
 37 |     uint8_t *d=(uint8_t*)dst, *s=(uint8_t*)src;
 38 |     for (i=0; i<n; i++)
 39 |     {
 40 |         *d++ = *s++;
 41 |     }
 42 |     return dst;
 43 | }
 44 | 
 45 | #endif
 46 | 
 47 | 
 48 | 
 49 | /*************************   B Y T E   O R D E R   ****************************/
 50 | 
 51 | 
 52 | static int test_byte_order(void)
 53 | {
 54 | #ifdef COMPILE_FOR_BIG_ENDIAN
 55 |     const char *endianness = "1234";
 56 |     if (*(uint32_t*)endianness != 0x31323334UL)
 57 |         return CWP_RC_WRONG_BYTE_ORDER;
 58 | #else
 59 | 
 60 | #ifdef COMPILE_FOR_LITTLE_ENDIAN
 61 |     const char *endianness = "1234";
 62 |     if (*(uint32_t*)endianness != 0x34333231UL)
 63 |         return CWP_RC_WRONG_BYTE_ORDER;
 64 | #endif
 65 | #endif
 66 |     return CWP_RC_OK;
 67 | }
 68 | 
 69 | 
 70 | /*******************************   P A C K   **********************************/
 71 | 
 72 | 
 73 | 
 74 | int cw_pack_context_init (cw_pack_context* pack_context, void* data, unsigned long length, pack_overflow_handler hpo)
 75 | {
 76 |     pack_context->start = pack_context->current = (uint8_t*)data;
 77 |     pack_context->end = pack_context->start + length;
 78 |     pack_context->be_compatible = false;
 79 |     pack_context->err_no = 0;
 80 |     pack_context->handle_pack_overflow = hpo;
 81 |     pack_context->return_code = test_byte_order();
 82 |     return pack_context->return_code;
 83 | }
 84 | 
 85 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible)
 86 | {
 87 |     pack_context->be_compatible = be_compatible;
 88 | }
 89 | 
 90 | 
 91 | 
 92 | /*  Packing routines  --------------------------------------------------------------------------------  */
 93 | 
 94 | 
 95 | void cw_pack_unsigned(cw_pack_context* pack_context, uint64_t i)
 96 | {
 97 |     if (pack_context->return_code)
 98 |         return;
 99 |     
100 |     if (i < 128)
101 |         tryMove0(i);
102 | 
103 |     if (i < 256)
104 |         tryMove1(0xcc, i);
105 | 
106 |     if (i < 0x10000L)
107 |     {
108 |         tryMove2(0xcd, i);
109 |     }
110 |     if (i < 0x100000000LL)
111 |         tryMove4(0xce, i);
112 | 
113 |     tryMove8(0xcf,i);
114 | }
115 | 
116 | 
117 | void cw_pack_signed(cw_pack_context* pack_context, int64_t i)
118 | {
119 |     if (pack_context->return_code)
120 |         return;
121 |     
122 |     if (i >127)
123 |     {
124 |         if (i < 256)
125 |             tryMove1(0xcc, i);
126 |         
127 |         if (i < 0x10000L)
128 |             tryMove2(0xcd, i);
129 |         
130 |         if (i < 0x100000000LL)
131 |             tryMove4(0xce, i);
132 |         
133 |         tryMove8(0xcf,i);
134 |     }
135 |     
136 |     if (i >= -32)
137 |         tryMove0(i);
138 | 
139 |     if (i >= -128)
140 |         tryMove1(0xd0, i);
141 |     
142 |     if (i >= -32768)
143 |         tryMove2(0xd1,i);
144 | 
145 |     if (i >= (int64_t)0xffffffff80000000LL)
146 |         tryMove4(0xd2,i);
147 | 
148 |     tryMove8(0xd3,i);
149 | }
150 | 
151 | 
152 | void cw_pack_float(cw_pack_context* pack_context, float f)
153 | {
154 |     if (pack_context->return_code)
155 |         return;
156 |     
157 |     uint32_t tmp = *((uint32_t*)&f);
158 |     tryMove4(0xca,tmp);
159 | }
160 | 
161 | 
162 | void cw_pack_double(cw_pack_context* pack_context, double d)
163 | {
164 |     if (pack_context->return_code)
165 |         return;
166 |     
167 |     uint64_t tmp = *((uint64_t*)&d);
168 |     tryMove8(0xcb,tmp);
169 | }
170 | 
171 | 
172 | void cw_pack_real (cw_pack_context* pack_context, double d)
173 | {
174 |     float f = (float)d;
175 |     double df = f;
176 |     if (df == d)
177 |         cw_pack_float (pack_context, f);
178 |     else
179 |         cw_pack_double (pack_context, d);
180 | }
181 | 
182 | 
183 | void cw_pack_nil(cw_pack_context* pack_context)
184 | {
185 |     if (pack_context->return_code)
186 |         return;
187 |     
188 |     tryMove0(0xc0);
189 | }
190 | 
191 | 
192 | void cw_pack_true (cw_pack_context* pack_context)
193 | {
194 |     if (pack_context->return_code)
195 |         return;
196 |     
197 |     tryMove0(0xc3);
198 | }
199 | 
200 | 
201 | void cw_pack_false (cw_pack_context* pack_context)
202 | {
203 |     if (pack_context->return_code)
204 |         return;
205 |     
206 |     tryMove0(0xc2);
207 | }
208 | 
209 | 
210 | void cw_pack_boolean(cw_pack_context* pack_context, bool b)
211 | {
212 |     if (pack_context->return_code)
213 |         return;
214 |     
215 |     tryMove0(b? 0xc3: 0xc2);
216 | }
217 | 
218 | 
219 | void cw_pack_array_size(cw_pack_context* pack_context, uint32_t n)
220 | {
221 |     if (pack_context->return_code)
222 |         return;
223 |     
224 |     if (n < 16)
225 |         tryMove0(0x90 | n);
226 | 
227 |     if (n < 65536)
228 |         tryMove2(0xdc, n);
229 | 
230 |     tryMove4(0xdd, n);
231 | }
232 | 
233 | 
234 | void cw_pack_map_size(cw_pack_context* pack_context, uint32_t n)
235 | {
236 |     if (pack_context->return_code)
237 |         return;
238 |     
239 |     if (n < 16)
240 |         tryMove0(0x80 | n);
241 | 
242 |     if (n < 65536)
243 |         tryMove2(0xde, n);
244 | 
245 |     tryMove4(0xdf, n);
246 | }
247 | 
248 | 
249 | void cw_pack_str(cw_pack_context* pack_context, const char* v, uint32_t l)
250 | {
251 |     if (pack_context->return_code)
252 |         return;
253 |     
254 |     uint8_t *p;
255 |     
256 |     if (l < 32)             // Fixstr
257 |     {
258 |         cw_pack_reserve_space(l+1);
259 |         *p = (uint8_t)(0xa0 + l);
260 |         memcpy(p+1,v,l);
261 |         return;
262 |     }
263 |     if (l < 256 && !pack_context->be_compatible)       // Str 8
264 |     {
265 |         cw_pack_reserve_space(l+2);
266 |         *p++ = (uint8_t)(0xd9);
267 |         *p = (uint8_t)(l);
268 |         memcpy(p+1,v,l);
269 |         return;
270 |     }
271 |     if (l < 65536)     // Str 16
272 |     {
273 |         cw_pack_reserve_space(l+3)
274 |         *p++ = (uint8_t)0xda;
275 |         cw_store16(l);
276 |         memcpy(p+2,v,l);
277 |         return;
278 |     }
279 |     // Str 32
280 |     cw_pack_reserve_space(l+5)
281 |     *p++ = (uint8_t)0xdb;
282 |     cw_store32(l);
283 |     memcpy(p+4,v,l);
284 |     return;
285 | }
286 | 
287 | 
288 | void cw_pack_bin(cw_pack_context* pack_context, const void* v, uint32_t l)
289 | {
290 |     if (pack_context->return_code)
291 |         return;
292 |     
293 |     if (pack_context->be_compatible)
294 |     {
295 |         cw_pack_str( pack_context, v, l);
296 |         return;
297 |     }
298 |     
299 |     uint8_t *p;
300 |     
301 |     if (l < 256)            // Bin 8
302 |     {
303 |         cw_pack_reserve_space(l+2);
304 |         *p++ = (uint8_t)(0xc4);
305 |         *p = (uint8_t)(l);
306 |         memcpy(p+1,v,l);
307 |         return;
308 |     }
309 |     if (l < 65536)     // Bin 16
310 |     {
311 |         cw_pack_reserve_space(l+3)
312 |         *p++ = (uint8_t)0xc5;
313 |         cw_store16(l);
314 |         memcpy(p+2,v,l);
315 |         return;
316 |     }
317 |     // Bin 32
318 |     cw_pack_reserve_space(l+5)
319 |     *p++ = (uint8_t)0xc6;
320 |     cw_store32(l);
321 |     memcpy(p+4,v,l);
322 |     return;
323 | }
324 | 
325 | 
326 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l)
327 | {
328 |     if (pack_context->return_code)
329 |         return;
330 |     
331 |     if (pack_context->be_compatible)
332 |         PACK_ERROR(CWP_RC_ILLEGAL_CALL);
333 |     
334 |     uint8_t *p;
335 |     
336 |     switch (l)
337 |     {
338 |         case 1:                                         // Fixext 1
339 |             cw_pack_reserve_space(3);
340 |             *p++ = (uint8_t)0xd4;
341 |             *p++ = (uint8_t)type;
342 |             *p++ = *(uint8_t*)v;
343 |             return;
344 |         case 2:                                         // Fixext 2
345 |             cw_pack_reserve_space(4);
346 |             *p++ = (uint8_t)0xd5;
347 |             break;
348 |         case 4:                                         // Fixext 4
349 |             cw_pack_reserve_space(6);
350 |             *p++ = (uint8_t)0xd6;
351 |             break;
352 |         case 8:                                         // Fixext 8
353 |             cw_pack_reserve_space(10);
354 |             *p++ = (uint8_t)0xd7;
355 |             break;
356 |         case 16:                                        // Fixext16
357 |             cw_pack_reserve_space(18);
358 |             *p++ = (uint8_t)0xd8;
359 |             break;
360 |         default:
361 |             if (l < 256)                                // Ext 8
362 |             {
363 |                 cw_pack_reserve_space(l+3);
364 |                 *p++ = (uint8_t)0xc7;
365 |                 *p++ = (uint8_t)(l);
366 |             }
367 |             else if (l < 65536)                         // Ext 16
368 |             {
369 |                 cw_pack_reserve_space(l+4)
370 |                 *p++ = (uint8_t)0xc8;
371 |                 cw_store16(l);
372 |                 p += 2;
373 |             }
374 |             else                                        // Ext 32
375 |             {
376 |                 cw_pack_reserve_space(l+6)
377 |                 *p++ = (uint8_t)0xc9;
378 |                 cw_store32(l);
379 |                 p += 4;
380 |             }
381 |     }
382 |     *p++ = (uint8_t)type;
383 |     memcpy(p,v,l);
384 | }
385 | 
386 | 
387 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l)
388 | {
389 |     uint8_t *p;
390 |     cw_pack_reserve_space(l);
391 |     memcpy(p,v,l);
392 | }
393 | 
394 | /*******************************   U N P A C K   **********************************/
395 | 
396 | 
397 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu)
398 | {
399 |     unpack_context->start = unpack_context->current = (uint8_t*)data;
400 |     unpack_context->end = unpack_context->start + length;
401 |     unpack_context->return_code = test_byte_order();
402 |     unpack_context->err_no = 0;
403 |     unpack_context->handle_unpack_underflow = huu;
404 |     return unpack_context->return_code;
405 | }
406 | 
407 | 
408 | /*  Unpacking routines  ----------------------------------------------------------  */
409 | 
410 | 
411 | 
412 | void cw_unpack_next (cw_unpack_context* unpack_context)
413 | {
414 |     if (unpack_context->return_code)
415 |         return;
416 | 
417 |     uint64_t    tmpu64;
418 |     uint32_t    tmpu32;
419 |     uint16_t    tmpu16;
420 |     uint8_t*    p;
421 |     
422 | #define buffer_end_return_code  CWP_RC_END_OF_INPUT;
423 |     cw_unpack_assert_space(1);
424 |     uint8_t c = *p;
425 | #undef buffer_end_return_code
426 | #define buffer_end_return_code  CWP_RC_BUFFER_UNDERFLOW;
427 |     switch (c)
428 |     {
429 |         case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
430 |         case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f:
431 |         case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
432 |         case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f:
433 |         case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27:
434 |         case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f:
435 |         case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
436 |         case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f:
437 |         case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
438 |         case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f:
439 |         case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
440 |         case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f:
441 |         case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
442 |         case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f:
443 |         case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
444 |         case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f:
445 |                     getDDItem(CWP_ITEM_POSITIVE_INTEGER, i64, c);       return;  // positive fixnum
446 |         case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
447 |         case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f:
448 |                     getDDItem(CWP_ITEM_MAP, map.size, c & 0x0f);        return;  // fixmap
449 |         case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
450 |         case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f:
451 |                     getDDItem(CWP_ITEM_ARRAY, array.size, c & 0x0f);    return;  // fixarray
452 |         case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7:
453 |         case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf:
454 |         case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7:
455 |         case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf:
456 |                     getDDItem(CWP_ITEM_STR, str.length, c & 0x1f);              // fixstr
457 |                     cw_unpack_assert_blob(str);
458 |         case 0xc0:  unpack_context->item.type = CWP_ITEM_NIL;           return;  // nil
459 |         case 0xc2:  getDDItem(CWP_ITEM_BOOLEAN, boolean, false);        return;  // false
460 |         case 0xc3:  getDDItem(CWP_ITEM_BOOLEAN, boolean, true);         return;  // true
461 |         case 0xc4:  getDDItem1(CWP_ITEM_BIN, bin.length, uint8_t);              // bin 8
462 |                     cw_unpack_assert_blob(bin);
463 |         case 0xc5:  getDDItem2(CWP_ITEM_BIN, bin.length, uint16_t);             // bin 16
464 |                     cw_unpack_assert_blob(bin);
465 |         case 0xc6:  getDDItem4(CWP_ITEM_BIN, bin.length, uint32_t);             // bin 32
466 |                     cw_unpack_assert_blob(bin);
467 |         case 0xc7:  getDDItem1(CWP_ITEM_EXT, ext.length, uint8_t);              // ext 8
468 |                     cw_unpack_assert_space(1);
469 |                     unpack_context->item.type = *(int8_t*)p;
470 |                     cw_unpack_assert_blob(ext);
471 |         case 0xc8:  getDDItem2(CWP_ITEM_EXT, ext.length, uint16_t);             // ext 16
472 |                     cw_unpack_assert_space(1);
473 |                     unpack_context->item.type = *(int8_t*)p;
474 |                     cw_unpack_assert_blob(ext);
475 |         case 0xc9:  getDDItem4(CWP_ITEM_EXT, ext.length, uint32_t);             // ext 32
476 |                     cw_unpack_assert_space(1);
477 |                     unpack_context->item.type = *(int8_t*)p;
478 |                     cw_unpack_assert_blob(ext);
479 |         case 0xca:  unpack_context->item.type = CWP_ITEM_FLOAT;                 // float
480 |                     cw_unpack_assert_space(4);
481 |                     cw_load32(p);
482 |                     unpack_context->item.as.real = *(float*)&tmpu32;     return;
483 |         case 0xcb:  getDDItem8(CWP_ITEM_DOUBLE);                         return;  // double
484 |         case 0xcc:  getDDItem1(CWP_ITEM_POSITIVE_INTEGER, u64, uint8_t); return;  // unsigned int  8
485 |         case 0xcd:  getDDItem2(CWP_ITEM_POSITIVE_INTEGER, u64, uint16_t); return; // unsigned int 16
486 |         case 0xce:  getDDItem4(CWP_ITEM_POSITIVE_INTEGER, u64, uint32_t); return; // unsigned int 32
487 |         case 0xcf:  getDDItem8(CWP_ITEM_POSITIVE_INTEGER);               return;  // unsigned int 64
488 |         case 0xd0:  getDDItem1(CWP_ITEM_NEGATIVE_INTEGER, i64, int8_t);          // signed int  8
489 |                     if (unpack_context->item.as.i64 >= 0)
490 |                         unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
491 |                     return;
492 |         case 0xd1:  getDDItem2(CWP_ITEM_NEGATIVE_INTEGER, i64, int16_t);        // signed int 16
493 |                     if (unpack_context->item.as.i64 >= 0)
494 |                         unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
495 |                     return;
496 |         case 0xd2:  getDDItem4(CWP_ITEM_NEGATIVE_INTEGER, i64, int32_t);        // signed int 32
497 |                     if (unpack_context->item.as.i64 >= 0)
498 |                         unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
499 |                     return;
500 |         case 0xd3:  getDDItem8(CWP_ITEM_NEGATIVE_INTEGER);						// signed int 64
501 |                     if (unpack_context->item.as.i64 >= 0)
502 |                         unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
503 |                     return;
504 |         case 0xd4:  getDDItemFix(1);                                            // fixext 1
505 |         case 0xd5:  getDDItemFix(2);                                            // fixext 2
506 |         case 0xd6:  getDDItemFix(4);                                            // fixext 4
507 |         case 0xd7:  getDDItemFix(8);                                            // fixext 8
508 |         case 0xd8:  getDDItemFix(16);                                           // fixext 16
509 |         case 0xd9:  getDDItem1(CWP_ITEM_STR, str.length, uint8_t);              // str 8
510 |                     cw_unpack_assert_blob(str);
511 |         case 0xda:  getDDItem2(CWP_ITEM_STR, str.length, uint16_t);             // str 16
512 |                     cw_unpack_assert_blob(str);
513 |         case 0xdb:  getDDItem4(CWP_ITEM_STR, str.length, uint32_t);             // str 32
514 |                     cw_unpack_assert_blob(str);
515 |         case 0xdc:  getDDItem2(CWP_ITEM_ARRAY, array.size, uint16_t);   return;  // array 16
516 |         case 0xdd:  getDDItem4(CWP_ITEM_ARRAY, array.size, uint32_t);   return;  // array 32
517 |         case 0xde:  getDDItem2(CWP_ITEM_MAP, map.size, uint16_t);       return;  // map 16
518 |         case 0xdf:  getDDItem4(CWP_ITEM_MAP, map.size, uint32_t);       return;  // map 32
519 |         case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7:
520 |         case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef:
521 |         case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7:
522 |         case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff:
523 |                     getDDItem(CWP_ITEM_NEGATIVE_INTEGER, i64, (int8_t)c); return;    // negative fixnum
524 |         default:
525 |                     UNPACK_ERROR(CWP_RC_MALFORMED_INPUT)
526 |     }
527 |     
528 |     return;
529 | }
530 | 
531 | #define cw_skip_bytes(n)                                \
532 |     cw_unpack_assert_space((n));                          \
533 |     break;
534 | 
535 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count)
536 | {
537 |     if (unpack_context->return_code)
538 |         return;
539 |     
540 |     uint32_t    tmpu32;
541 |     uint16_t    tmpu16;
542 |     uint8_t*    p;
543 |     
544 |     while (item_count-- > 0)
545 |     {
546 | #undef buffer_end_return_code
547 | #define buffer_end_return_code  CWP_RC_END_OF_INPUT;
548 |         cw_unpack_assert_space(1);
549 |         uint8_t c = *p;
550 | 
551 | #undef buffer_end_return_code
552 | #define buffer_end_return_code  CWP_RC_BUFFER_UNDERFLOW;
553 |         switch (c)
554 |         {
555 |             case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
556 |             case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f:
557 |             case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
558 |             case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f:
559 |             case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27:
560 |             case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f:
561 |             case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
562 |             case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f:
563 |             case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
564 |             case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f:
565 |             case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
566 |             case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f:
567 |             case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
568 |             case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f:
569 |             case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
570 |             case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f:
571 |                                                                 // unsigned fixint
572 |             case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7:
573 |             case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef:
574 |             case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7:
575 |             case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff:
576 |                                                                 // signed fixint
577 |             case 0xc0:                                          // nil
578 |             case 0xc2:                                          // false
579 |             case 0xc3:  break;                                  // true
580 |             case 0xcc:                                          // unsigned int  8
581 |             case 0xd0:	cw_skip_bytes(1);                       // signed int  8
582 |             case 0xcd:                                          // unsigned int 16
583 |             case 0xd1:                                          // signed int 16
584 |             case 0xd4:  cw_skip_bytes(2);                       // fixext 1
585 |             case 0xd5:  cw_skip_bytes(3);                       // fixext 2
586 |             case 0xca:                                          // float
587 |             case 0xce:                                          // unsigned int 32
588 |             case 0xd2:  cw_skip_bytes(4);                       // signed int 32
589 |             case 0xd6:  cw_skip_bytes(5);                       // fixext 4
590 |             case 0xcb:                                          // double
591 |             case 0xcf:                                          // unsigned int 64
592 |             case 0xd3:  cw_skip_bytes(8);                       // signed int 64
593 |             case 0xd7:  cw_skip_bytes(9);                       // fixext 8
594 |             case 0xd8:  cw_skip_bytes(17);                      // fixext 16
595 |             case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7:
596 |             case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf:
597 |             case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7:
598 |             case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf:
599 |                 cw_skip_bytes(c & 0x1f);                        // fixstr
600 |             case 0xd9:                                          // str 8
601 |             case 0xc4:                                          // bin 8
602 |                 cw_unpack_assert_space(1);
603 |                 tmpu32 = *p;
604 |                 cw_skip_bytes(tmpu32);
605 |                 
606 |             case 0xda:                                          // str 16
607 |             case 0xc5:                                          // bin 16
608 |                 cw_unpack_assert_space(2);
609 |                 cw_load16(p);
610 |                 cw_skip_bytes(tmpu16);
611 |                 
612 |             case 0xdb:                                          // str 32
613 |             case 0xc6:                                          // bin 32
614 |                 cw_unpack_assert_space(4);
615 |                 cw_load32(p);
616 |                 cw_skip_bytes(tmpu32);
617 |                 
618 |             case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
619 |             case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f:
620 |                 item_count += 2*(c & 15);                       // FixMap
621 |                 break;
622 | 
623 |             case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
624 |             case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f:
625 |                 item_count += c & 15;                           // FixArray
626 |                 break;
627 |                 
628 |             case 0xdc:                                          // array 16
629 |                 cw_unpack_assert_space(2);
630 |                 cw_load16(p);
631 |                 item_count += tmpu16;
632 |                 break;
633 |                 
634 |             case 0xde:                                          // map 16
635 |                 cw_unpack_assert_space(2);
636 |                 cw_load16(p);
637 |                 item_count += 2*tmpu16;
638 |                 break;
639 |                 
640 |             case 0xdd:                                          // array 32
641 |                 cw_unpack_assert_space(4);
642 |                 cw_load32(p);
643 |                 item_count += tmpu32;
644 |                 break;
645 |                 
646 |             case 0xdf:                                          // map 32
647 |                 cw_unpack_assert_space(4);
648 |                 cw_load32(p);
649 |                 item_count += 2*tmpu32;
650 |                 break;
651 |                 
652 |             case 0xc7:                                          // ext 8
653 |                 cw_unpack_assert_space(1);
654 |                 tmpu32 = *p;
655 |                 cw_skip_bytes(tmpu32 +1);
656 |                 
657 |             case 0xc8:                                          // ext 16
658 |                 cw_unpack_assert_space(2);
659 |                 cw_load16(p);
660 |                 cw_skip_bytes(tmpu16 +1);
661 |                 
662 |             case 0xc9:                                          // ext 32
663 |                 cw_unpack_assert_space(4);
664 |                 cw_load32(p);
665 |                 cw_skip_bytes(tmpu32 +1);
666 |                 
667 |             default:                                            // illegal
668 |                 UNPACK_ERROR(CWP_RC_MALFORMED_INPUT)
669 |         }
670 |     }
671 |     return;
672 | }
673 | 
674 | /* end cwpack.c */
675 | 


--------------------------------------------------------------------------------
/src/cwpack.h:
--------------------------------------------------------------------------------
  1 | /*      CWPack - cwpack.h   */
  2 | /*
  3 |  The MIT License (MIT)
  4 |  
  5 |  Copyright (c) 2017 Claes Wihlborg
  6 |  
  7 |  Permission is hereby granted, free of charge, to any person obtaining a copy of this
  8 |  software and associated documentation files (the "Software"), to deal in the Software
  9 |  without restriction, including without limitation the rights to use, copy, modify,
 10 |  merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
 11 |  persons to whom the Software is furnished to do so, subject to the following conditions:
 12 |  
 13 |  The above copyright notice and this permission notice shall be included in all copies or
 14 |  substantial portions of the Software.
 15 |  
 16 |  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 17 |  BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 18 |  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 19 |  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | 
 23 | #ifndef CWPack_H__
 24 | #define CWPack_H__
 25 | 
 26 | 
 27 | #include <stdint.h>
 28 | #include <stdbool.h>
 29 | #include "opts.h"
 30 | 
 31 | 
 32 | 
 33 | /*******************************   Return Codes   *****************************/
 34 | 
 35 | #define CWP_RC_OK                     0
 36 | #define CWP_RC_END_OF_INPUT          -1
 37 | #define CWP_RC_BUFFER_OVERFLOW       -2
 38 | #define CWP_RC_BUFFER_UNDERFLOW      -3
 39 | #define CWP_RC_MALFORMED_INPUT       -4
 40 | #define CWP_RC_WRONG_BYTE_ORDER      -5
 41 | #define CWP_RC_ERROR_IN_HANDLER      -6
 42 | #define CWP_RC_ILLEGAL_CALL          -7
 43 | #define CWP_RC_MALLOC_ERROR          -8
 44 | #define CWP_RC_STOPPED               -9
 45 | 
 46 | 
 47 | 
 48 | /*******************************   P A C K   **********************************/
 49 | 
 50 | 
 51 | struct cw_pack_context;
 52 | 
 53 | typedef int (*pack_overflow_handler)(struct cw_pack_context*, unsigned long);
 54 | 
 55 | typedef struct cw_pack_context {
 56 |     uint8_t*                start;
 57 |     uint8_t*                current;
 58 |     uint8_t*                end;
 59 |     bool                    be_compatible;
 60 |     int                     return_code;
 61 |     int                     err_no;          /* handlers can save error here */
 62 |     pack_overflow_handler   handle_pack_overflow;
 63 |     pack_opts               *opts;
 64 | } cw_pack_context;
 65 | 
 66 | 
 67 | int cw_pack_context_init (cw_pack_context* pack_context, void* data, unsigned long length, pack_overflow_handler hpo);
 68 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible);
 69 | 
 70 | void cw_pack_nil (cw_pack_context* pack_context);
 71 | void cw_pack_true (cw_pack_context* pack_context);
 72 | void cw_pack_false (cw_pack_context* pack_context);
 73 | void cw_pack_boolean (cw_pack_context* pack_context, bool b);
 74 | 
 75 | void cw_pack_signed (cw_pack_context* pack_context, int64_t i);
 76 | void cw_pack_unsigned (cw_pack_context* pack_context, uint64_t i);
 77 | 
 78 | void cw_pack_float (cw_pack_context* pack_context, float f);
 79 | void cw_pack_double (cw_pack_context* pack_context, double d);
 80 | void cw_pack_real (cw_pack_context* pack_context, double d);   /* Pack as float if precision isn't destroyed */
 81 | 
 82 | void cw_pack_array_size (cw_pack_context* pack_context, uint32_t n);
 83 | void cw_pack_map_size (cw_pack_context* pack_context, uint32_t n);
 84 | void cw_pack_str (cw_pack_context* pack_context, const char* v, uint32_t l);
 85 | void cw_pack_bin (cw_pack_context* pack_context, const void* v, uint32_t l);
 86 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l);
 87 | 
 88 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l);
 89 | 
 90 | 
 91 | /*****************************   U N P A C K   ********************************/
 92 | 
 93 | 
 94 | typedef enum
 95 | {
 96 |     CWP_ITEM_MIN_RESERVED_EXT       = -128,
 97 |     CWP_ITEM_MAX_RESERVED_EXT       = -1,
 98 |     CWP_ITEM_MIN_USER_EXT           = 0,
 99 |     CWP_ITEM_MAX_USER_EXT           = 127,
100 |     CWP_ITEM_NIL                    = 300,
101 |     CWP_ITEM_BOOLEAN                = 301,
102 |     CWP_ITEM_POSITIVE_INTEGER       = 302,
103 |     CWP_ITEM_NEGATIVE_INTEGER       = 303,
104 |     CWP_ITEM_FLOAT                  = 304,
105 |     CWP_ITEM_DOUBLE                 = 305,
106 |     CWP_ITEM_STR                    = 306,
107 |     CWP_ITEM_BIN                    = 307,
108 |     CWP_ITEM_ARRAY                  = 308,
109 |     CWP_ITEM_MAP                    = 309,
110 |     CWP_ITEM_EXT                    = 310,
111 |     CWP_NOT_AN_ITEM                 = 999,
112 | } cwpack_item_types;
113 | 
114 | 
115 | typedef struct {
116 |     const void*     start;
117 |     uint32_t        length;
118 | } cwpack_blob; 
119 | 
120 | 
121 | typedef struct {
122 |     uint32_t    size;
123 | } cwpack_container;
124 | 
125 | 
126 | typedef struct {
127 |     cwpack_item_types   type;
128 |     union
129 |     {
130 |         bool            boolean;
131 |         uint64_t        u64;
132 |         int64_t         i64;
133 |         float           real;
134 |         double          long_real;
135 |         cwpack_container array;
136 |         cwpack_container map;
137 |         cwpack_blob     str;
138 |         cwpack_blob     bin;
139 |         cwpack_blob     ext;
140 |     } as;
141 | } cwpack_item;
142 | 
143 | struct cw_unpack_context;
144 | 
145 | typedef int (*unpack_underflow_handler)(struct cw_unpack_context*, unsigned long);
146 | 
147 | typedef struct cw_unpack_context {
148 |     cwpack_item                 item;
149 |     uint8_t*                    start;
150 |     uint8_t*                    current;
151 |     uint8_t*                    end;             /* logical end of buffer */
152 |     int                         return_code;
153 |     int                         err_no;          /* handlers can save error here */
154 |     unpack_underflow_handler    handle_unpack_underflow;
155 |     unpack_opts                 *opts;
156 | } cw_unpack_context;
157 | 
158 | 
159 | 
160 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu);
161 | 
162 | void cw_unpack_next (cw_unpack_context* unpack_context);
163 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count);
164 | 
165 | 
166 | #endif  /* CWPack_H__ */
167 | 


--------------------------------------------------------------------------------
/src/cwpack_defines.h:
--------------------------------------------------------------------------------
  1 | /*      CWPack - cwpack_defines.h   */
  2 | /*
  3 |  The MIT License (MIT)
  4 |  
  5 |  Copyright (c) 2017 Claes Wihlborg
  6 |  
  7 |  Permission is hereby granted, free of charge, to any person obtaining a copy of this
  8 |  software and associated documentation files (the "Software"), to deal in the Software
  9 |  without restriction, including without limitation the rights to use, copy, modify,
 10 |  merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
 11 |  persons to whom the Software is furnished to do so, subject to the following conditions:
 12 |  
 13 |  The above copyright notice and this permission notice shall be included in all copies or
 14 |  substantial portions of the Software.
 15 |  
 16 |  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
 17 |  BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 18 |  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
 19 |  DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 20 |  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 21 |  */
 22 | 
 23 | 
 24 | #ifndef cwpack_defines_h
 25 | #define cwpack_defines_h
 26 | 
 27 | 
 28 | 
 29 | /*************************   A L I G N M E N T   ******************************/
 30 | 
 31 | /*
 32 |  * Sometime the processor demands that integer access is to an even memory address.
 33 |  * In that case define FORCE_ALIGNMENT
 34 |  */
 35 | 
 36 | /* #define FORCE_ALIGNMENT */
 37 |  
 38 | 
 39 | /*************************   C   S Y S T E M   L I B R A R Y   ****************/
 40 | 
 41 | /*
 42 |  * The packer uses "memcpy" to move blobs. If you dont want to load C system library
 43 |  * for just that, define FORCE_NO_LIBRARY and CWPack will use an internal "memcpy"
 44 |  */
 45 | 
 46 | /* #define FORCE_NO_LIBRARY */
 47 | 
 48 | 
 49 | 
 50 | /*************************   B Y T E   O R D E R   ****************************/
 51 | 
 52 | /*
 53 |  * The pack/unpack routines are written in three versions: for big endian, for 
 54 |  * little endian and insensitive to byte order. As you can get some speed gain
 55 |  * if the byte order is known, we try that when we can certainly detect it.
 56 |  * Define COMPILE_FOR_BIG_ENDIAN or COMPILE_FOR_LITTLE_ENDIAN if you know.
 57 |  */
 58 | 
 59 | #ifndef FORCE_ALIGNMENT
 60 | #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && defined(__ORDER_BIG_ENDIAN__)
 61 | 
 62 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
 63 | #define COMPILE_FOR_BIG_ENDIAN
 64 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
 65 | #define COMPILE_FOR_LITTLE_ENDIAN
 66 | #endif
 67 | 
 68 | #elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
 69 | 
 70 | #if __BYTE_ORDER == __BIG_ENDIAN
 71 | #define COMPILE_FOR_BIG_ENDIAN
 72 | #elif __BYTE_ORDER == __LITTLE_ENDIAN
 73 | #define COMPILE_FOR_LITTLE_ENDIAN
 74 | #endif
 75 | 
 76 | #elif defined(__BIG_ENDIAN__)
 77 | #define COMPILE_FOR_BIG_ENDIAN
 78 | 
 79 | #elif defined(__LITTLE_ENDIAN__)
 80 | #define COMPILE_FOR_LITTLE_ENDIAN
 81 | 
 82 | #elif defined(__i386__) || defined(__x86_64__)
 83 | #define COMPILE_FOR_LITTLE_ENDIAN
 84 | 
 85 | #endif
 86 | #endif
 87 | 
 88 | //#undef COMPILE_FOR_LITTLE_ENDIAN
 89 | 
 90 | 
 91 | /*******************************   P A C K   **********************************/
 92 | 
 93 | 
 94 | 
 95 | #define PACK_ERROR(error_code)                          \
 96 | {                                                       \
 97 |     pack_context->return_code = error_code;             \
 98 |     return;                                             \
 99 | }
100 | 
101 | 
102 | 
103 | #ifdef COMPILE_FOR_BIG_ENDIAN
104 | 
105 | #define cw_store16(x)  *(uint16_t*)p = *(uint16_t*)&x;
106 | #define cw_store32(x)  *(uint32_t*)p = *(uint32_t*)&x;
107 | #define cw_store64(x)  *(uint64_t*)p = *(uint64_t*)&x;
108 | 
109 | #else    /* Byte order little endian or undetermined */
110 | 
111 | #ifdef COMPILE_FOR_LITTLE_ENDIAN
112 | 
113 | #define cw_store16(d)                                       \
114 |     *(uint16_t*)p = (uint16_t)((((d) >> 8) & 0x0ff) | (d) << 8)
115 | 
116 | #define cw_store32(x)                                       \
117 |     *(uint32_t*)p =                                         \
118 |         ((uint32_t)((((uint32_t)(x)) >> 24) |               \
119 |         (((uint32_t)(x) & 0x00ff0000) >>  8) |              \
120 |         (((uint32_t)(x) & 0x0000ff00) <<  8) |              \
121 |         (((uint32_t)(x)) << 24)));                          \
122 | 
123 | #define cw_store64(x)                                       \
124 |     *(uint64_t*)p =                                         \
125 |         ((uint64_t)(                                        \
126 |         (((((uint64_t)(x)) >> 40) |                         \
127 |         (((uint64_t)(x)) << 24)) & 0x0000ff000000ff00ULL) | \
128 |         (((((uint64_t)(x)) >> 24) |                         \
129 |         (((uint64_t)(x)) << 40)) & 0x00ff000000ff0000ULL) | \
130 |         (((uint64_t)(x) & 0x000000ff00000000ULL) >>  8) |   \
131 |         (((uint64_t)(x) & 0x00000000ff000000ULL) <<  8) |   \
132 |         (((uint64_t)(x)) >> 56) |                           \
133 |         (((uint64_t)(x)) << 56)));                          \
134 | 
135 | #else   /* Byte order undetermined */
136 | 
137 | #define cw_store16(d)           \
138 |     *p = (uint8_t)(d >> 8);     \
139 |     p[1] = (uint8_t)d;
140 | 
141 | #define cw_store32(d)           \
142 |     *p = (uint8_t)(d >> 24);    \
143 |     p[1] = (uint8_t)(d >> 16);  \
144 |     p[2] = (uint8_t)(d >> 8);   \
145 |     p[3] = (uint8_t)d;
146 | 
147 | #define cw_store64(z)           \
148 |     *p = (uint8_t)(z >> 56);    \
149 |     p[1] = (uint8_t)(z >> 48);  \
150 |     p[2] = (uint8_t)(z >> 40);  \
151 |     p[3] = (uint8_t)(z >> 32);  \
152 |     p[4] = (uint8_t)(z >> 24);  \
153 |     p[5] = (uint8_t)(z >> 16);  \
154 |     p[6] = (uint8_t)(z >> 8);   \
155 |     p[7] = (uint8_t)z;
156 | #endif
157 | #endif
158 | 
159 | 
160 | 
161 | #define cw_pack_reserve_space(more)                                                         \
162 | {                                                                                           \
163 |     p = pack_context->current;                                                              \
164 |     uint8_t* nyp = p + more;                                                                \
165 |     if (nyp > pack_context->end)                                                            \
166 |     {                                                                                       \
167 |         if (!pack_context->handle_pack_overflow)                                            \
168 |             PACK_ERROR(CWP_RC_BUFFER_OVERFLOW)                                              \
169 |         int rc = pack_context->handle_pack_overflow (pack_context, (unsigned long)(more));  \
170 |         if (rc)                                                                             \
171 |             PACK_ERROR(rc)                                                                  \
172 |         p = pack_context->current;                                                          \
173 |         nyp = p + more;                                                                     \
174 |     }                                                                                       \
175 |     pack_context->current = nyp;                                                            \
176 | }
177 | 
178 | 
179 | #define tryMove0(t)                                     \
180 | {                                                       \
181 |     uint8_t *p;                                         \
182 |     cw_pack_reserve_space(1)                            \
183 |     *p = (uint8_t)(t);                                  \
184 |     return;                                             \
185 | }
186 | 
187 | #define tryMove1(t,d)                                   \
188 | {                                                       \
189 |     uint8_t *p;                                         \
190 |     cw_pack_reserve_space(2)                            \
191 |     *p++ = (uint8_t)t;                                  \
192 |     *p = (uint8_t)d;                                    \
193 |     return;                                             \
194 | }
195 | 
196 | #define tryMove2(t,d)                                   \
197 | {                                                       \
198 |     uint8_t *p;                                         \
199 |     cw_pack_reserve_space(3)                            \
200 |     *p++ = (uint8_t)t;                                  \
201 |     cw_store16(d);                                      \
202 |     return;                                             \
203 | }
204 | 
205 | #define tryMove4(t,d)                                   \
206 | {                                                       \
207 |     uint8_t *p;                                         \
208 |     cw_pack_reserve_space(5)                            \
209 |     *p++ = (uint8_t)t;                                  \
210 |     cw_store32(d);                                      \
211 |     return;                                             \
212 | }
213 | 
214 | #define tryMove8(t,d)                                   \
215 | {                                                       \
216 |     uint8_t *p;                                         \
217 |     cw_pack_reserve_space(9)                            \
218 |     *p++ = (uint8_t)t;                                  \
219 |     cw_store64(d);                                      \
220 |     return;                                             \
221 | }
222 | 
223 | 
224 | 
225 | 
226 | /*******************************   U N P A C K   **********************************/
227 | 
228 | 
229 | 
230 | #define UNPACK_ERROR(error_code)                        \
231 | {                                                       \
232 |     unpack_context->item.type = CWP_NOT_AN_ITEM;        \
233 |     unpack_context->return_code = error_code;           \
234 |     return;                                             \
235 | }
236 | 
237 | 
238 | 
239 | #ifdef COMPILE_FOR_BIG_ENDIAN
240 | 
241 | #define cw_load16(ptr)  tmpu16 = *(uint16_t*)ptr;
242 | #define cw_load32(ptr)  tmpu32 = *(uint32_t*)ptr;
243 | #define cw_load64(ptr)  tmpu64 = *(uint64_t*)ptr;
244 | 
245 | #else    /* Byte order little endian or undetermined */
246 | 
247 | #ifdef COMPILE_FOR_LITTLE_ENDIAN
248 | 
249 | #define cw_load16(ptr)                                          \
250 |     tmpu16 = *(uint16_t*)ptr;                                   \
251 |     tmpu16 = (uint16_t)((tmpu16<<8) | (tmpu16>>8))
252 | 
253 | #define cw_load32(ptr)                                          \
254 |     tmpu32 = *(uint32_t*)ptr;                                   \
255 |     tmpu32 = (tmpu32<<24) | ((tmpu32 & 0xff00)<<8) |            \
256 |             ((tmpu32 & 0xff0000)>>8) | (tmpu32>>24)
257 | 
258 | #define cw_load64(ptr)                                          \
259 |     tmpu64 = *((uint64_t*)ptr);                                 \
260 |     tmpu64 =                   (                                \
261 |         (((tmpu64 >> 40) |                                      \
262 |         (tmpu64 << 24)) & 0x0000ff000000ff00ULL) |              \
263 |         (((tmpu64 >> 24) |                                      \
264 |         (tmpu64 << 40)) & 0x00ff000000ff0000ULL) |              \
265 |         ((tmpu64 & 0x000000ff00000000ULL) >>  8) |              \
266 |         ((tmpu64 & 0x00000000ff000000ULL) <<  8) |              \
267 |         (tmpu64 >> 56) |                                        \
268 |         (tmpu64 << 56)                              )
269 | 
270 | #else /* Byte order undetermined */
271 | 
272 | #define cw_load16(ptr)                                          \
273 |     tmpu16 = (uint16_t)((*ptr++) << 8);                         \
274 |     tmpu16 |= (uint16_t)(*ptr++)
275 | 
276 | #define cw_load32(ptr)                                          \
277 |     tmpu32 = (uint32_t)(*ptr++ << 24);                          \
278 |     tmpu32 |= (uint32_t)(*ptr++ << 16);                         \
279 |     tmpu32 |= (uint32_t)(*ptr++ << 8);                          \
280 |     tmpu32 |= (uint32_t)(*ptr++)
281 | 
282 | #define cw_load64(ptr)                                          \
283 |     tmpu64 = ((uint64_t)*ptr++) << 56;                          \
284 |     tmpu64 |= ((uint64_t)*ptr++) << 48;                         \
285 |     tmpu64 |= ((uint64_t)*ptr++) << 40;                         \
286 |     tmpu64 |= ((uint64_t)*ptr++) << 32;                         \
287 |     tmpu64 |= ((uint64_t)*ptr++) << 24;                         \
288 |     tmpu64 |= ((uint64_t)*ptr++) << 16;                         \
289 |     tmpu64 |= ((uint64_t)*ptr++) << 8;                          \
290 |     tmpu64 |= (uint64_t)*ptr++
291 | 
292 | #endif
293 | #endif
294 | 
295 | 
296 | 
297 | #define cw_unpack_assert_space(more)                                                                \
298 | {                                                                                                   \
299 |     p = unpack_context->current;                                                                    \
300 |     uint8_t* nyp = p + more;                                                                        \
301 |     if (nyp > unpack_context->end)                                                                  \
302 |     {                                                                                               \
303 |         if (!unpack_context->handle_unpack_underflow)                                               \
304 |             UNPACK_ERROR(buffer_end_return_code)                                                    \
305 |         int rc = unpack_context->handle_unpack_underflow (unpack_context, (unsigned long)(more));   \
306 |         if (rc != CWP_RC_OK)                                                                        \
307 |         {                                                                                           \
308 |             if (rc != CWP_RC_END_OF_INPUT)                                                          \
309 |                 UNPACK_ERROR(rc)                                                                    \
310 |             else                                                                                    \
311 |                 UNPACK_ERROR(buffer_end_return_code)                                                \
312 |         }                                                                                           \
313 |         p = unpack_context->current;                                                                \
314 |         nyp = p + more;                                                                             \
315 |     }                                                                                               \
316 |     unpack_context->current = nyp;                                                                  \
317 | }
318 | 
319 | 
320 | #define cw_unpack_assert_blob(blob)                                         \
321 |     cw_unpack_assert_space(unpack_context->item.as.blob.length);            \
322 |     unpack_context->item.as.blob.start = p;                                 \
323 |     return;
324 | 
325 | 
326 | #define getDDItem(typ,var,val)                                              \
327 |     unpack_context->item.type = typ;                                        \
328 |     unpack_context->item.as.var = val;
329 | 
330 | #define getDDItem1(typ,var,cast)                                            \
331 |     unpack_context->item.type = typ;                                        \
332 |     cw_unpack_assert_space(1);                                              \
333 |     unpack_context->item.as.var = (cast)*p;
334 | 
335 | #define getDDItem2(typ,var,cast)                                            \
336 |     unpack_context->item.type = typ;                                        \
337 |     cw_unpack_assert_space(2);                                              \
338 |     cw_load16(p);                                                           \
339 |     unpack_context->item.as.var = (cast)tmpu16;
340 | 
341 | #define getDDItem4(typ,var,cast)                                            \
342 |     unpack_context->item.type = typ;                                        \
343 |     cw_unpack_assert_space(4);                                              \
344 |     cw_load32(p);                                                           \
345 |     unpack_context->item.as.var = (cast)tmpu32;
346 | 
347 | #define getDDItem8(typ)                                                     \
348 |     unpack_context->item.type = typ;                                        \
349 |     cw_unpack_assert_space(8);                                              \
350 |     cw_load64(p);                                                           \
351 |     unpack_context->item.as.u64 = tmpu64;
352 | 
353 | #define getDDItemFix(len)                                                   \
354 |     cw_unpack_assert_space(1);                                              \
355 |     unpack_context->item.type = *(int8_t*)p;                                \
356 |     unpack_context->item.as.ext.length = len;                               \
357 |     cw_unpack_assert_blob(ext);
358 | 
359 | 
360 | 
361 | 
362 | #endif /* cwpack_defines_h */
363 | 


--------------------------------------------------------------------------------
/src/decode.c:
--------------------------------------------------------------------------------
  1 | #include <R_ext/Arith.h>
  2 | #include "decode.h"
  3 | 
  4 | #include <inttypes.h>
  5 | #include "utf8.h"
  6 | 
  7 | #ifdef DEBUG
  8 | #define LOGD(FMT, ...)                                   \
  9 |   LOG("%.*s " FMT,                                       \
 10 |       MIN (cxt->opts->depth, 40),                        \
 11 |       ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>!",        \
 12 |       ##__VA_ARGS__);
 13 | #else
 14 | #define LOGD(...) NULL
 15 | #endif
 16 | 
 17 | SEXP extract_sexp(cw_unpack_context *);
 18 | int handle_unpack_underflow(cw_unpack_context *, unsigned long);
 19 | void cw_unpack_next_or_fail(cw_unpack_context *);
 20 | SEXP make_sexp_from_context(cw_unpack_context *);
 21 | 
 22 | SEXP extract_simplified_vector(cw_unpack_context *);
 23 | SEXP fill_vector(cw_unpack_context *, SEXP, uint32_t, PROTECT_INDEX, SEXP);
 24 | SEXP coerce(SEXP, SEXPTYPE);
 25 | 
 26 | SEXP extract_env(cw_unpack_context *);
 27 | SEXP new_env(SEXP parent);
 28 | 
 29 | SEXP make_charsxp_or_null(cw_unpack_context *);
 30 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *);
 31 | SEXP make_raw_from_context(cw_unpack_context *);
 32 | SEXP make_ext_from_context(cw_unpack_context *);
 33 | 
 34 | SEXP always_make_charsxp_from_context(cw_unpack_context *);
 35 | 
 36 | double i64_to_double(cw_unpack_context *, int64_t);
 37 | double u64_to_double(cw_unpack_context *, uint64_t);
 38 | 
 39 | SEXPTYPE type_to_sexptype(int t);
 40 | 
 41 | const char *decode_item_type(cwpack_item_types);
 42 | 
 43 | /* used to make WARN_ONCE work: once per call */
 44 | static long calls = 0;
 45 | 
 46 | void * dataptr_or_null(SEXP x) {
 47 |   switch(TYPEOF(x)) {
 48 |   case VECSXP: return VECTOR_PTR(x);
 49 |   case RAWSXP: return RAW(x);
 50 |   case LGLSXP: return LOGICAL(x);
 51 |   case INTSXP: return INTEGER(x);
 52 |   case REALSXP: return REAL(x);
 53 |   case CPLXSXP: return COMPLEX(x);
 54 |   case STRSXP: return STRING_PTR(x);
 55 |   default: return 0;
 56 |   }
 57 | }
 58 | 
 59 | 
 60 | 
 61 | SEXP _unpack_opts(SEXP dict,
 62 |                   SEXP use_df,
 63 |                   SEXP simplify,
 64 |                   SEXP package,
 65 |                   SEXP max_size,
 66 |                   SEXP max_depth,
 67 |                   SEXP underflow_handler) {
 68 |   SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(unpack_opts)));
 69 |   unpack_opts *opts = (unpack_opts*)(RAW(optsxp));
 70 |   opts->use_df = asLogical(use_df);
 71 |   opts->dict = dict;
 72 |   opts->simplify = asLogical(simplify);
 73 |   opts->package = package;
 74 | 
 75 |   opts->underflow_handler = underflow_handler;
 76 |   opts->buf = R_NilValue; /* the buf field is not filled out
 77 |                              here. Whoever fills it out is responsible
 78 |                              for managing protection. */
 79 |   opts->buf_index = -1;
 80 |   if (!R_finite(asReal(max_size))) {
 81 |     opts->max_pending = ULONG_MAX;
 82 |     LOG("max_size passed is not finite [%s: 0x%02lx]", type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size)));
 83 |   } else {
 84 |     LOG("max_size passed is finite [%s: 0x%02lx]",  type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size)));
 85 |     opts->max_pending = asReal(max_size); /* as real since it may be > 2^32 */ 
 86 |   }
 87 |   if (!R_finite(asReal(max_depth))) {
 88 |     opts->max_depth = UINT_MAX;
 89 |   } else {
 90 |     opts->max_depth = asInteger(max_depth); 
 91 |   }
 92 |   LOG("max_size = %f -> %lu; max_depth = %f -> %lu", asReal(max_size), opts->max_pending, asReal(max_depth), opts->max_depth);
 93 | 
 94 |   /* To make sure that R does not finalize the objects I'm using,
 95 |      also hang the object references off of it.
 96 |      
 97 |      The decode buffer is handled in init_unpack_context. */
 98 |   SEXP refs = PROTECT(CONS(dict,
 99 |                            CONS(package,
100 |                                 CONS(underflow_handler,
101 |                                      R_NilValue))));
102 |   setAttrib(optsxp,
103 |             install("refs"),
104 |             refs);
105 |   UNPROTECT(2);
106 |   return optsxp;
107 | }
108 | 
109 | 
110 | int init_unpack_context(cw_unpack_context *cxt,
111 |                         SEXP optsxp,
112 |                         SEXP buf,
113 |                         unsigned long start,
114 |                         unsigned long end) {
115 |   assert_type(optsxp, RAWSXP);
116 |   ASSERT(LENGTH(optsxp) == sizeof(unpack_opts));
117 | 
118 |   unpack_opts *opts = (unpack_opts *)RAW(optsxp);
119 |   opts->depth = 0;
120 |   opts->pending = 0;
121 | 
122 |   unpack_underflow_handler huu;
123 |   if (opts->underflow_handler != R_NilValue) {
124 |     huu = &handle_unpack_underflow;
125 |   } else {
126 |     huu = NULL;
127 |   }
128 | 
129 |   int r = cw_unpack_context_init(cxt, RAW(buf) + start, end - start, huu);
130 |   if (r)
131 |     error("Could not initialize cw_unpack, %s", decode_return_code(r));
132 | 
133 |   cxt->opts = opts;
134 | 
135 |   /* Here is where we start holding a reference to the
136 |      buffer... needss to be released on returning to R */
137 |   if (cxt->opts->buf_index != -1) {
138 |     LOGD("optsxp previously held a buffer iwth protextion index %d",
139 |          cxt->opts->buf_index);
140 |   }
141 |   PROTECT_WITH_INDEX(cxt->opts->buf = buf, &(cxt->opts->buf_index));
142 |   LOGD("First buffer is, ( 0x%x[%d:%d][%d] )",
143 |        RAW(cxt->opts->buf),
144 |        cxt->start - RAW(cxt->opts->buf),
145 |        cxt->end - RAW(cxt->opts->buf),
146 |        cxt->current - RAW(cxt->opts->buf));
147 |   LOGD("buffer protected at index %d", cxt->opts->buf_index);
148 |   
149 |   return 1;
150 | }
151 | 
152 | 
153 | 
154 | SEXP _unpack_msg(SEXP buf, SEXP optsxp) {
155 |   calls++;
156 |   cw_unpack_context cxt;
157 |   int protections = init_unpack_context(&cxt, optsxp, buf, 0, LENGTH(buf));
158 |   LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending); 
159 |   SEXP out = PROTECT(extract_sexp(&cxt));
160 |   protections++;
161 |   LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending); 
162 |   ASSERT(cxt.opts->depth == 0);
163 |   ASSERT(cxt.opts->pending == 0);
164 |   UNPROTECT(protections);
165 |   return out;
166 | }
167 | 
168 | 
169 | int handle_unpack_underflow(cw_unpack_context *cxt, unsigned long x) {
170 |   unsigned long need = x - ((cxt->end - cxt->current) / sizeof(*(cxt->end)));
171 |   /* bounce back to R to read X bytes */
172 | 
173 |   if (cxt->opts->underflow_handler != R_NilValue) {
174 |     unsigned long had = 0;
175 |     while(TRUE) {
176 |       assert_type(cxt->opts->underflow_handler, CLOSXP);
177 |       LOGD("Asking R to read %d bytes", need - had);
178 |       SEXP scurrent = PROTECT(ScalarInteger(cxt->current - RAW(cxt->opts->buf)));
179 |       SEXP sneed = PROTECT(ScalarInteger(need - had));
180 |       SEXP call = PROTECT(lang3(cxt->opts->underflow_handler,
181 |                                 scurrent,
182 |                                 sneed));
183 |       /* result should be a list( rawsxp, start, end, current ) */
184 |       SEXP result = PROTECT(eval(call, cxt->opts->package));
185 |       assert_type(result, VECSXP);
186 |       ASSERT(LENGTH(result) == 4);
187 |       SEXP buf = VECTOR_ELT(result, 0);
188 |       assert_type(buf, RAWSXP);
189 |       unsigned long start = asInteger(VECTOR_ELT(result, 1));
190 |       unsigned long end = asInteger(VECTOR_ELT(result, 2));
191 |       unsigned long current = asInteger(VECTOR_ELT(result, 3));
192 |       unsigned long have = (end - current) - (cxt->end - cxt->current);
193 |       if (have >= need) {
194 |         LOGD("Swapping buffers, ( 0x%x[%d:%d][%d] -> 0x%x[%d:%d][%d] )",
195 |              RAW(cxt->opts->buf),
196 |              cxt->start - RAW(cxt->opts->buf),
197 |              cxt->end - RAW(cxt->opts->buf),
198 |              cxt->current - RAW(cxt->opts->buf),
199 |              RAW(buf), start, end, current);
200 | 
201 |         ASSERT(cxt->opts->buf_index != -1);
202 |         REPROTECT(cxt->opts->buf = buf, cxt->opts->buf_index);
203 |         cxt->start = RAW(buf) + start;
204 |         cxt->end = RAW(buf) + end;
205 |         cxt->current = RAW(buf) + current;
206 |         UNPROTECT(4);
207 |         return CWP_RC_OK;
208 |       } else if (have > had) {
209 |         LOGD("Not enough new data read, have %d, need %d", have, need);
210 |         /* try again */
211 |         had = have;
212 |         UNPROTECT(4);
213 |       } else {
214 |         LOGD("No new data read");
215 |         /* give up */
216 |         UNPROTECT(4);
217 |         return CWP_RC_END_OF_INPUT;        
218 |       }
219 |     }
220 |   } else {
221 |     LOGD("No underflow handler");
222 |     return CWP_RC_END_OF_INPUT;
223 |   }
224 | }
225 | 
226 | 
227 | 
228 | SEXP _unpack_msg_partial(SEXP buf, SEXP startx, SEXP endx, SEXP optsxp) {
229 |   /* Note that this must be able to cope with underflow handlers. */
230 |   calls++; /* reset WARN_ONCE */
231 | 
232 |   cw_unpack_context cxt;
233 |   assert_type(optsxp, RAWSXP);
234 |   
235 |   unsigned long start = asInteger(startx);
236 |   unsigned long end = asInteger(endx);
237 |   int protections = init_unpack_context(&cxt, optsxp, buf, start, end);
238 |   LOG("Before: buf = 0x%x[%d:%d][%d], status = '%s'",
239 |       RAW(cxt.opts->buf),
240 |       cxt.start - RAW(cxt.opts->buf),
241 |       cxt.end - RAW(cxt.opts->buf),
242 |       cxt.current - RAW(cxt.opts->buf),
243 |       decode_return_code(cxt.return_code));
244 | 
245 |   SEXP msg = PROTECT(extract_sexp(&cxt));
246 |   protections++;
247 | 
248 |   LOG("After:  buf = 0x%x[%d:%d][%d], status = '%s'",
249 |       RAW(cxt.opts->buf),
250 |       cxt.start - RAW(cxt.opts->buf),
251 |       cxt.end - RAW(cxt.opts->buf),
252 |       cxt.current - RAW(cxt.opts->buf),
253 |       decode_return_code(cxt.return_code));
254 |   
255 |   ASSERT(cxt.opts->depth == 0);
256 |   ASSERT(cxt.opts->pending == 0);
257 | 
258 |   SEXP out = list3( 
259 |     PROTECT(ScalarString(mkChar(decode_return_code(cxt.return_code)))),
260 |     msg,
261 |     PROTECT(ScalarInteger((cxt.current - RAW(cxt.opts->buf)) / sizeof(Rbyte))));
262 |   
263 |   protections += 2;
264 |   
265 |   UNPROTECT(protections);
266 |   return out;
267 | }
268 | 
269 | 
270 | SEXP extract_sexp(cw_unpack_context *cxt) {
271 |   cw_unpack_next_or_fail(cxt);
272 |   return make_sexp_from_context(cxt);
273 | }
274 | 
275 | 
276 | void cw_unpack_next_or_fail(cw_unpack_context *cxt) {
277 |   cw_unpack_next(cxt);
278 |   if (cxt->return_code != CWP_RC_OK) {
279 |     error("%s",
280 |           decode_return_code(cxt->return_code));
281 |   }
282 | }
283 | 
284 | 
285 | void add_pending(cw_unpack_context *cxt, unsigned long howmany) {
286 |   if (cxt->opts->depth >= cxt->opts->max_depth) {
287 |     error("Message too deeply nested");
288 |   }
289 |   cxt->opts->pending += howmany;
290 |   if (cxt->opts->pending >= cxt->opts->max_pending) {
291 |     error("Pending message of %lu bytes too long, max is %lu", cxt->opts->pending, cxt->opts->max_pending);
292 |   }
293 |   LOGD("depth = %u, pending = %lu", cxt->opts->depth, cxt->opts->pending); 
294 | }
295 | 
296 | 
297 | SEXP alloc_vector(cw_unpack_context *cxt, SEXPTYPE type, unsigned long len) {
298 |   add_pending(cxt, len);
299 |   return allocVector(type, len);
300 | }
301 | 
302 | 
303 | SEXP make_sexp_from_context(cw_unpack_context *cxt) {
304 |   LOGD("Making sexp from a %s", decode_item_type(cxt->item.type));
305 |   cwpack_item_types x = cxt->item.type;
306 |   switch (x) {
307 |   case CWP_ITEM_NIL:
308 |     return ScalarLogical(NA_LOGICAL);
309 | 
310 |   case CWP_ITEM_BOOLEAN:
311 |     return ScalarLogical(cxt->item.as.boolean);
312 | 
313 |   case CWP_ITEM_ARRAY:
314 |     return extract_simplified_vector(cxt);
315 | 
316 |   case CWP_ITEM_BIN:
317 |     return make_raw_from_context(cxt);
318 | 
319 |   case CWP_ITEM_STR: {
320 |     SEXP x = PROTECT(make_charsxp_or_raw_from_context(cxt));
321 |     LOGD("x is a %s", type2char(TYPEOF(x)));
322 |     if (TYPEOF(x) == CHARSXP) {
323 |       UNPROTECT(1);
324 |       return ScalarString(x);
325 |     } else {
326 |       UNPROTECT(1);
327 |       return x;
328 |     }
329 |   }
330 | 
331 |   case CWP_ITEM_NEGATIVE_INTEGER:
332 |   case CWP_ITEM_POSITIVE_INTEGER:
333 |   case CWP_ITEM_FLOAT:
334 |   case CWP_ITEM_DOUBLE: {
335 |     SEXP buf;
336 |     PROTECT_INDEX ix;
337 |     PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type_to_sexptype(cxt->item.type), 1),
338 |                        &ix);
339 |     buf = fill_vector(cxt, buf, 1, ix, R_NilValue);
340 |     UNPROTECT(1);
341 |     return buf;
342 |   }
343 | 
344 |   case CWP_ITEM_MAP:
345 |     if (TYPEOF(cxt->opts->dict) == ENVSXP) {
346 |       return extract_env(cxt);
347 |     } else {
348 |       return extract_simplified_vector(cxt);
349 |     }
350 | 
351 |   default:
352 |     if (x <= CWP_ITEM_MAX_USER_EXT
353 |         && x >= CWP_ITEM_MIN_USER_EXT) {
354 |       return make_ext_from_context(cxt);
355 |     } else if (x <= CWP_ITEM_MAX_RESERVED_EXT
356 |                && x >= CWP_ITEM_MIN_RESERVED_EXT) {
357 |       return make_ext_from_context(cxt);
358 |     } else {
359 |       error("Unsupported item: %s", decode_item_type(cxt->item.type));
360 |     }
361 |   }
362 | }
363 | 
364 | 
365 | SEXP extract_simplified_vector(cw_unpack_context *cxt) {
366 |   int has_names = 0;
367 |   switch(cxt->item.type) {
368 |   case CWP_ITEM_ARRAY:
369 |     has_names = 0; break;
370 |   case CWP_ITEM_MAP:
371 |     has_names = 1; break;
372 |   default:
373 |     error("Can't extract vector from a %s", decode_item_type(cxt->item.type));
374 |   }
375 | 
376 |   uint32_t len = cxt->item.as.array.size;
377 |   LOGD("Extracting array of %d elements, simplifying", len);
378 |   SEXP buf;
379 |   PROTECT_INDEX ix;
380 |   int protections = 0;
381 | 
382 |   SEXP names = R_NilValue;
383 |   if(has_names) {
384 |     LOGD("Allocating names[%d]", len);
385 |     PROTECT(names = alloc_vector(cxt, STRSXP, len));
386 |     protections++;
387 |   }
388 | 
389 |   /* Peek at the first item, allocate a buffer */
390 |   if (len > 0) {
391 |     if(has_names) {
392 |       cw_unpack_next_or_fail(cxt);
393 |       SEXP name = PROTECT(always_make_charsxp_from_context(cxt));
394 |       SET_STRING_ELT(names, 0, name);
395 |       UNPROTECT(1);
396 |     }
397 |     cw_unpack_next_or_fail(cxt);
398 |     LOGD("first is a %s", decode_item_type(cxt->item.type));
399 | 
400 |     SEXPTYPE type;
401 |     if (cxt->opts->simplify) {
402 |       type = type_to_sexptype(cxt->item.type);
403 |     } else {
404 |       type = VECSXP;
405 |     }
406 |     PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type, len),
407 |                        &ix);
408 |     protections++;
409 |     LOGD("Allocated %s vector of length %d, protection id %d",
410 |          type2char(type), len, ix);
411 | 
412 |     buf = fill_vector(cxt, buf, len, ix, names);
413 | 
414 |   } else { /* len == 0 */
415 |     buf = allocVector(LGLSXP, 0);
416 |   }
417 | 
418 |   UNPROTECT(protections);
419 |   return buf;
420 | }
421 | 
422 | 
423 | SEXP fill_vector(cw_unpack_context *cxt, SEXP buf, uint32_t len,
424 |                  PROTECT_INDEX ix, SEXP names) {
425 |   cxt->opts->depth++;
426 | 
427 |   /* when called, we have the first item "primed" for us in cxt. Also
428 |      buf has been protected. */
429 | 
430 |   int non_null_seen = 0;
431 |   int output_data_frame = ((names != R_NilValue)
432 |                            && cxt->opts->use_df
433 |                            && cxt->item.type == CWP_ITEM_ARRAY
434 |                            && len > 0);
435 |   long df_rows = 0;
436 |   if (output_data_frame) {
437 |     LOGD("Might be a data frame");
438 |     df_rows = cxt->item.as.array.size;
439 |   }
440 | 
441 |   for (int i = 0; i < len; i++) {                    /* loop once per item read */
442 |     if (i > 0) {        /* skip what we did when we peeked at first item */
443 |       if (names != R_NilValue) {
444 |         LOGD("Reading name %d", i);
445 |         cw_unpack_next_or_fail(cxt);
446 |         SEXP c = PROTECT(always_make_charsxp_from_context(cxt));
447 |         SET_STRING_ELT(names, i, c);
448 |         UNPROTECT(1);
449 |       }
450 | 
451 |       LOGD("Unpacking item %d", i);
452 |       cw_unpack_next_or_fail(cxt);
453 | 
454 |       if (output_data_frame
455 |           && (cxt->item.type != CWP_ITEM_ARRAY
456 |               || cxt->item.as.array.size != df_rows)) {
457 |         LOGD("Not a data frame after all");
458 |         output_data_frame = 0;
459 |       }
460 |     }
461 | 
462 |     cxt->opts->pending -= (names == R_NilValue ? 1 : 2);
463 |     
464 |   UNPACK_VALUE:                 /* goto here to "try again" after coercion */
465 |     LOGD("Storing item %d, a %s, into a %s",
466 |          i, decode_item_type(cxt->item.type), type2char(TYPEOF(buf)));
467 | 
468 |     switch (TYPEOF(buf)) {
469 | 
470 |     case LGLSXP:
471 | 
472 |       switch(cxt->item.type) {
473 | 
474 |       case CWP_ITEM_NIL:
475 |         LOGICAL(buf)[i] = NA_LOGICAL;
476 |         break;
477 | 
478 |       case CWP_ITEM_BOOLEAN:
479 |         non_null_seen = 1;
480 |         LOGICAL(buf)[i] = (cxt->item.as.boolean ? TRUE : FALSE);
481 |         break;
482 | 
483 |       default:
484 |         if (non_null_seen) {
485 |           REPROTECT(buf = coerce(buf, VECSXP), ix);
486 |         } else {
487 |           non_null_seen = 1;
488 |           REPROTECT(buf = coerce(buf, type_to_sexptype(cxt->item.type)), ix);
489 |         }
490 |         goto UNPACK_VALUE;
491 |       }
492 |       break;
493 | 
494 | 
495 |     case INTSXP:
496 | 
497 |       switch(cxt->item.type) {
498 | 
499 |       case CWP_ITEM_NIL:
500 |         INTEGER(buf)[i] = NA_INTEGER;
501 |         break;
502 | 
503 | 
504 |       case CWP_ITEM_NEGATIVE_INTEGER:
505 |         if (cxt->item.as.i64 == NA_INTEGER
506 |             || cxt->item.as.i64 < INT_MIN) {
507 |           REPROTECT(buf = coerce(buf, REALSXP), ix);
508 |           goto UNPACK_VALUE;
509 |         } else {
510 |           INTEGER(buf)[i] = cxt->item.as.i64;
511 |         }
512 |         break;
513 | 
514 |       case CWP_ITEM_POSITIVE_INTEGER:
515 |         if (cxt->item.as.u64 > INT_MAX) {
516 |           REPROTECT(buf = coerce(buf, REALSXP), ix);
517 |           goto UNPACK_VALUE;
518 |         } else {
519 |           INTEGER(buf)[i] = cxt->item.as.u64;
520 |         }
521 |         break;
522 | 
523 |       case CWP_ITEM_FLOAT:
524 |       case CWP_ITEM_DOUBLE:
525 |         REPROTECT(buf = coerce(buf, REALSXP), ix);
526 |         goto UNPACK_VALUE;
527 | 
528 |       default:
529 |         REPROTECT(buf = coerce(buf, VECSXP), ix);
530 |         goto UNPACK_VALUE;
531 |       }
532 | 
533 |       break;
534 | 
535 | 
536 |     case REALSXP:
537 |       switch(cxt->item.type) {
538 | 
539 |       case CWP_ITEM_NIL:
540 |         REAL(buf)[i] = NA_REAL;
541 |         break;
542 | 
543 |       case CWP_ITEM_NEGATIVE_INTEGER:
544 |         REAL(buf)[i] = i64_to_double(cxt, cxt->item.as.i64);
545 |         break;
546 | 
547 |       case CWP_ITEM_POSITIVE_INTEGER:
548 |         REAL(buf)[i] = u64_to_double(cxt, cxt->item.as.u64);
549 |         break;
550 | 
551 |       case CWP_ITEM_FLOAT:
552 |         REAL(buf)[i] = cxt->item.as.real;
553 |         break;
554 | 
555 |       case CWP_ITEM_DOUBLE:
556 |         REAL(buf)[i] = cxt->item.as.long_real;
557 |         break;
558 | 
559 |       default:
560 |         REPROTECT(buf = coerce(buf, VECSXP), ix);
561 |         goto UNPACK_VALUE;
562 |       }
563 | 
564 |       break;
565 | 
566 | 
567 |     case STRSXP:
568 |       switch(cxt->item.type) {
569 | 
570 |       case CWP_ITEM_NIL:
571 |         SET_STRING_ELT(buf, i, NA_STRING);
572 |         break;
573 | 
574 |       case CWP_ITEM_STR:
575 |         {
576 |           SEXP s = PROTECT(make_charsxp_or_null(cxt));
577 |           if (s == R_NilValue) {
578 |             REPROTECT(buf = coerce(buf, VECSXP), ix);
579 |             UNPROTECT(1);
580 |             goto UNPACK_VALUE;
581 |           } else {
582 |             SET_STRING_ELT(buf, i, s);
583 |           }
584 |           UNPROTECT(1);
585 |         }
586 |         break;
587 | 
588 |       default:
589 |         REPROTECT(buf = coerce(buf, VECSXP), ix);
590 |         goto UNPACK_VALUE;
591 |       }
592 | 
593 |       break;
594 | 
595 | 
596 |     case VECSXP:
597 |       SET_VECTOR_ELT(buf, i, make_sexp_from_context(cxt));
598 | 
599 |       break;
600 | 
601 | 
602 |     default:
603 |       error("Don't know how to fill out a %s (this shouldn't happen)",
604 |             type2char(TYPEOF(buf)));
605 | 
606 |     } /* switch(TYPEOF(buf)) */
607 | 
608 |   } /* for(i = ... */
609 | 
610 |   if (names != R_NilValue) {
611 |     setAttrib(buf, R_NamesSymbol, names);
612 |   }
613 | 
614 |   if (output_data_frame) {
615 |     LOGD("Making a data frame");
616 |     SEXP nrows = PROTECT(ScalarInteger(df_rows));
617 |     SEXP call = PROTECT(lang2(install(".set_row_names"), nrows));
618 |     SEXP rn = PROTECT(eval(call, R_BaseEnv));
619 |     setAttrib(buf, R_RowNamesSymbol, rn);
620 |     setAttrib(buf, R_ClassSymbol, ScalarString(mkChar("data.frame")));
621 |     UNPROTECT(3);
622 |   }
623 | 
624 |   LOGD("Returning a %s", type2char(TYPEOF(buf)));
625 |   cxt->opts->depth--;
626 |   return buf;
627 | } /* SEXP fill_vector(... */
628 | 
629 | 
630 | SEXP coerce(SEXP buf, SEXPTYPE type) {
631 |   LOG("Coercing %s to %s", type2char(TYPEOF(buf)), type2char(type));
632 |   return coerceVector(buf, type);
633 | }
634 | 
635 | 
636 | SEXP extract_env(cw_unpack_context *cxt) {
637 |   ASSERT(cxt->item.type == CWP_ITEM_MAP);
638 |   ASSERT(TYPEOF(cxt->opts->dict) == ENVSXP);
639 |   int nkeys = cxt->item.as.map.size;
640 |   add_pending(cxt, 2*nkeys);
641 |   SEXP env = PROTECT(new_env(cxt->opts->dict));
642 |   for (int i = 0; i < nkeys; i++) {
643 |     cw_unpack_next_or_fail(cxt);
644 |     cxt->opts->pending--;
645 |     SEXP key = PROTECT(always_make_charsxp_from_context(cxt));
646 |     SEXP value = PROTECT(extract_sexp(cxt));
647 |     cxt->opts->pending--;
648 |     if (LENGTH(key) == 0) {
649 |       WARN_ONCE("Item with empty name was discarded");
650 |     } else {
651 |       SEXP sym = PROTECT(installChar(key));
652 | 
653 |       if (sym == R_MissingArg || sym == R_DotsSymbol || DDVAL(sym)) {
654 |         WARN_ONCE("Item with key `%s` was discarded", CHAR(PRINTNAME(sym)));
655 |       } else {
656 |         LOGD("storing variable `%s` into a %s",
657 |              CHAR(PRINTNAME(sym)), type2char(TYPEOF(env)));
658 |         defineVar(sym, value, env);
659 |       }
660 |       UNPROTECT(1);
661 |     }
662 |     UNPROTECT(2);
663 |   }
664 |   UNPROTECT(1);
665 |   return env;
666 | }
667 | 
668 | 
669 | SEXP new_env(SEXP parent) {
670 |   ASSERT(TYPEOF(parent) == ENVSXP);
671 |   SEXP call = PROTECT(lang3(install("new.env"), ScalarLogical(TRUE), parent));
672 |   SEXP x = eval(call, R_BaseEnv);
673 |   UNPROTECT(1);
674 |   return x;
675 | }
676 | 
677 | 
678 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *cxt) {
679 |   ASSERT(cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN);
680 |   SEXP c = PROTECT(make_charsxp_or_null(cxt));
681 |   if (c == R_NilValue) {
682 |     UNPROTECT(1);
683 |     return make_raw_from_context(cxt);
684 |   } else {
685 |     UNPROTECT(1);
686 |     return c;
687 |   }
688 | }
689 | 
690 | 
691 | SEXP make_raw_from_context(cw_unpack_context *cxt) {
692 |   uint32_t len = cxt->item.as.str.length;
693 |   SEXP out = PROTECT(allocVector(RAWSXP, len));
694 |   memcpy(RAW(out), cxt->item.as.str.start, len * sizeof(uint8_t));
695 |   UNPROTECT(1);
696 |   return out;
697 | }
698 | 
699 | 
700 | SEXP make_ext_from_context(cw_unpack_context *cxt) {
701 |   char typename[32];
702 |   WARN_ONCE("Extension type %d decoded as raw", cxt->item.type);
703 |   snprintf(typename, sizeof(typename)/sizeof(char), "ext%d", cxt->item.type);
704 |   SEXP out = PROTECT(make_raw_from_context(cxt));
705 |   setAttrib(out, R_ClassSymbol, ScalarString(mkChar(typename)));
706 |   UNPROTECT(1);
707 |   return out;
708 | }
709 | 
710 | 
711 | SEXP make_charsxp_or_null(cw_unpack_context *cxt) {
712 |   if (cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN) {
713 |     const char *buf = cxt->item.as.str.start;
714 |     int len = cxt->item.as.str.length;
715 |     int non_ascii = 0;
716 | 
717 |     for (int i = 0; i < len; i++) {
718 |       if (buf[i] == 0) {
719 |         WARN_ONCE("Embedded null in string, returning raw instead.");
720 |         return R_NilValue;
721 |       }
722 |       if (buf[i] & 0x80) {
723 |         non_ascii = 1;
724 |       }
725 |     }
726 | 
727 |     if (non_ascii) {
728 |       /* maybe overly paranoid but I didn't see mkCharLenCE doing such
729 |          verification */
730 |       if (!verify_utf8(buf, len)) {
731 |         WARN_ONCE("String is not valid UTF-8, returning raw instead.");
732 |         return R_NilValue;
733 |       }
734 |     }
735 | 
736 |     return mkCharLenCE(buf, len, CE_UTF8);
737 |   } else if (cxt->item.type == CWP_ITEM_NIL) {
738 |     return NA_STRING;
739 |   } else {
740 |     return R_NilValue;
741 |   }
742 | }
743 | 
744 | 
745 | SEXP always_make_charsxp_from_context(cw_unpack_context *cxt) {
746 |   SEXP x = PROTECT(make_charsxp_or_null(cxt));
747 |   if (x == R_NilValue) {
748 |     WARN_ONCE("Non-string used as key was coerced to string");
749 |     SEXP item = PROTECT(make_sexp_from_context(cxt));
750 |     SEXP call = PROTECT(lang2(install("repr"), item));
751 |     SEXP convert = PROTECT(eval(call, cxt->opts->package));
752 |     LOG("repr returned a %s", type2char(TYPEOF(convert)));
753 |     SEXP chr = STRING_ELT(convert, 0);
754 |     UNPROTECT(4);
755 |     return chr;
756 |   } else {
757 |     UNPROTECT(1);
758 |     return x;
759 |   }
760 | }
761 | 
762 | 
763 | double i64_to_double(cw_unpack_context *cxt, int64_t x) {
764 |   double xx = x;
765 |   if ((int64_t)xx != x) {
766 |     WARN_ONCE("Cast of integer %" PRId64 " to double loses precision", x);
767 |   };
768 |   return xx;
769 | }
770 | 
771 | 
772 | double u64_to_double(cw_unpack_context *cxt, uint64_t x) {
773 |   double xx = x;
774 |   if ((uint64_t)xx != x) {
775 |     WARN_ONCE("Cast of integer %" PRIu64 " to double loses precision", x);
776 |   }
777 |   return xx;
778 | }
779 | 
780 | 
781 | SEXPTYPE type_to_sexptype(int t) {
782 |   switch(t) {
783 |   case CWP_ITEM_NIL:
784 |   case CWP_ITEM_BOOLEAN: return LGLSXP;
785 |   case CWP_ITEM_POSITIVE_INTEGER:
786 |   case CWP_ITEM_NEGATIVE_INTEGER: return INTSXP;
787 |   case CWP_ITEM_FLOAT:
788 |   case CWP_ITEM_DOUBLE: return REALSXP;
789 |   case CWP_ITEM_STR: return STRSXP;
790 |   default:
791 |     {
792 |       if (t <= CWP_ITEM_MAX_USER_EXT
793 |           && t >= CWP_ITEM_MIN_USER_EXT) {
794 |         return RAWSXP;
795 |       } else if (t <= CWP_ITEM_MAX_RESERVED_EXT
796 |                  && t >= CWP_ITEM_MIN_RESERVED_EXT) {
797 |         return RAWSXP;
798 |       } else {
799 |         return VECSXP;
800 |       }
801 |     }
802 |   }
803 | }
804 | 
805 | 
806 | const char *decode_item_type(cwpack_item_types x) {
807 |   switch(x) {
808 |   case CWP_ITEM_NIL: return "nil";
809 |   case CWP_ITEM_BOOLEAN: return "boolean";
810 |   case CWP_ITEM_POSITIVE_INTEGER: return "positive integer";
811 |   case CWP_ITEM_NEGATIVE_INTEGER: return "negative integer";
812 |   case CWP_ITEM_FLOAT: return "float";
813 |   case CWP_ITEM_DOUBLE: return "double";
814 |   case CWP_ITEM_STR: return "string";
815 |   case CWP_ITEM_BIN: return "binary";
816 |   case CWP_ITEM_ARRAY: return "array";
817 |   case CWP_ITEM_MAP: return "map";
818 |   case CWP_NOT_AN_ITEM: return "not an item";
819 |   default:
820 |     if (x <= CWP_ITEM_MAX_USER_EXT
821 |         && x >= CWP_ITEM_MIN_USER_EXT) {
822 |       return "CWP_ITEM_USER_EXT(n)";
823 |     } else if (x <= CWP_ITEM_MAX_RESERVED_EXT
824 |                && x >= CWP_ITEM_MIN_RESERVED_EXT) {
825 |       return "CWP_ITEM_RESERVED_EXT(n)";
826 |     } else {
827 |       return "???";
828 |     }
829 |   }
830 | }
831 | 


--------------------------------------------------------------------------------
/src/decode.h:
--------------------------------------------------------------------------------
 1 | #ifndef DECODE_H
 2 | #define DECODE_H
 3 | 
 4 | #include "vadr.h"
 5 | #include "cwpack.h"
 6 | 
 7 | SEXP extract_sexp(cw_unpack_context *);
 8 | int init_unpack_context(cw_unpack_context *, SEXP, SEXP, unsigned long, unsigned long);
 9 | 
10 | SEXP _unpack_msg(SEXP, SEXP);
11 | SEXP _unpack_msg_partial(SEXP, SEXP, SEXP, SEXP);
12 | SEXP _unpack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
13 | 
14 | #endif
15 | 


--------------------------------------------------------------------------------
/src/encode.c:
--------------------------------------------------------------------------------
  1 | #include "cwpack.h"
  2 | #include "vadr.h"
  3 | 
  4 | int handle_overflow(cw_pack_context *, unsigned long);
  5 | SEXP copy_to_new_raw(SEXP, unsigned long, unsigned long);
  6 | 
  7 | void pack_sexp(cw_pack_context *, SEXP);
  8 | 
  9 | void pack_singleton(cw_pack_context *, SEXP);
 10 | void pack_vector(cw_pack_context*, SEXP);
 11 | void pack_named_vector(cw_pack_context*, SEXP, SEXP);
 12 | 
 13 | void pack_logical(cw_pack_context *, int);
 14 | void pack_integer(cw_pack_context *, int);
 15 | void pack_real(cw_pack_context *, double);
 16 | void pack_string(cw_pack_context *, SEXP);
 17 | void pack_raw(cw_pack_context *, SEXP);
 18 | 
 19 | static long calls = 0;
 20 | 
 21 | SEXP _pack_opts(SEXP compatible,
 22 |                 SEXP as_is,
 23 |                 SEXP use_dict,
 24 |                 SEXP max_size,
 25 |                 SEXP buf_size,
 26 |                 SEXP package) {
 27 |   SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(pack_opts)));
 28 |   pack_opts *opts = (pack_opts*)RAW(optsxp);
 29 |   opts->buf = R_NilValue;
 30 |   opts->buf_index = -1;
 31 |   opts->as_is = asLogical(as_is);
 32 |   opts->use_dict = asLogical(use_dict);
 33 |   opts->buf_size = asInteger(buf_size);
 34 |   double max_sized = asInteger(max_size);
 35 |   if (isinf(max_sized)) {
 36 |     opts->max_size = LONG_MAX;
 37 |   } else {
 38 |     opts->max_size = max_sized;
 39 |   }
 40 |   opts->compatible = asLogical(compatible);
 41 | 
 42 |   opts->package = package;
 43 |   /* also hang onto explicit refs for the SEXP values*/
 44 |   SEXP pkg = PROTECT(CONS(package, R_NilValue));
 45 |   setAttrib(optsxp, install("refs"), pkg);
 46 |   UNPROTECT(2);
 47 |   return optsxp;
 48 | }
 49 | 
 50 |             
 51 | int init_pack_context(cw_pack_context *cxt, SEXP opts) {
 52 |   ASSERT(TYPEOF(opts) == RAWSXP && LENGTH(opts) == sizeof(pack_opts));
 53 |   cxt->opts = (pack_opts *) RAW(opts);
 54 |   
 55 |   /* allocate a buffer and protect it. This is unprotected by caller */
 56 |   PROTECT_WITH_INDEX(cxt->opts->buf = allocVector(RAWSXP, cxt->opts->buf_size),
 57 |                      &cxt->opts->buf_index);
 58 |   cw_pack_context_init(cxt,
 59 |                        RAW(cxt->opts->buf),
 60 |                        LENGTH(cxt->opts->buf),
 61 |                        &handle_overflow);
 62 |   cw_pack_set_compatibility(cxt, cxt->opts->compatible);
 63 |   return 1;                     /* return protections */
 64 | }
 65 | 
 66 |             
 67 | SEXP _pack_msg(SEXP input, SEXP opts) {
 68 |   calls++;
 69 |   cw_pack_context cxt;
 70 |   int protections = init_pack_context(&cxt, opts);
 71 |   
 72 |   pack_sexp(&cxt, input);
 73 | 
 74 |   if (cxt.return_code != CWP_RC_OK) {
 75 |     error("%s", decode_return_code(cxt.return_code));
 76 |   }
 77 |   unsigned long newlength = (cxt.current - cxt.start) / sizeof(Rbyte);
 78 |   SETLENGTH(cxt.opts->buf, newlength);
 79 |   UNPROTECT(protections);
 80 |   return cxt.opts->buf;
 81 | }
 82 | 
 83 | 
 84 | int handle_overflow(cw_pack_context *cxt, unsigned long more) {
 85 |   /* allocate a vector twice as big, copy data into the new vector,
 86 |      and update context. */
 87 |   unsigned long newlen = LENGTH(cxt->opts->buf);
 88 |   unsigned long req = LENGTH(cxt->opts->buf) + more;
 89 | 
 90 |   if (req > cxt->opts->max_size) return CWP_RC_BUFFER_OVERFLOW;
 91 |   
 92 |   while (newlen < req) newlen *= 2;
 93 |   if (newlen > cxt->opts->max_size) {
 94 |     newlen = cxt->opts->max_size;
 95 |   }
 96 |   
 97 |   LOG("%d / %d bytes used, need %d more, resizing to %d\n",
 98 |       cxt->current - cxt->start,
 99 |       LENGTH(cxt->opts->buf),
100 |       more,
101 |       newlen);
102 | 
103 |   REPROTECT(cxt->opts->buf =
104 |               copy_to_new_raw(cxt->opts->buf,
105 |                               newlen,
106 |                               LENGTH(cxt->opts->buf)),
107 |             cxt->opts->buf_index);
108 |   
109 |   // update the context structure to point to the new buf
110 |   cxt->current = cxt->current - cxt->start + RAW(cxt->opts->buf);
111 |   cxt->end = RAW(cxt->opts->buf) + LENGTH(cxt->opts->buf) * sizeof(Rbyte);
112 |   cxt->start = RAW(cxt->opts->buf);
113 |   return 0;
114 | }
115 | 
116 |             
117 | SEXP copy_to_new_raw(SEXP from, unsigned long new_len, unsigned long copy_len) {
118 |   assert_type(from, RAWSXP);
119 |   SEXP to = PROTECT(allocVector(RAWSXP, new_len));
120 |   memcpy(RAW(to), RAW(from), MIN(new_len, copy_len) * sizeof(Rbyte));
121 |   UNPROTECT(1);
122 |   return to;
123 | }
124 | 
125 | 
126 | int containsString(SEXP cl, const char *ch) {
127 |   assert_type(cl, STRSXP);
128 |   SEXP cmp = PROTECT(mkChar(ch));
129 |   int found = 0;
130 |   for (int i = 0; i < LENGTH(cl); i++) {
131 |     if (STRING_ELT(cl, i) == cmp) {
132 |       found = 1; break;
133 |     }
134 |   }
135 |   UNPROTECT(1);
136 |   return found;
137 | }
138 | 
139 | 
140 | void pack_sexp(cw_pack_context* cxt, SEXP dat) {
141 |   int as_is_sto = cxt->opts->as_is;
142 |   int unp = 0;
143 | 
144 |   /* check for asIs, classes */
145 |   SEXP cl = getAttrib(dat, R_ClassSymbol);
146 |   if ((cl) != R_NilValue) {
147 |     if (containsString(cl, "AsIs")) {
148 |       cxt->opts->as_is = TRUE;
149 |     } else {
150 |       /* Preprocess (unless in the middle of an AsIs) */
151 |       LOG("Preprocessing a %s of class '%s'\n",
152 |               type2char(TYPEOF(cxt->opts->package)),
153 |               CHAR(STRING_ELT(cl, 0)));
154 |       SEXP call = PROTECT(lang2(install("prepack"), dat));
155 |       dat = PROTECT(eval(call, cxt->opts->package));
156 |       unp += 2;
157 |       /* Check if the preprocessor gave us an AsIs, but don't
158 |          preprocess again */
159 |       cl = getAttrib(dat, R_ClassSymbol);
160 |       if (cl != R_NilValue && containsString(cl, "AsIs")) {
161 |         cxt->opts->as_is = TRUE;
162 |         LOG("Preprocessor returned an AsIs!");
163 |       }
164 |     }
165 |   }
166 |   
167 |   if (isVector(dat)) {
168 |     SEXP names = getAttrib(dat, R_NamesSymbol);
169 |     if (names != R_NilValue && cxt->opts->use_dict) {
170 |       PROTECT(names); unp += 1;
171 |       pack_named_vector(cxt, dat, names);
172 |     } else if (LENGTH(dat) == 1 && !cxt->opts->as_is) {
173 |       pack_singleton(cxt, dat);
174 |     } else {
175 |       pack_vector(cxt, dat);
176 |     }
177 |   } else {
178 |     switch (TYPEOF(dat)) {
179 | 
180 |     case NILSXP:
181 |       cw_pack_nil(cxt); break;
182 | 
183 |     case ENVSXP:
184 |       {
185 |         SEXP args = PROTECT(lang4(install("as.list.environment"),
186 |                                   dat,
187 |                                   ScalarLogical(1),
188 |                                   ScalarLogical(1)));
189 |         SEXP list = PROTECT(eval(args, R_BaseEnv));
190 |         unp += 2;
191 |         SEXP names = getAttrib(list, R_NamesSymbol);
192 |         pack_named_vector(cxt, list, names);
193 |         break;
194 |       }
195 | 
196 |     default:
197 |       cxt->opts->buf = NULL;
198 |       error("can't pack a %s", type2char(TYPEOF(dat)));
199 |     }
200 |   }
201 |   UNPROTECT(unp);
202 |   cxt->opts->as_is = as_is_sto;
203 | }
204 | 
205 | 
206 | void pack_singleton(cw_pack_context *cxt, SEXP dat) {
207 |   switch (TYPEOF(dat)) {
208 |   case LGLSXP:
209 |     pack_logical(cxt, LOGICAL(dat)[0]);
210 |     break;
211 |     
212 |   case INTSXP:
213 |     pack_integer(cxt, INTEGER(dat)[0]);
214 |     break;
215 |     
216 |   case REALSXP:
217 |     pack_real(cxt, REAL(dat)[0]);
218 |     break;
219 |     
220 |   case STRSXP:
221 |     pack_string(cxt, STRING_ELT(dat, 0));
222 |     break;
223 | 
224 |   case RAWSXP:
225 |     pack_raw(cxt, dat);
226 |     break;
227 |     
228 |   case VECSXP:
229 |     pack_vector(cxt, dat);
230 |     break;
231 |  
232 |   default:
233 |     cxt->opts->buf = NULL;
234 |     error("can't pack a singleton %s", type2char(TYPEOF(dat)));
235 |   }
236 | }
237 | 
238 | // generic for-loop macro
239 | #define PACK_VECTOR(CXT, X, ACCESSOR, STORE) {     \
240 |     int len = LENGTH(X);                           \
241 |     cw_pack_array_size(cxt, len);                  \
242 |     for (int i = 0; i < len; i++) {                \
243 |       STORE(CXT, ACCESSOR(X, i));                  \
244 |     }                                              \
245 |   }
246 | 
247 | void pack_vector(cw_pack_context *cxt, SEXP x) {
248 |   ASSERT(isVector(x));
249 |   
250 |   switch(TYPEOF(x)) {
251 | 
252 |   case RAWSXP:
253 |     pack_raw(cxt, x);
254 |     break;
255 |     
256 |   case LGLSXP:
257 |     PACK_VECTOR(cxt, x, LOGICAL_ELT, pack_logical);
258 |     break;
259 |     
260 |   case INTSXP:
261 |     PACK_VECTOR(cxt, x, INTEGER_ELT, pack_integer);
262 |     break;
263 | 
264 |   case REALSXP:
265 |     PACK_VECTOR(cxt, x, REAL_ELT, pack_real);
266 |     break;
267 | 
268 |   case VECSXP:
269 |     PACK_VECTOR(cxt, x, VECTOR_ELT, pack_sexp);
270 |     break;
271 | 
272 |   case STRSXP:
273 |     PACK_VECTOR(cxt, x, STRING_ELT, pack_string);
274 |     break;
275 | 
276 |   default:
277 |     cxt->opts->buf = NULL;
278 |     error("Don't know how to pack a %s vector", type2char(TYPEOF(x)));
279 |   }
280 | }
281 | 
282 | // generic for-loop macro
283 | #define PACK_NAMED_VECTOR(CXT, X, NAMES, ACCESSOR, STORE) {    \
284 |     int len = LENGTH(X);                        \
285 |     cw_pack_map_size(cxt, len);                 \
286 |     for (int i = 0; i < len; i++) {             \
287 |       pack_string(CXT, STRING_ELT(NAMES, i));   \
288 |       STORE(CXT, ACCESSOR(X, i));               \
289 |     }                                           \
290 |   }
291 | 
292 | void pack_named_vector(cw_pack_context *cxt, SEXP x, SEXP names) {
293 |   ASSERT(isVector(x));
294 |   
295 |   switch(TYPEOF(x)) {
296 | 
297 |   case RAWSXP:
298 |     WARN_ONCE("Names discarded from raw object");
299 |     pack_raw(cxt, x);
300 |     break;
301 |     
302 |   case LGLSXP:
303 |     PACK_NAMED_VECTOR(cxt, x, names, LOGICAL_ELT, pack_logical);
304 |     break;
305 |     
306 |   case INTSXP:
307 |     PACK_NAMED_VECTOR(cxt, x, names, INTEGER_ELT, pack_integer);
308 |     break;
309 | 
310 |   case REALSXP:
311 |     PACK_NAMED_VECTOR(cxt, x, names, REAL_ELT, pack_real);
312 |     break;
313 | 
314 |   case VECSXP:
315 |     PACK_NAMED_VECTOR(cxt, x, names, VECTOR_ELT, pack_sexp);
316 |     break;
317 | 
318 |   case STRSXP:
319 |     PACK_NAMED_VECTOR(cxt, x, names, STRING_ELT, pack_string);
320 |     break;
321 | 
322 |   default:
323 |     cxt->opts->buf = NULL;
324 |     error("Don't know how to pack a %s vector", type2char(TYPEOF(x)));
325 |   }
326 | }
327 | 
328 | 
329 | void pack_logical(cw_pack_context *cxt, int x) {
330 |   if (x == NA_LOGICAL) {
331 |     cw_pack_nil(cxt);
332 |   } else if (x) {
333 |     cw_pack_true(cxt);
334 |   } else {
335 |     cw_pack_false(cxt);
336 |   }
337 | }
338 | 
339 | void pack_integer(cw_pack_context *cxt, int x) {
340 |   if (x == NA_INTEGER) {
341 |     cw_pack_nil(cxt);
342 |   } else {
343 |     cw_pack_signed(cxt, x);
344 |   }
345 | }
346 | 
347 | void pack_real(cw_pack_context *cxt, double x) {
348 |   /* save bytes by packing integer if possible. */
349 |   if (ISNA(x)) {
350 |     cw_pack_nil(cxt);
351 |   } else if (ceil(x) == x) {
352 |     if (x >= 0 && x <= UINT32_MAX) {
353 |       cw_pack_unsigned(cxt, x);
354 |     } else if (x < 0 && x >= INT32_MIN) {
355 |       cw_pack_signed(cxt, x);
356 |     } else {
357 |       /* NaN's end up here. */
358 |       cw_pack_real(cxt, x);
359 |     }
360 |   } else {
361 |     cw_pack_real(cxt, x);
362 |   }
363 | }
364 | 
365 | void pack_string(cw_pack_context *cxt, SEXP x) {
366 |   assert_type(x, CHARSXP);
367 |   if (x == NA_STRING) {
368 |     cw_pack_nil(cxt);
369 |   } else {
370 |     if (getCharCE(x) != CE_UTF8) {
371 |       /* reEnc allocates temp memory */
372 |       void *vmax = vmaxget();
373 |       const char *newbuf = reEnc(CHAR(x), getCharCE(x), CE_UTF8, 1);
374 |       x = PROTECT(mkCharCE(newbuf, CE_UTF8));
375 |       int len = R_nchar(x, Bytes, 0, 0, "");
376 |       cw_pack_str(cxt, CHAR(x), len);
377 |       UNPROTECT(1);
378 |       /* free temp memory */
379 |       vmaxset(vmax);
380 |     } else {
381 |       int len = R_nchar(x, Bytes, 0, 0, "");
382 |       cw_pack_str(cxt, CHAR(x), len);
383 |     }
384 |   }
385 | }
386 | 
387 | void pack_raw(cw_pack_context *cxt, SEXP x) {
388 |   assert_type(x, RAWSXP);
389 |   cw_pack_bin(cxt, RAW(x), LENGTH(x));
390 | }
391 | 


--------------------------------------------------------------------------------
/src/encode.h:
--------------------------------------------------------------------------------
 1 | #ifndef ENCODE_H
 2 | #define ENCODE_H
 3 | 
 4 | #include "vadr.h"
 5 | #include "cwpack.h"
 6 | 
 7 | SEXP _pack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
 8 | SEXP _pack_msg(SEXP, SEXP);
 9 | 
10 | void pack_sexp(cw_pack_context *, SEXP);
11 | int init_pack_context(cw_pack_context *, SEXP);
12 | 
13 | #endif
14 |  
15 | 


--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
 1 | #include <R.h>
 2 | #include <Rinternals.h>
 3 | #include <stdlib.h> // for NULL
 4 | #include <R_ext/Rdynload.h>
 5 | #include "encode.h"
 6 | #include "decode.h"
 7 | 
 8 | /* FIXME: 
 9 |    Check these declarations against the C/Fortran source code.
10 | */
11 | 
12 | static const R_CallMethodDef CallEntries[] = {
13 |     {"_pack_msg",           (DL_FUNC) &_pack_msg,           2},
14 |     {"_pack_opts",          (DL_FUNC) &_pack_opts,          6},
15 |     {"_unpack_msg",         (DL_FUNC) &_unpack_msg,         2},
16 |     {"_unpack_msg_partial", (DL_FUNC) &_unpack_msg_partial, 4},
17 |     {"_unpack_opts",        (DL_FUNC) &_unpack_opts,        7},
18 |     {NULL, NULL, 0}
19 | };
20 | 
21 | void R_init_msgpack(DllInfo *dll)
22 | {
23 |     R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
24 |     R_useDynamicSymbols(dll, FALSE);
25 | }
26 | 


--------------------------------------------------------------------------------
/src/opts.h:
--------------------------------------------------------------------------------
 1 | #ifndef OPTS_H
 2 | #define OPTS_H
 3 | 
 4 | #include <R.h>
 5 | #include <Rinternals.h>
 6 | 
 7 | /* To hand options down from R code into the callbacks, I attach this
 8 |    options structure to the CWpack state structures. 
 9 |    
10 |    These structures exist at R level as the contents of some RAWSXP.
11 |    This seems safe enough. But part of these structures are also
12 |    pointers to SEXPs. I cover for this by also holding the SEXP
13 |    pointers as an attrubute. But this feels tacky.
14 | */
15 | 
16 | typedef struct unpack_opts {
17 |   SEXP dict;
18 |   int use_df;
19 |   int simplify;
20 |   SEXP package;
21 |   unsigned int max_depth;
22 |   unsigned long max_pending;
23 | 
24 |   /* state variables used in unpack */
25 |   SEXP buf;
26 |   PROTECT_INDEX buf_index;
27 |   unsigned int depth;
28 |   unsigned long pending;
29 |   SEXP underflow_handler;
30 |   
31 | } unpack_opts;
32 | 
33 | 
34 | typedef struct pack_opts {
35 |   int as_is;
36 |   int compatible;
37 |   int use_dict;
38 |   long max_size;
39 |   long buf_size;
40 | 
41 |   /* state used by pack / write */
42 |   SEXP buf;
43 |   PROTECT_INDEX buf_index;
44 |   SEXP package;
45 | 
46 |   /* state used by write */
47 |   SEXP conn;
48 | 
49 | } pack_opts;
50 | 
51 | #endif
52 | 


--------------------------------------------------------------------------------
/src/utf8.c:
--------------------------------------------------------------------------------
 1 | #include "utf8.h"
 2 | #include "vadr.h"
 3 | 
 4 | /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 5 |    See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
 6 | 
 7 | #define UTF8_ACCEPT 0
 8 | #define UTF8_REJECT 1
 9 | 
10 | static const uint8_t utf8d[] = {
11 |   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
12 |   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
13 |   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
14 |   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
15 |   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
16 |   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
17 |   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
18 |   0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
19 |   0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
20 |   0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
21 |   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
22 |   1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
23 |   1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
24 |   1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
25 | };
26 | 
27 | uint32_t
28 | decode_utf8(uint32_t* state, uint32_t* codep, uint32_t byte) {
29 |   uint32_t type = utf8d[byte];
30 | 
31 |   *codep = (*state != UTF8_ACCEPT) ?
32 |     (byte & 0x3fu) | (*codep << 6) :
33 |     (0xff >> type) & (byte);
34 | 
35 |   *state = utf8d[256 + *state*16 + type];
36 |   return *state;
37 | }
38 | 
39 | /* LICENSE for above code: */
40 | 
41 | /* Permission is hereby granted, free of charge, to any person
42 |    obtaining a copy of this software and associated documentation
43 |    files (the "Software"), to deal in the Software without
44 |    restriction, including without limitation the rights to use, copy,
45 |    modify, merge, publish, distribute, sublicense, and/or sell copies
46 |    of the Software, and to permit persons to whom the Software is
47 |    furnished to do so, subject to the following conditions: */
48 | 
49 | /* The above copyright notice and this permission notice shall be
50 |    included in all copies or substantial portions of the Software. */
51 | 
52 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53 |    EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54 |    MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
55 |    NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
56 |    BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
57 |    ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
58 |    CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
59 |    SOFTWARE. */
60 | 
61 | 
62 | int verify_utf8(const char *x, uint32_t len) {
63 |   uint32_t codepoint = 0;
64 |   uint32_t state = 0;
65 | 
66 |   for (uint32_t i = 0; i < len; i++) {
67 |     decode_utf8(&state, &codepoint, (unsigned char) x[i]);
68 |   }
69 | 
70 |   return state == UTF8_ACCEPT;
71 | }
72 | 


--------------------------------------------------------------------------------
/src/utf8.h:
--------------------------------------------------------------------------------
1 | #ifndef UTF8_H
2 | #define UTF8_H
3 | 
4 | #include <inttypes.h>
5 | 
6 | int verify_utf8(const char *, uint32_t len);
7 | 
8 | #endif
9 | 


--------------------------------------------------------------------------------
/src/vadr.c:
--------------------------------------------------------------------------------
 1 | #include "vadr.h"
 2 | #include "cwpack.h"
 3 | #include <R_ext/Rdynload.h>
 4 | 
 5 | void assert_type5(SEXP x, SEXPTYPE type, const char *where,
 6 |                          const char *file, int line) {
 7 |   if (TYPEOF(x) != type) {
 8 |     error("Expected %s in %s, got %s (%s:%d)",
 9 |           type2char(type), where, type2char(TYPEOF(x)), file, line);
10 |   }
11 | }
12 | 
13 | const char *decode_return_code(int x) {
14 |   switch(x) {
15 |   case CWP_RC_OK: return "ok";
16 |   case CWP_RC_END_OF_INPUT: return "end of input";
17 |   case CWP_RC_BUFFER_OVERFLOW: return "buffer overflow";
18 |   case CWP_RC_BUFFER_UNDERFLOW: return "buffer underflow";
19 |   case CWP_RC_MALFORMED_INPUT: return "malformed input";
20 |   case CWP_RC_WRONG_BYTE_ORDER: return "wrong byte order";
21 |   case CWP_RC_ERROR_IN_HANDLER: return "error in handler";
22 |   case CWP_RC_ILLEGAL_CALL: return "illegal call";
23 |   case CWP_RC_MALLOC_ERROR: return "malloc error";
24 |   case CWP_RC_STOPPED: return "stopped";
25 |   default: return "unknown error";
26 |   }
27 | }
28 | 
29 | 


--------------------------------------------------------------------------------
/src/vadr.h:
--------------------------------------------------------------------------------
 1 | #ifndef _VADR_H
 2 | #define _VADR_H
 3 | 
 4 | #include <R.h>
 5 | #include <Rinternals.h>
 6 | 
 7 | #define MIN(x,y) ((x) < (y) ? (x) : (y))
 8 | #define MAX(x,y) ((x) > (y) ? (x) : (y))
 9 | #define ASSERT(X) {if (! (X)) { error("%s: expected that (%s) @%s:%d\n", __func__, #X, __FILE__, __LINE__); }}
10 | 
11 | #define LOGICAL_ELT(O, I) (LOGICAL(O)[I])
12 | #define INTEGER_ELT(O, I) (INTEGER(O)[I])
13 | #define REAL_ELT(O, I) (REAL(O)[I])
14 | 
15 | #define WARN_ONCE(...) {                        \
16 |       static long last_warned = 0;              \
17 |       if (last_warned < calls) {                \
18 |         last_warned = calls;                    \
19 |         warning(__VA_ARGS__);                   \
20 |       }                                         \
21 |     }
22 | 
23 | #undef DEBUG 
24 | 
25 | #ifdef DEBUG
26 | #define LOG(FMT, ...) Rprintf("%s: "  FMT " @%s:%d\n",   \
27 |                               __func__, ##__VA_ARGS__, __FILE__, __LINE__)
28 | #else
29 | #define LOG(...) NULL
30 | #endif
31 | 
32 | #define assert_type(x, type) assert_type3(x, type, __func__)
33 | #define assert_type3(x, type, where) assert_type5(x, type, where, __FILE__, __LINE__)
34 | void assert_type5(SEXP x, SEXPTYPE type, const char *where, const char *file, int line);
35 | const char *decode_return_code(int);
36 | 
37 | #endif
38 | 
39 | 


--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(msgpack)
3 | 
4 | test_check("msgpack")
5 | 


--------------------------------------------------------------------------------
/tests/testthat/test-static.R:
--------------------------------------------------------------------------------
  1 | context("msgpackr static API")
  2 | 
  3 | `%is%` <- expect_equal
  4 | 
  5 | 
  6 | roundtrip <- function(start) {
  7 |   bin <- packMsg(start)
  8 |   end <- unpackMsg(bin)
  9 |   expect_equal(start, end)
 10 |   bin
 11 | }
 12 | 
 13 | 
 14 | pack_rt <- function (start, cmp) {
 15 |   bin <- packMsg(start)
 16 |   expect_equal(bin, cmp)
 17 |   end <- unpackMsg(bin)
 18 |   expect_equivalent(start, end)
 19 | }
 20 | 
 21 | test_that("pack singletons", {
 22 |   #null
 23 |   pack_rt(NA, as.raw(0xc0))
 24 |   packMsg(NULL) %is% as.raw(0xc0)
 25 | 
 26 |   #logical
 27 |   pack_rt(FALSE, as.raw(0xc2))
 28 |   pack_rt(TRUE, as.raw(0xc3))
 29 | 
 30 |   #small ints
 31 |   pack_rt(12L, as.raw(0x0c))
 32 |   pack_rt(-4L, as.raw(0xfc))
 33 | 
 34 |   #32 bit ints
 35 |   pack_rt(2147483647L, as.raw(c(0xCE, 0x7f, 0xff, 0xff, 0xff)))
 36 | 
 37 |   # cwpack will use 32 bit float if precision is preserved. For example, Inf:
 38 |   pack_rt(Inf, as.raw(c(0xca, 0x7f, 0x80, 0x00, 0x00)))
 39 | 
 40 |   # and a float64:
 41 |   x <- 1.7976931348623157e308 # .Machine$double.xmax
 42 |   # 0 11111111110 1111111111111111111111111111111111111111111111111111
 43 |   pack_rt(x, as.raw(c(0xCB, 0x7F, 0xEF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)))
 44 | 
 45 |   # character
 46 |   pack_rt("hello",
 47 |           as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f)))
 48 | 
 49 |   # raw bytes
 50 |   pack_rt(as.raw(0xab),
 51 |           as.raw(c(0xc4, 0x01, 0xab)))
 52 | 
 53 |   #NAs and NULL all collapse to nil
 54 |   packMsg(NA_character_) %is% as.raw(0xc0)
 55 |   packMsg(NA_real_) %is% as.raw(0xc0)
 56 |   packMsg(NA_integer_) %is% as.raw(0xc0)
 57 |   packMsg(NULL) %is% as.raw(0xc0)
 58 | })
 59 | 
 60 | test_that("unpack large ints to float", {
 61 |   bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01))
 62 |   negint = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff))
 63 |   not_na = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x80, 0x00, 0x00, 0x00))
 64 |   bigfloat = 9007199254740993 #read as float loses precision
 65 |   negfloat = -2147483649
 66 |   not_na_float = -2147483648
 67 |   expect_warning(unpackMsg(bigint) %is% bigfloat, "precision")
 68 |   unpackMsg(negint) %is% negfloat
 69 |   unpackMsg(not_na) %is% not_na_float
 70 |   #
 71 |   # and all in a vector...
 72 |   expect_warning(
 73 |    unpackMsg(c(as.raw(0x93), bigint, negint, not_na)) %is%
 74 |      c(bigfloat, negfloat, not_na_float),
 75 |    "precision")
 76 | })
 77 | 
 78 | test_that("nice errors from unpack", {
 79 |   expect_error(unpackMsg(as.raw(c(0x92, 0xc0))),
 80 |                "end of input")
 81 | })
 82 | 
 83 | 
 84 | test_that("Pack raws", {
 85 |   roundtrip(as.raw(c(0xab, 0xbc, 0x00)))
 86 | })
 87 | 
 88 | 
 89 | test_that("Pack lists", {
 90 |   roundtrip(list(1, "what"))
 91 |   roundtrip(list("a", list("b", 4)))
 92 | })
 93 | 
 94 | 
 95 | test_that("unpack simplified vectors", {
 96 |   roundtrip(c(FALSE, NA)) #bool
 97 |   roundtrip(list(FALSE, 3L)) #list; don't coerce logicals (that aren't all NA)
 98 |   roundtrip(c(1L, NA)) #integer
 99 |   roundtrip(c(1L, NA, 1.0)) #real
100 |   roundtrip(c("hello", NA)) #string
101 |   roundtrip(list(1L, 2L, "hi")) #list, don't coerce to char
102 |   roundtrip(list(c(1,2), c("hi", "bye"))) #list
103 | })
104 | 
105 | 
106 | test_that("unpack simplified vectors starting with NA", {
107 |   roundtrip(c(NA, FALSE, TRUE))
108 |   roundtrip(c(NA, 1L, 2L))
109 |   roundtrip(c(NA, exp(0), pi))
110 |   roundtrip(c(NA, "hi", "bye"))
111 | })
112 | 
113 | 
114 | test_that("pack zero length vectors", {
115 |   roundtrip(logical(0))
116 | })
117 | 
118 | 
119 | test_that("packing overflow handler works", {
120 |   expect_true(length(packMsg(1:10000)) > 1000)
121 | })
122 | 
123 | test_that("extension mechanism", {
124 |   obj <- c(1, 2, 3)
125 |   class(obj) <- c("reverse")
126 |   assign(envir = globalenv(), "prepack.reverse", function(x) rev(x))
127 |   unpackMsg(packMsg(obj)) %is% c(3, 2, 1)
128 | })
129 | 
130 | 
131 | test_that("recursive use of msgpack works", {
132 |   assign(envir = globalenv(), "prepack.blob", function(x) packMsg(unclass(x)))
133 |   obj <- "hello"
134 |   class(obj) <- "blob"
135 |   typeof(unpackMsg(packMsg(obj))) %is% "raw"
136 | })
137 | 
138 | 
139 | test_that("Max buffer size", {
140 |   packMsg(300:400, max_size=306, buf_size=10)
141 |   expect_error(packMsg(300:401, max_size=306, buf_size = 10), "overflow")
142 | })
143 | 
144 | 
145 | test_that("NA and NaN are distinct doubles,", {
146 |   roundtrip(c(NA, NaN))
147 | })
148 | 
149 | 
150 | test_that("compatibility mode", {
151 |   packMsg(as.raw(c(1, 2, 3))) %is% as.raw(c(0xc4, 0x03, 0x01, 0x02, 0x03))
152 |   packMsg(as.raw(c(1, 2, 3)), compatible=TRUE) %is% as.raw(c(0xa3, 0x01, 0x02, 0x03))
153 | })
154 | 
155 | 
156 | test_that("UnpackMsg: detect bad strings, warn, and return raw", {
157 | 
158 |   expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x00, 0x62, 0x63))),
159 |                               as.raw(c(0x00, 0x62, 0x63))),
160 |                  "nul")
161 | 
162 |   #and check for malformed UTF8
163 |   #3 byte sequence with last continuation byte missing
164 |   expect_warning(expect_equal(unpackMsg(as.raw(c(0xa2, 0x30, 0x80))),
165 |                               as.raw(c(0x30, 0x80))),
166 |                  "UTF")
167 | 
168 |   #2 bytes of 3 byte sequence followed by space
169 |   expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x30, 0x80, 0x20))),
170 |                               as.raw(c(0x30, 0x80, 0x20))),
171 |                  "UTF")
172 | 
173 |   # illegal byte
174 |   expect_warning(expect_equal(unpackMsg(as.raw(c(0xa1, 0xff))),
175 |                               as.raw(c(0xff))),
176 |                  "UTF")
177 |   #also for malformed UTF8?
178 | 
179 | })
180 | 
181 | test_that("always emit strings in UTF8,", {
182 |   x <- "fa\xE7ile"
183 |   Encoding(x) <- "latin1"
184 |   packMsg(x) %is% as.raw(c(0xa7, 0x66, 0x61, 0xc3, 0xa7, 0x69, 0x6c, 0x65))
185 | })
186 | 
187 | test_that("use ints for integral floats under 32 bits", {
188 |   packMsg(1) %is% packMsg(1L)
189 |   length(packMsg(2^32)) %is% 5
190 |   length(packMsg(2^32+1)) %is% 9
191 |   length(packMsg(-2^31)) %is% 5
192 |   length(packMsg(-2^31-1)) %is% 9
193 | })
194 | 
195 | 
196 | test_that("as_is uses arrays even for singletons", {
197 |   length(packMsg(1)) %is% 1
198 |   length(packMsg(1, as_is=TRUE)) %is% 2
199 |   length(packMsg(list(1, 2, 3))) %is% 4
200 |   length(packMsg(list(1, 2, 3), as_is = TRUE)) %is% 7
201 |   unpackMsg(packMsg(list(1, 2, 3))) %is% c(1, 2, 3)
202 |   unpackMsg(packMsg(list(1, 2, 3), as_is = TRUE)) %is% list(1, 2, 3)
203 |   length(packMsg(I(1), as_is=FALSE)) %is% 2
204 | })
205 | 
206 | 
207 | test_that( "single row data frames also pack with asIs", {
208 |   expect_true(  length(packMsg(data.frame( a=1, b=2)))
209 |               > length(packMsg(list(a=1, b=2))))
210 | })
211 | 
212 | 
213 | test_that("pack named vectors into dicts", {
214 |   unpackMsg(packMsg(list(a=1, b=NULL))) %is% c(a=1, b=NA)
215 |   unpackMsg(packMsg(list(a=1, b=NULL), use_dict=FALSE)) %is% c(1, NA)
216 | })
217 | 
218 | 
219 | test_that("Unpack dicts into envs", {
220 |   unpackMsg(packMsg(list2env(list(a=1, b=2)))) %is% c(a=1, b=2)
221 |   x <- new.env()
222 |   e <- unpackMsg(packMsg(list(a = 1, b = NA)), parent = x)
223 |   typeof(e) %is% "environment"
224 |   as.list(e) %is% list(a = 1, b = NA)
225 |   parent.env(e) %is% x
226 |   unpackMsg(packMsg(emptyenv()), parent=emptyenv())
227 | })
228 | 
229 | 
230 | test_that("non-string dict keys", {
231 |   d = as.raw(c(0x82, 0xa1, 0x61, 0x01, 0x02, 0x02))
232 |   expect_warning(unpackMsg(d) %is% c(a=1, `2`=2), "string")
233 |   d2 = as.raw(c(as.raw(c(0x82, 0xa1, 0x61, 0x92, 0x01, 0x04, 0x92, 0x02, 0x04, 0x00))))
234 |   expect_warning(unpackMsg(d2) %is% list(a=c(1, 4), `c(2, 4)` = 0), "string")
235 | })
236 | 
237 | 
238 | test_that("pack envs into sorted dicts", {
239 |   e <- list2env(list(c=3, b=1, d=4, a=2))
240 |   unpackMsg(packMsg(e)) %is% c(a=2, b=1, c=3, d=4)
241 | })
242 | 
243 | 
244 | 
245 | test_that("Unpack dicts into envs", {
246 |   x <- unpackMsg(packMsg(c(a=2, b=1, c=3, d=4)), parent=environment())
247 |   expect_equal(as.list.environment(x, sorted = TRUE),
248 |                list(a=2, b=1, c=3, d=4))
249 | })
250 | 
251 | 
252 | test_that("warn bad var names and discard", {
253 |   b <- packMsg(c(a=1, 3, b=4))
254 |   expect_warning(e <- unpackMsg(b, parent=environment()), "empty")
255 |   as.list.environment(e, all.names=TRUE, sorted=TRUE) %is% list(a=1, b=4)
256 | })
257 | 
258 | 
259 | test_that("NA names, dots names...", {
260 |   x <- list(2, "four", rep(1,6), c(), list())
261 |   n <- c("two", NA, "...", "..5", "")
262 |   names(x) <- n
263 |   names(unpackMsg(packMsg(x))) %is% n
264 |   expect_warning(e <- unpackMsg(packMsg(x), parent=environment()))
265 |   ls(e, all.names=TRUE) %is% c("NA", "two")
266 | })
267 | 
268 | 
269 | test_that("detect data frames", {
270 |   expect_false(is.data.frame(unpackMsg(packMsg(list(a=1, b=2)))))
271 |   expect_true(is.data.frame(unpackMsg(packMsg(list(a=1, b=2), as_is=TRUE))))
272 |   expect_true(is.data.frame(unpackMsg(packMsg(list(a=numeric(0))))))
273 |   expect_false(is.data.frame(unpackMsg(packMsg(list(a=c(1, 2, 3), b=c(2, 3))))))
274 | })
275 | 
276 | 
277 | test_that("Raw with names", {
278 |   expect_warning(packMsg(structure(as.raw(c(1,2)), names=c("a", "b"))))
279 | })
280 | 
281 | 
282 | test_that("unpackMsg refusing to simplify", {
283 |   unpackMsg(packMsg(list(1, 2))) %is% c(1L, 2L)
284 |   unpackMsg(packMsg(list(1, 2)), simplify=FALSE) %is% list(1L, 2L)
285 | })
286 | 
287 | 
288 | test_that("Homepage example", {
289 |   pack_rt(list(compact=TRUE, schema=0),
290 |           c(as.raw(c(0x82, 0xa7)),
291 |             charToRaw("compact"),
292 |             as.raw(c(0xc3, 0xa6)),
293 |             charToRaw("schema"),
294 |             as.raw(00)))
295 | })
296 | 
297 | 
298 | test_that("warnings trigger once per message", {
299 |   bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01))
300 |   length(capture_warnings(unpackMsg(c(as.raw(0x92), bigint, bigint)))) %is% 1
301 | })
302 | 
303 | 
304 | test_that("limit pending size", {
305 |   # the idea is that you may get a message like,
306 |   # <ddffffddffffddfffff...> which if not defended against, will make
307 |   # you malloc up all your memory with a small message. So we need to
308 |   # limit the number of items we'va allocated, as well. E.G. each
309 |   # array we are in the middle of, treat as a promise for at least one
310 |   # byte per item in the message.
311 |   bad <- packMsg(c(1:5, list(1:10), 6:10))
312 |   unpackMsg(bad, max_size = 16)
313 |   expect_error(unpackMsg(bad, max_size=15), "long")
314 | })
315 | 
316 | 
317 | test_that("prevent stack overflows", {
318 |   # another attack may be to try to overflow the stack with an indefinitely
319 |   # nested array, e.g. 0x919191919191919191.....
320 |   bad <- packMsg(list(list(list(1), list(list(2, list(3))))))
321 |   expect_error(unpackMsg(bad, max_depth=4), "nest")
322 |   unpackMsg(bad, max_depth=5)
323 | })
324 | 
325 | 
326 | test_that("extension types unpacked as raw with a class attr", {
327 |   x <- as.raw(c(0xd4, 0xfe, 0xdd))
328 |   expect_warning(d <- unpackMsg(x), "raw")
329 |   expect_equal(d, structure(as.raw(0xdd), class="ext-2"))
330 | 
331 |   x <- as.raw(c(0xd4, 0x7f, 0xdd))
332 |   expect_warning(d <- unpackMsg(x), "raw")
333 |   expect_equal(d, structure(as.raw(0xdd), class="ext127"))
334 | })
335 | 


--------------------------------------------------------------------------------
/tests/testthat/test-streams.R:
--------------------------------------------------------------------------------
  1 | context("unpack from streams")
  2 | #most of these tests to be implemented using a fifo connection.
  3 | 
  4 | `%is%` <- expect_equal
  5 | 
  6 | msgs <- as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f,
  7 |                  0xa3, 0x61, 0x6e, 0x64,
  8 |                  0xa7, 0x67, 0x6f, 0x6f))
  9 | 
 10 | two_ends <- function(f, read_size = 2^16, max_buf = NA) {
 11 |   fl <- file("", open="w+b", blocking=FALSE)
 12 |   on.exit(close(fl), add=TRUE)
 13 |   f(fl, msgConnection(fl))
 14 | }
 15 | 
 16 | test_that("consume N messages and return remaining data", {
 17 |   expect_equal(unpackMsgs(as.raw(1:10)),
 18 |                list(msgs = as.list(1:10),
 19 |                     remaining = raw(0),
 20 |                     status = "end of input",
 21 |                     bytes_read = 10))
 22 | 
 23 |   expect_equal(unpackMsgs(msgs),
 24 |                list(msgs = list("hello", "and"),
 25 |                     remaining = as.raw(c(0xa7, 0x67, 0x6f, 0x6f)),
 26 |                     status = "buffer underflow",
 27 |                     bytes_read = 10))
 28 | 
 29 |   expect_equal(unpackMsgs(as.raw(c(0xa7, 0x6f, 0x6f))),
 30 |                list(msgs = list(),
 31 |                     remaining = as.raw(c(0xa7, 0x6f, 0x6f)),
 32 |                     status = "buffer underflow",
 33 |                     bytes_read = 0))
 34 | 
 35 |   expect_equal(unpackMsgs(as.raw(c(1:10)), 3),
 36 |                list(msgs = as.list(1:3),
 37 |                     remaining = as.raw(4:10),
 38 |                     status = "ok",
 39 |                     bytes_read = 3))
 40 | })
 41 | 
 42 | test_that("packMsgs round trip", {
 43 |   unpackMsgs(packMsgs(c(list(1:10), 1:10)))$msgs %is%
 44 |     c(list(1:10), as.list(1:10))
 45 | })
 46 | 
 47 | test_that("Errors raised in parsing are caught", {
 48 |   unpackMsgs(as.raw(c(0xc1)))$status %is% "malformed input"
 49 | })
 50 | 
 51 | test_that("read from a connection (blocking)", {
 52 | 
 53 |   conn <- msgConnection(rawConnection(msgs, open="r"))
 54 |   readMsg(conn) %is% "hello"
 55 |   readMsg(conn) %is% "and"
 56 |   expect_error(readMsg(conn))
 57 |   close(conn)
 58 | 
 59 |   conn <- msgConnection(rawConnection(msgs, open="r"))
 60 |   readMsgs(conn) %is% list("hello", "and")
 61 |   status(conn) %is% "buffer underflow"
 62 |   close(conn)
 63 | })
 64 | 
 65 | test_that("write to a connection", {
 66 |   conn <- rawConnection(raw(), open="w")
 67 |   writeMsg(1:10, conn)
 68 |   writeMsgs(1:10, conn)
 69 |   expect_equal(
 70 |     unpackMsgs(rawConnectionValue(conn))$msgs,
 71 |     c(list(1:10), as.list(1:10)))
 72 |   close(conn)
 73 | })
 74 | 
 75 | test_that("write to and read from connections", {
 76 |   con <- msgConnection(rawBuffer(raw(0)))
 77 |   writeMsgs(1:10, con)
 78 |   readMsgs(con) %is% as.list(1:10)
 79 |   close(con)
 80 |   two_ends(function(A, B) {
 81 |     writeMsgs(1:10, A)
 82 |     flush(A)
 83 |     readMsgs(B)
 84 |   }) %is% as.list(1:10)
 85 | })
 86 | 
 87 | test_that("read non-blocking with complete message", {
 88 |     test <- packMsgs(list("hello", "and"))
 89 |     conn <- msgConnection(
 90 |         rawConnection(packMsgs(list("hello", "and", "world")), open="r"),
 91 |         read_size = length(test))
 92 |     readMsgs(conn) %is% list("hello", "and", "world")
 93 |     close(conn)
 94 | })
 95 | 
 96 | test_that("read non-blocking with incomplete message", {
 97 |   two_ends(function(endA, endB) {
 98 |     writeMsg(1, endA)
 99 |     partial <- packMsgs(list("here is a partial message", 2))
100 |     writeBin(partial[1:10], endA)
101 |     flush(endA)
102 |     readMsgs(endB) %is% list(1)
103 |     status(endB) %is% "buffer underflow"
104 |     partial(endB) %is% partial[1:10]
105 |     writeBin(partial[11:length(partial)], endA)
106 |     flush(endA)
107 |     Sys.sleep(0.1)
108 |     readMsgs(endB) %is% list("here is a partial message", 2)
109 |   })
110 | })
111 | 
112 | test_that("underflow at incomplete message (1-process)", {
113 |     buf <- rawBuffer()
114 |     partial <- packMsgs(list(1, "here is a partial message", 2))
115 |     writeRaw(partial[1:10], buf)
116 |     con <- msgConnection(buf)
117 |     readMsgs(con) %is% list(1)
118 |     writeRaw(partial[11:length(partial)], buf)
119 |     readMsgs(con) %is% list("here is a partial message", 2)
120 |     readMsgs(con) %is% list()
121 |     close(con)
122 | })
123 | 
124 | test_that("read non-blocking with array breaking over chunks", {
125 |     ##this should at least trigger the underflow handler, no?
126 |     partial <- packMsgs(list("hello", 1:2))
127 |     full <- packMsgs(list("hello", 1:10))
128 |     conn <- rawConnection(full, open="r")
129 |     conn <- msgConnection(conn, read_size = length(partial))
130 |     readMsgs(conn) %is% list("hello", 1:10)
131 |     close(conn)
132 | })
133 | 
134 | test_that("rawBuffer", {
135 |     x <- rawBuffer(as.raw(1:5))
136 |     tryCatch({
137 |         readRaw(x, 3) %is% as.raw(1:3)
138 |         writeRaw(as.raw(6:10), x)
139 |         readRaw(x, 5) %is% as.raw(4:8)
140 |         readRaw(x, 10) %is% as.raw(9:10)
141 |         readRaw(x, 10) %is% raw(0)
142 |         writeRaw(as.raw(11:25), x)
143 |         readRaw(x, 100) %is% as.raw(11:25)
144 |         writeRaw(as.raw(26:27), x)
145 |         readRaw(x, 2) %is% as.raw(26:27)
146 |         readRaw(x, 0) %is% raw(0)
147 |         readRaw(x, 10) %is% raw(0)
148 |     }, finally = {
149 |         close(x)
150 |     })
151 | })
152 | 
153 | test_that("read non-blocking and underflow handling when variously interrupted",
154 | {
155 |     orig <- list("hello",  c("hello", "world"), list("hello", "world", c(1, 2, 3)))
156 |     packed <- packMsgs(orig)
157 | 
158 |     cut <- 28
159 |     for (cut in 1:(length(packed) - 1)) {
160 |         firstChunk <- packed[1:cut]
161 |         secondChunk <- packed[  (cut+1) : (length(packed)) ]
162 |         con <- msgConnection(rawBuffer(firstChunk))
163 |         expect_error(read1 <- readMsgs(con), NA, info = paste0("at cut ", cut))
164 |         writeRaw(secondChunk, con)
165 |         expect_error(read2 <- readMsgs(con), NA, info = paste0("at cut ", cut))
166 |         expect_equal(c(read1, read2), orig, info = paste0("at cut ", cut))
167 |         close(con)
168 |     }
169 | })
170 | 
171 | test_that("assembling an array > read_size", {
172 |     mess <- (1:25) + rep(0, 1000)
173 |     c <- msgConnection(rawBuffer(raw(0)), read_size=32)
174 |     writeMsg(mess, c)
175 |     readMsgs(c) %is% list(mess)
176 |     close(c)
177 | })
178 | 
179 | test_that("resume from interrupt when message >> read_size",
180 | {
181 |     mess <- (1:25) + rep(0, 1000)
182 |     con <- msgConnection(rawBuffer(raw(0)), read_size=32)
183 |     bin <- packMsg(mess)
184 |     writeRaw(bin[1:100], con)
185 |     readMsgs(con) %is% list()
186 |     writeRaw(bin[101:200], con)
187 |     readMsgs(con) %is% list()
188 |     writeRaw(bin[201:length(bin)], con)
189 |     readMsgs(con) %is% list(mess)
190 |     close(con)
191 | })
192 | 
193 | test_that("Assembling a string >> read size", {
194 |     ## strings are individual messages that stretch over potentially many reads.
195 |     mess <- paste0(letters[sample(26, 1000, replace=TRUE)], collapse="")
196 |     con <- msgConnection(rawBuffer(raw(0)), read_size=32)
197 |     writeMsgs(list(mess), con)
198 |     readMsgs(con) %is% list(mess)
199 |     expect_equal(seek(con, rw="r"), 1003)
200 |     close(con)
201 | })
202 | 
203 | test_that("seek method", {
204 |     con = msgConnection(rawConnection(msgs, open="r"))
205 |     readMsgs(con)
206 |     seek(con) %is% 10
207 |     close(con)
208 | 
209 |     con = msgConnection(rawConnection(msgs, open="w"))
210 |     writeMsg("hello", con)
211 |     seek(con) %is% 6
212 |     close(con)
213 | 
214 |     con = msgConnection(rawBuffer())
215 |     writeMsg("hello", con)
216 |     expect_error(seek(con))
217 |     seek(con, rw = "w") %is% 6
218 |     seek(con, rw = "r") %is% 0
219 |     readMsg(con)
220 |     seek(con, rw = "r") %is% 6
221 |     close(con)
222 | })
223 | 
224 | `%@%` <- function(x, name) {
225 |   attr(x, as.character(substitute(name)))
226 | }
227 | 
228 | test_that("large blob to force GC?", {
229 |   data <- sample(as.raw(0:255), 0x2000000, TRUE)
230 |   con <- msgConnection(rawConnection(raw(0), open="wb"))
231 |   packet <- packMsg(data)
232 |   writeMsg(data, con)
233 |   bytes <- rawConnectionValue(con)
234 |   close(con)
235 |   con2 <- msgConnection(rawConnection(bytes, open="rb"))
236 |   as.read <<- readMsg(con2)
237 |   close(con2)
238 |   expect_identical(as.read, data)
239 | })
240 | 
241 | test_that("smallish blob under gctorture", {
242 |   data <- sample(as.raw(0:255), 0x30000, TRUE)
243 |   con <- msgConnection(rawConnection(raw(0), open="wb"))
244 |   packet <- packMsg(data)
245 |   writeMsg(data, con)
246 |   bytes <- rawConnectionValue(con)
247 |   close(con)
248 |   con2 <- msgConnection(rawConnection(bytes, open="rb"))
249 |   as.read <- NULL
250 |   local({
251 |     gctorture(TRUE)
252 |     on.exit(gctorture(FALSE))
253 |     # getting Error: RAW() can only be applied to a 'raw', not a 'NULL'
254 |     # I think perhaps because the buffer gets collected.
255 |     # I should hold an external pointer to the buffer, then?
256 |     # gctorture triggers an early error "2" so yeah.
257 |     # any way to debug GC, though?
258 |     as.read <<- readMsg(con2)
259 |   })
260 |   close(con2)
261 |   expect_identical(as.read, data)
262 | })
263 | 
264 | 
265 | # I'd like to have some tests with reading/writing to a separate
266 | # process. I tried with parallel/mcfork, makeForkCluster, but forking
267 | # seems to cause weirdness with nonblocking. (however this was before
268 | # I sorted out partial reads.) Subprocess package seems to not have
269 | # that prob. but the subprocess that is created doesn't have my
270 | # functions loaded.
271 | 
272 | print.raw <- function(x, max.print = getOption("max.print"), ...) {
273 |   # Display raw vectors as hex dumps. Debugging use, not exported.
274 |   con <- pipe("hexdump -C", "wb")
275 |   if (length(x) > max.print) {
276 |     writeBin(x[1:max.print], con)
277 |     close(con)
278 |     cat(paste0("[ reached getOption(\"max.print\") -- omitted ",
279 |                length(x) - max.print,
280 |                " entries ] \n"))
281 |   } else {
282 |     writeBin(x, con[1:max.print])
283 |     close(con)
284 |   }
285 | }
286 | 


--------------------------------------------------------------------------------
/vignettes/comparison.html.asis:
--------------------------------------------------------------------------------
1 | %\VignetteIndexEntry{Which Data Format is Fastest Data Format?}
2 | %\VignetteEngine{R.rsp::asis}
3 | %\VignetteKeyword{HTML}
4 | %\VignetteKeyword{vignette}
5 | %\VignetteKeyword{package}
6 | 


--------------------------------------------------------------------------------