├── .Rbuildignore
├── .gitignore
├── .travis.yml
├── DESCRIPTION
├── LICENSE
├── NAMESPACE
├── NEWS
├── R
├── conn.R
├── decode.R
└── encode.R
├── README.Rmd
├── README.md
├── gh
├── space.svg
└── time.svg
├── inst
├── benchmarking.R
├── benchmarks.RData
└── comparison.Rmd
├── man
├── msgConnection.Rd
├── packMsg.Rd
└── unpackMsg.Rd
├── src
├── cwpack.c
├── cwpack.h
├── cwpack_defines.h
├── decode.c
├── decode.h
├── encode.c
├── encode.h
├── init.c
├── opts.h
├── utf8.c
├── utf8.h
├── vadr.c
└── vadr.h
├── tests
├── testthat.R
└── testthat
│ ├── test-static.R
│ └── test-streams.R
└── vignettes
├── comparison.html
└── comparison.html.asis
/.Rbuildignore:
--------------------------------------------------------------------------------
1 | ^Meta$
2 | ^doc$
3 | ^.*\.Rproj$
4 | ^\.Rproj\.user$
5 | ^vignettes/comparison_cache$
6 | ^vignettes/comparison_files$
7 | ^README.*
8 | .git
9 | .travis.yml
10 | gh
11 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | Meta
2 | .Rproj.user
3 | .Rhistory
4 | .RData
5 | *.so
6 | *.o
7 | *.bc
8 | README.html
9 | src/symbols.rds
10 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: r
2 | cache: packages
3 | r:
4 | - oldrel
5 | - release
6 | - devel
7 | r_github_packages:
8 | - r-lib/covr
9 |
10 | after_success:
11 | - Rscript -e 'covr::codecov()'
--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
1 | Package: msgpack
2 | Title: A Compact, High Speed Data Format
3 | Version: 1.0.1
4 | Authors@R: c(person("Peter", "Meilstrup", email="peter.meilstrup@gmail.com", role=c("aut", "cre", "cph")),
5 | person("Claes", "Wihlborg", role=c("ctr", "cph")),
6 | person("Björn", "Höhrmann", role=c("ctr", "cph")))
7 | Description: A fast C-based encoder and streaming decoder for the
8 | 'messagepack' data format. 'Messagepack' is similar in structure
9 | to 'JSON' but uses a more compact binary encoding. Based on the
10 | 'CWPack' C library.
11 | Depends:
12 | R (>= 3.3.0)
13 | License: MIT + file LICENSE
14 | Encoding: UTF-8
15 | LazyData: true
16 | Suggests:
17 | testthat, R.rsp
18 | VignetteBuilder: R.rsp
19 | RoxygenNote: 6.1.0
20 | Roxygen: list(markdown=TRUE)
21 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | YEAR: 2017-2018
2 | COPYRIGHT HOLDER: Peter Meilstrup
3 |
4 |
--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | # Generated by roxygen2: do not edit by hand
2 |
3 | S3method(close,msgConnection)
4 | S3method(partial,msgConnection)
5 | S3method(prepack,data.frame)
6 | S3method(prepack,default)
7 | S3method(readMsg,msgConnection)
8 | S3method(readMsgs,msgConnection)
9 | S3method(seek,msgConnection)
10 | S3method(status,msgConnection)
11 | S3method(summary,msgConnection)
12 | S3method(writeMsg,connection)
13 | S3method(writeMsg,msgConnection)
14 | S3method(writeMsgs,connection)
15 | S3method(writeMsgs,msgConnection)
16 | export(msgConnection)
17 | export(packMsg)
18 | export(packMsgs)
19 | export(partial)
20 | export(prepack)
21 | export(readMsg)
22 | export(readMsgs)
23 | export(status)
24 | export(unpackMsg)
25 | export(unpackMsgs)
26 | export(writeMsg)
27 | export(writeMsgs)
28 | useDynLib(msgpack, .registration = TRUE)
29 |
--------------------------------------------------------------------------------
/NEWS:
--------------------------------------------------------------------------------
1 | msgpack 1.0.1
2 | ==========
3 |
4 | Version 1.0.1
5 |
6 | * Updated an example which prevented `R CMD check` from passing on Windows.
7 | * Fixed a protection issue flagged by `rchk`.
8 |
9 | msgpack 1.0
10 | ==========
11 |
12 | Initial release.
13 |
--------------------------------------------------------------------------------
/R/conn.R:
--------------------------------------------------------------------------------
1 | #' Read and write msgpack formatted messages over R connections.
2 | #'
3 | #' A `msgConnection` object decodes msgpack messages from an
4 | #' underlying R raw connection.
5 | #'
6 | #' @param con A [connection] object open in binary mode.
7 | #' @param max_size The largest partial message to store, in
8 | #' bytes. `NA` means do not enforce a limit.
9 | #' @param read_size How many bytes to read at a time.
10 | #' @param ... Unpacking options (see [unpackMsg]).
11 | #' @return `msgConnection()` returns an object of class
12 | #' `msgConnection`.
13 | #'
14 | #' @examples
15 | #' out <- rawConnection(raw(0), open="wb")
16 | #' apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row
17 | #' length(rawConnectionValue(out))
18 | #' inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb"))
19 | #' close(out)
20 | #' readMsg(inn)
21 | #' readMsgs(inn, 3)
22 | #' length(readMsgs(inn))
23 | #' close(inn)
24 | #' @export
25 | msgConnection <- function(con, read_size=2^16, max_size=NA, ...) {
26 | partial <- raw(0)
27 | status <- "ok"
28 | bread <- 0
29 | bwrite <- 0
30 |
31 | reader <- function(desired) {
32 | # ignore "desired" and just read non-blockingly.
33 | readRaw(con, read_size)
34 | }
35 |
36 | readMsgs <- function(n=NA, read_size = parent.env(environment())$read_size) {
37 | if (is.na(n)) n <- .Machine$integer.max
38 | msgs_bytes <- unpackMsgs(partial, n, max_size = max_size, reader = reader)
39 | partial <<- msgs_bytes$remaining
40 | status <<- msgs_bytes$status
41 | bread <<- bread + msgs_bytes$bytes_read
42 | msgs_bytes$msgs
43 | }
44 |
45 | doClose <- function(...) {
46 | close(con, ...)
47 | }
48 |
49 | #msgConnection object is just the orig object with this
50 | #environment dangled off it.
51 | structure(addClass(con, "msgConnection"), reader = environment())
52 | }
53 |
54 | #' @export
55 | summary.msgConnection <- function(object, ...) {
56 | s <- NextMethod("summary")
57 | c(s, list(status = status(object)))
58 | }
59 |
60 |
61 | #' @rdname msgConnection
62 | #' @export
63 | close.msgConnection <- function(con, ...) {
64 | attr(con, "reader")$doClose(...)
65 | }
66 |
67 | catenator <- function(val=c()) {
68 | # An in-memory FIFO type object.
69 | #tracemem(val)
70 | start <- 0
71 | end <- length(val)
72 | function(x, action="store", ..., opts) {
73 |
74 | switch(action,
75 | store = {
76 | lx <- length(x)
77 | l <- length(val)
78 | if (lx > 0) {
79 | #check for overflow
80 | if (end + lx > l && start > 0) {
81 | # rotate back to start
82 | if (start > 0 && end != start) {
83 | val[1:(end-start)] <- val[(start+1):end]
84 | }
85 | end <<- end - start
86 | start <<- 0
87 | }
88 | if (end + lx > l) {
89 | # double array length
90 | length(val) <<- max(end + lx, 2 * l);
91 | }
92 |
93 | #inject new values
94 | val[ (end + 1):(end + lx) ] <<- x
95 | end <<- end + lx
96 | }
97 | dbg("lx", lx, "start", start, "end", end, "\n")
98 | x
99 | },
100 |
101 | read = {
102 | if (end > start) {
103 | val[(start+1):end]
104 | } else val[c()]
105 | },
106 |
107 | buf = {
108 | val
109 | },
110 |
111 | start = start,
112 |
113 | length = end - start,
114 |
115 | end = end,
116 |
117 | contents = {
118 | list(val, start, end)
119 | },
120 |
121 | reset = {
122 | val <<- x
123 | start <<- 0
124 | end <<- length(x)
125 | },
126 |
127 | drop = {
128 | if (x <= end - start && x >= 0) {
129 | start <<- start + x
130 | } else {
131 | stop("have ", end - start, ", can't drop ", x)
132 | }
133 | })
134 | }
135 | }
136 |
137 | lister <- function(val = list()) {
138 | n <- length(val)
139 | function(x, action="store") {
140 | switch(action,
141 | store = {
142 | if (n > length(val))
143 | length(val) <<- max(1, 2 * length(val))
144 | n <<- n + 1
145 | val[[n]] <<- x
146 | },
147 | read = {
148 | length(val) <<- n
149 | val
150 | },
151 | length = {
152 | n
153 | },
154 | clear = {
155 | n <<- 0
156 | length(val) <<- 0
157 | }
158 | )
159 | }
160 | }
161 |
162 | addClass <- function(x, classes) structure(x, class = c(classes, class(x)))
163 |
164 | #' @return `partial(con)` returns any data that has been read ahead of
165 | #' the last decoded message.
166 | #' @rdname msgConnection
167 | #' @export
168 | partial <- function(con) UseMethod("partial")
169 |
170 | #' @rdname msgConnection
171 | #' @export
172 | partial.msgConnection <- function(con) {
173 | attr(con, "reader")$partial
174 | }
175 |
176 | #' @rdname msgConnection
177 | #' @export
178 | #' @param n The maximum number of messages to read. A value of NA
179 | #' means to parse all available messages until end of input.
180 | #' @return `readMsgs(con, n)` returns a list of up to `n` decoded messages.
181 | readMsgs <- function(con, n = NA, ...) {
182 | UseMethod("readMsgs")
183 | }
184 |
185 | #' @export
186 | readMsgs.msgConnection <- function(con, n = NA, ...) {
187 | attr(con, "reader")$readMsgs(n, ...)
188 | }
189 |
190 | #' @rdname msgConnection
191 | #' @return `status(con)` returns the status of msgpack decoding on the
192 | #' connection. A value of `"ok"` indicates all requested messages
193 | #' were read, `"buffer underflow"` for a non-blocking connection
194 | #' indicates that only part of a message has been received, and
195 | #' `"end of input"` means the last available message has been read.
196 | #' Other values indicate errors encountered in decoding, which will
197 | #' effectively halt reading.
198 | #' @export
199 | status <- function(con) UseMethod("status")
200 |
201 | #' @rdname msgConnection
202 | #' @export
203 | status.msgConnection <- function(con) {
204 | attr(con, "reader")$status
205 | }
206 |
207 |
208 | #' @rdname msgConnection
209 | #' @return `seek(con)` returns the number of bytes that have been
210 | #' successfully read or written, depending on the mode of the
211 | #' connection. (Repositioning is not supported.)
212 | #' @param rw See [seek()].
213 | #' @export
214 | seek.msgConnection <- function(con, rw = summary(con)$mode, ...) {
215 | rw <- pmatch(rw, c("read", "write"), 0L)
216 | switch(rw,
217 | attr(con, "reader")$bread,
218 | attr(con, "reader")$bwrite,
219 | )
220 | }
221 |
222 | #' `readMsg(con)` reads exactly one message from a
223 | #' msgConnection, or throws an error.
224 | #'
225 | #' @rdname msgConnection
226 | #' @return `readMsg(con)` returns one decoded message.
227 | #' @export
228 | readMsg <- function(con, ...) {
229 | UseMethod("readMsg", con)
230 | }
231 |
232 | #' @export
233 | readMsg.msgConnection <- function(con, ...) {
234 | x <- readMsgs(con, 1, ...)
235 | if (length(x) < 1) {
236 | stop(status(con))
237 | }
238 | x[[1]]
239 | }
240 |
241 | #' `writeMsg(x, con)` writes a single message to a msgConnection.
242 | #'
243 | #' @rdname msgConnection
244 | #' @param obj An R object.
245 | #' @export
246 | writeMsg <- function(obj, con, ...) {
247 | UseMethod("writeMsg", con)
248 | }
249 |
250 | #' @export
251 | writeMsg.connection <- function(obj, con, ...) {
252 | writeMsgs(list(obj), con, ...)
253 | }
254 |
255 | #' @export
256 | writeMsg.msgConnection <- function(obj, con, ...) {
257 | writeMsgs(list(obj), con, ...)
258 | }
259 |
260 | #' `writeMsgs(l, conn)` writes a list of
261 | #' messages to a connection. That is, `writeMsg(1:10, conn)` writes one
262 | #' message containing an array, while `writeMsgs(1:10, conn)` writes
263 | #' ten consecutive messages each containing one integer.
264 | #'
265 | #' `writeMsg` will work with any R connection in raw mode, but reading
266 | #' requires a msgConnection object.
267 | #'
268 | #' Because msgpack messages have unpredictable length, the decoder
269 | #' reads ahead in chunks, then finds the boundaries between messages.
270 | #' Therefore when reading over a socket or a fifo it is best to use a
271 | #' nonblocking connection, and it will not work to mix readMsg and
272 | #' readBin on the same connection.
273 | #'
274 | #' If you are reading data from a not completely trusted source you
275 | #' should specify options `max_size` and `max_depth` (see
276 | #' [unpackOpts]). Without it, some deeply nested or cleverly designed
277 | #' messages can cause a stack overflow or out-of-memory error. With
278 | #' these options set, you will get an R exception instead.
279 |
280 | #' @rdname msgConnection
281 | #' @param objs A list of R objects.
282 | #' @export
283 | writeMsgs <- function(objs, con, ...) {
284 | UseMethod("writeMsgs", con)
285 | }
286 |
287 | #' @export
288 | writeMsgs.connection <- function(objs, con, ...) {
289 | writeRaw(packMsgs(objs, ...), con)
290 | }
291 |
292 | #' @export
293 | writeMsgs.msgConnection <- function(objs, con, ...) {
294 | buf <- packMsgs(objs, ...)
295 | result <- writeRaw(buf, con)
296 | attr(con, "reader")$bwrite <- attr(con, "reader")$bwrite + length(buf)
297 | invisible(result)
298 | }
299 |
300 | ## To support test harness, we use "readRaw" and "writeRaw"
301 | ## internally, instead of "readBin" which is not an S3 method
302 | readRaw <- function(con, n, ...) {
303 | UseMethod("readRaw")
304 | }
305 |
306 | writeRaw <- function(object, con, ...) {
307 | UseMethod("writeRaw", con)
308 | }
309 |
310 | readRaw.connection <- function(con, n) {
311 | # I get errors thrown (sometimes) when reading at the end of
312 | # a nonblocking fifo.
313 | tryCatch({
314 | readBin(con, 'raw', n)
315 | },
316 | error = function(e) {
317 | # warning("Ignoring ", e)
318 | raw(0)
319 | })
320 | }
321 |
322 | writeRaw.connection <- function(object, con, ...) {
323 | writeBin(object, con, ...)
324 | }
325 |
326 |
327 | ## An inefficient double ended byte buffer for test harness purposes
328 | rawBuffer <- function(object = raw(0)) {
329 | open <- "r"
330 | bytes <- length(object)
331 | buf <- rawConnection(object, open = "r")
332 | object <- NULL
333 |
334 | write <- function(object) {
335 | switch(open,
336 | "w" = {
337 | writeBin(object, buf)
338 | },
339 | "r" = {
340 | data <- readBin(buf, 'raw', bytes - seek(buf))
341 | close(buf)
342 | buf <<- rawConnection(data,
343 | open = "w")
344 | open <<- "w"
345 | writeBin(data, buf)
346 | write(object)
347 | }
348 | )
349 | }
350 |
351 | read <- function(n) {
352 | switch(open,
353 | "r" = {
354 | readBin(buf, 'raw', n)
355 | },
356 | "w" = {
357 | ##convert a write buffer into a read buffer
358 | val <- rawConnectionValue(buf)
359 | close(buf)
360 | buf <<- rawConnection(val, open = "r")
361 | bytes <<- length(val)
362 | open <<- "r"
363 | read(n)
364 | }
365 | )
366 | }
367 |
368 | doClose <- function(n) {
369 | close(buf)
370 | buf <- NULL
371 | }
372 |
373 | structure(list(environment()), class = "rawBuffer")
374 | }
375 |
376 | writeRaw.rawBuffer <- function(object, con) {
377 | con[[1]]$write(object)
378 | }
379 |
380 | readRaw.rawBuffer <- function(con, n) {
381 | con[[1]]$read(n)
382 | }
383 |
384 | close.rawBuffer <- function(con) {
385 | con[[1]]$doClose()
386 | }
387 |
--------------------------------------------------------------------------------
/R/decode.R:
--------------------------------------------------------------------------------
1 | #' Decode msgpack messages.
2 | #'
3 | #' `unpackMsg` converts a raw array containing one message in msgpack
4 | #' format into the corresponding R data structure.
5 | #'
6 | #' @param x A [raw()] object, perhaps read from a file or socket.
7 | #' @param ... Options passed to [unpackOpts].
8 | #' @return `unpackMsg(x)` returns one decoded message (which might be
9 | #' shorter than the input raw), or throws an error.
10 | #' @useDynLib msgpack, .registration = TRUE
11 | #' @examples
12 | #' msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3,
13 | #' 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00))
14 | #' unpackMsg(msg)
15 | #' @export
16 | unpackMsg <- function(x, ...) {
17 | .Call("_unpack_msg", x, unpackOpts(...))
18 | }
19 |
20 | dbg <- alist
21 | #dbg <- cat
22 |
23 | #' `unpackMsgs` extracts a number of msgpack messages from a raw object.
24 | #'
25 | #' @param n How many messages to read. An "NA" here means to read as
26 | #' much as possible.
27 | #' @param reader For implementing connections; a function that takes
28 | #' no arguments and returns a raw containing more data.
29 | #' @return `unpackMsgs(r, n)` returns a list `X` with four elements:
30 | #' * `X$msgs` is a list of the messages unpacked.
31 | #' * `X$remaining` is data remaining to be parsed.
32 | #' * `X$status` is a status message, typically "ok", "end of input",
33 | #' or "buffer underflow".
34 | #' * `x$bytes_read` the number of bytes consumed.
35 | #'
36 | #' @examples
37 | #' x <- packMsgs(list("one", "two", "three"))
38 | #' unpackMsgs(x, 2)
39 | #' @rdname unpackMsg
40 | #' @export
41 | unpackMsgs <- function(x, n = NA, reader = NULL, ...) {
42 | if (is.na(n)) {
43 | n <- .Machine$integer.max
44 | }
45 |
46 | bread <- 0 # bytes read
47 | saveMessage <- lister() # messages read so far
48 | saveData <- catenator(x) # bytes pending
49 | offset <- 0 # position within buffer
50 |
51 | status <- "ok"
52 |
53 | readMore <- function(current, desired) {
54 | dbg("readMore:", "current =", current, "desired =", desired, "\n")
55 | start <- saveData(action="start")
56 | saveData(reader(desired))
57 | new_start <- saveData(action="start")
58 | current <- new_start - start + current
59 | result <- c(saveData(action="contents"), current)
60 | # cat("after read: "); print(saveData(action="contents"))
61 | result
62 | }
63 |
64 | opts = unpackOpts(...,
65 | underflow_handler = if (is.null(reader)) NULL else readMore)
66 |
67 | # cat("buffer: "); print(saveData(action="contents"))
68 | tryCatch(
69 | while(saveMessage(action="length") < n) {
70 | last_start <- saveData(action="start")
71 | result <- .Call("_unpack_msg_partial",
72 | saveData(action="buf"),
73 | offset,
74 | saveData(action="end"),
75 | opts)
76 | ## result is a pairlist( message, status, new_offset )
77 | # cat("unpack result: "); print(result)
78 | status <- result[[1]]
79 | if (status == "ok") {
80 | # got a good message,
81 | saveMessage(result[[2]])
82 | message_length <- ((result[[3]] - offset) +
83 | (saveData(action="start") - last_start))
84 | bread <- bread + message_length
85 | saveData(message_length, action="drop")
86 | offset <- result[[3]]
87 | # dbg("after 1 msg, buffer: "); print(saveData(action="contents"))
88 | } else {
89 | stop(status)
90 | }
91 | },
92 | error = function(e) {
93 | dbg("Stopping with exception: ", e$message, "in", deparse(e$call), "\n")
94 | status <<- e$message
95 | }
96 | )
97 | # dbg("after failure, buffer: "); print(saveData(action="contents"))
98 |
99 | list(msgs = saveMessage(action="read"),
100 | remaining = saveData(action="read"),
101 | status = status,
102 | bytes_read = bread)
103 | }
104 |
105 | #' [unpackOpts()] interprets is passed to `...` in [unpackMsgs()],
106 | #' [unpackMsg()], and [msgConnection()]. It is not exported.
107 | #'
108 | #' @param parent When an environment is given, (such as [emptyenv()]),
109 | #' unpack msgpack dicts into environment objects, with the given
110 | #' value as parent. This option overrides `use_df=TRUE`. Otherwise,
111 | #' unpack dicts into named vectors / lists.
112 | #' @param df When `TRUE`, convert msgpack dicts, whose elements are
113 | #' all arrays having the same length, into [data.frame()]s.
114 | #' @param simplify If `TRUE`, simplify msgpack lists into primitive
115 | #' vectors.
116 | #' @param max_size The maximum length of message to decode.
117 | #' @param max_depth The maximum degree of nesting to support.
118 | #' @param underflow_handler Used internally.
119 | #'
120 | #' @details
121 | #' The msgpack format does not have typed arrays, so all msgpack
122 | #' arrays are effectively lists from the R perspective. However, if an
123 | #' array containing compatibly typed elements is read, `unpack` will
124 | #' return a logical, integer, real or string vector as
125 | #' appropriate. This behavior is disabled with `simplify=FALSE`. The
126 | #' coercion used is more conservative than R's coercion: Integer
127 | #' values may be converted to real, but boolean values will not be
128 | #' cast to numeric, nor any types to string. If conversion from a
129 | #' large integer to real loses precision, a warning is printed.
130 | #'
131 | #' Msgpack also does not distinguish between `NA` and `NULL`. All nils
132 | #' will be decoded as NA.
133 | #'
134 | #' Strings are assumed to be UTF-8 encoded. If a msgpack string does
135 | #' not appear to be valid UTF-8, a warning is printed and a raw object
136 | #' is produced instead.
137 | #'
138 | #' Msgpack allows any type to be the key of a dict, but R only
139 | #' supports strings. If a non-string appears as key in a msgpack dict,
140 | #' it will be converted to string with [deparse()].
141 | #'
142 | #' Extension types will be decoded as raw objects with a class like
143 | #' `"ext120"` and a warning.
144 | #'
145 | #' @rdname unpackMsg
146 | unpackOpts <- function(parent = NULL,
147 | df = TRUE,
148 | simplify = TRUE,
149 | max_size = NA,
150 | max_depth = NA,
151 | underflow_handler = NULL) {
152 | .Call("_unpack_opts",
153 | parent,
154 | df,
155 | simplify,
156 | parent.env(environment()),
157 | max_size,
158 | max_depth,
159 | underflow_handler);
160 | }
161 |
162 | # Come up with a name when a non-string is used as key.
163 | repr <- function(x) {
164 | paste0(deparse(x, control = c("keepNA")), collapse="")
165 | }
166 |
--------------------------------------------------------------------------------
/R/encode.R:
--------------------------------------------------------------------------------
1 | #' Convert R objects to msgpack format.
2 | #'
3 | #' @param x An R object, which can be null, a vector, list,
4 | #' environment, raw, or any combinations thereof.
5 | #' @param ... Options passed to [packOpts()]
6 | #' @return An object of class "raw".
7 | #' @details
8 | #' Strings are re-encoded to UTF-8 if necessary. Real numbers
9 | #' taking integral values may be emitted as integers to save space.
10 | #'
11 | #' Normally an R vector of length 1 will be unboxed, e.g. packMsg(1)
12 | #' will make a msgpack integer, but packMsg(c(1,2)) will make a
13 | #' msgpack list. To prevent this and produce a list of length 1 in the
14 | #' first case, specify `as_is = TRUE`. Objects of class `AsIs` or
15 | #' `data.frame` will always be encoded as-is.
16 | #'
17 | #' A hook for pre-processing R objects before packing is supported, by
18 | #' giving the object an S3 [class] and implementing a method
19 | #' `prepack`. For instance, `prepack.data.frame(x)` simply adds the
20 | #' `"AsIs"` class to `x`.
21 | #'
22 | #' Environment objects are written out with the keys in sorted order,
23 | #' but named vectors are written in the order which the entries
24 | #' appear.
25 | #'
26 | #' Object attributes other than `names` and `class` are ignored.
27 | #'
28 | #' @examples
29 | #' packMsg( list(compact=TRUE, schema=0) )
30 | #' @export
31 | packMsg <- function(x, ...) {
32 | .Call("_pack_msg", x, packOpts(...))
33 | }
34 |
35 | #' @param xs a list of objects to pack.
36 | #' @rdname packMsg
37 | #' @examples
38 | #' x <- packMsgs(list("one", "two", "three"))
39 | #' unpackMsgs(x, 2)
40 | #' @export
41 | packMsgs <- function(xs, ...) {
42 | opts <- packOpts(...)
43 | unlist(lapply(xs, function(xx) .Call("_pack_msg", xx, opts)))
44 | }
45 |
46 | #' [packOpts()] interprets the `...` argument of packMsg and
47 | #' packMsgs. it is not exported.
48 | #'
49 | #' @param compatible If TRUE, emitted bytes conform to version 1.0 of
50 | #' msgpack encoding. This means that msgpack strings are used for
51 | #' raw objects.
52 | #' @param as_is If TRUE, singletons (R primitive vectors of length 1
53 | #' having no names attribute) are encoded as msgpack arrays of
54 | #' length 1. Otherwise singletons are simplified to msgpack scalars.
55 | #' @param use_dict If TRUE, vectors having a "names" attribute are
56 | #' encoded as dicts. If false, they are encoded as arrays and the
57 | #' names are discarded.
58 | #' @param max_size The largest buffer that will be allocated.
59 | #' @param buf_size The initial amount of memory, in bytes, to allocate
60 | #' for packing each message. This will be dynamically grown if a
61 | #' larger message is passed, so there is little reason to change
62 | #' this.
63 | #' @rdname packMsg
64 | packOpts <- function(compatible = FALSE,
65 | as_is = FALSE,
66 | use_dict = TRUE,
67 | max_size = NA,
68 | buf_size = 512) {
69 | .Call("_pack_opts",
70 | compatible,
71 | as_is,
72 | use_dict,
73 | max_size,
74 | buf_size,
75 | parent.env(environment()))
76 | }
77 |
78 | #' @rdname packMsg
79 | #' @export
80 | prepack <- function(x) UseMethod("prepack")
81 |
82 | #' @rdname packMsg
83 | #' @export
84 | prepack.default <- function(x) unclass(x)
85 |
86 | #' @rdname packMsg
87 | #' @export
88 | prepack.data.frame <- function(x) I(unclass(x))
89 |
--------------------------------------------------------------------------------
/README.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "msgpack for R"
3 | output: github_document
4 | author: Peter Meilstrup
5 | ---
6 |
7 | [](https://cran.r-project.org/package=msgpack)
8 | [](https://travis-ci.org/crowding/msgpack-r)
9 | [](https://codecov.io/gh/crowding/msgpack-r)
10 |
11 |
12 | This is a high speed [msgpack](https://msgpack.org) encoder
13 | and decoder for R, based on the [CWPack](https://github.com/clwi/CWPack) C
14 | implementation.
15 |
16 | `msgpack` is a binary data format with data structures similar to
17 | `JSON` and a compact binary encoding. It can be a drop-in replacement
18 | for `JSON` in most applications. It is designed to be fast to parse
19 | and compact to transmit and store.
20 |
21 | ```{R echo=FALSE}
22 | knitr::opts_chunk$set(collapse=TRUE)
23 | ```
24 |
25 | ## Installation
26 |
27 | From CRAN:
28 |
29 | ```{R eval=FALSE}
30 | install.packages("msgpack")
31 | ```
32 |
33 | From Github:
34 |
35 | ```{R eval=FALSE}
36 | library(devtools)
37 | install_github("crowding/msgpack-r")
38 | ```
39 |
40 | ## Usage
41 |
42 | ```{R}
43 | library(msgpack)
44 | x <- packMsg( list(compact=TRUE, schema=0) )
45 | x
46 | dput(unpackMsg( x ))
47 | ```
48 |
49 | ### Connections / Streaming
50 |
51 | Write messages one or several at a time:
52 |
53 | ```{r}
54 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc
55 | writeMsg("one", conOut)
56 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut)
57 | ```
58 |
59 | Use a `msgConnection` object to read messages one or several at a time:
60 |
61 | ```{r}
62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r"))
63 | dput(readMsgs(conIn, 2))
64 | dput(readMsg(conIn))
65 | dput(readMsgs(conIn))
66 | ```
67 |
68 | ### Performance
69 |
70 | Msgpack is fast and compact. See the [benchmarking vignette](inst/doc/comparison.html).
71 |
72 | 
73 |
74 | 
75 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | msgpack for R
2 | ================
3 | Peter Meilstrup
4 |
5 | [](https://cran.r-project.org/package=msgpack)
7 | [](https://travis-ci.org/crowding/msgpack-r)
9 | [](https://codecov.io/gh/crowding/msgpack-r)
11 |
12 | This is a high speed [msgpack](https://msgpack.org) encoder and decoder
13 | for R, based on the [CWPack](https://github.com/clwi/CWPack) C
14 | implementation.
15 |
16 | `msgpack` is a binary data format with data structures similar to `JSON`
17 | and a compact binary encoding. It can be a drop-in replacement for
18 | `JSON` in most applications. It is designed to be fast to parse and
19 | compact to transmit and store.
20 |
21 | ## Installation
22 |
23 | From CRAN:
24 |
25 | ``` r
26 | install.packages("msgpack")
27 | ```
28 |
29 | From Github:
30 |
31 | ``` r
32 | library(devtools)
33 | install_github("crowding/msgpack-r")
34 | ```
35 |
36 | ## Usage
37 |
38 | ``` r
39 | library(msgpack)
40 | x <- packMsg( list(compact=TRUE, schema=0) )
41 | x
42 | ## [1] 82 a7 63 6f 6d 70 61 63 74 c3 a6 73 63 68 65 6d 61 00
43 | dput(unpackMsg( x ))
44 | ## structure(list(compact = TRUE, schema = 0L), .Names = c("compact",
45 | ## "schema"))
46 | ```
47 |
48 | ### Connections / Streaming
49 |
50 | Write messages one or several at a time:
51 |
52 | ``` r
53 | conOut <- rawConnection(raw(0), open = "w") # or socketConnection, etc
54 | writeMsg("one", conOut)
55 | writeMsgs(list(2, c(buckle=TRUE), c(owner="my", type="shoe")), conOut)
56 | ```
57 |
58 | Use a `msgConnection` object to read messages one or several at a
59 | time:
60 |
61 | ``` r
62 | conIn <- msgConnection(rawConnection(rawConnectionValue(conOut), open = "r"))
63 | dput(readMsgs(conIn, 2))
64 | ## list("one", 2L)
65 | dput(readMsg(conIn))
66 | ## structure(TRUE, .Names = "buckle")
67 | dput(readMsgs(conIn))
68 | ## list(structure(c("my", "shoe"), .Names = c("owner", "type")))
69 | ```
70 |
71 | ### Performance
72 |
73 | Msgpack is fast and compact. See the [benchmarking
74 | vignette](inst/doc/comparison.html).
75 |
76 | 
78 |
79 | 
81 |
--------------------------------------------------------------------------------
/gh/space.svg:
--------------------------------------------------------------------------------
1 |
2 |
76 |
--------------------------------------------------------------------------------
/inst/benchmarking.R:
--------------------------------------------------------------------------------
1 | ## ----- setup
2 | ensureExists <- function(packages){
3 | installed <- library()$results[,1]
4 | needs.install <- setdiff(packages, installed)
5 | if(length(needs.install) > 0) {
6 | install.packages(needs.install)
7 | }
8 | }
9 |
10 | if (exists("clust") && !is.null(clust)) {
11 | stopCluster(clust)
12 | clust <- NULL
13 | }
14 |
15 | ensureExists(c(
16 | "knitr",
17 | "rmarkdown",
18 | "purrr",
19 | "dplyr",
20 | "magrittr",
21 | "tibble",
22 | "printr",
23 | "ggplot2",
24 | "nycflights13",
25 | "parallel",
26 | "ssh.utils",
27 | "tools",
28 | "msgpack",
29 | "jsonlite",
30 | "msgpackR",
31 | "rjson",
32 | "RJSONIO",
33 | "yaml",
34 | "broom"
35 | ))
36 |
37 | library(knitr)
38 | library(rmarkdown)
39 | library(purrr)
40 | library(dplyr)
41 | library(magrittr)
42 | library(tibble)
43 | library(printr)
44 | library(ggplot2)
45 |
46 | ## ---- dataset
47 |
48 | library(nycflights13)
49 | dataset <- as.list(as.environment("package:nycflights13"))
50 |
51 | subsample <- function(dataset, rate) {
52 | lapply(dataset, function(df) {
53 | df[1:(1 + round(nrow(df) * rate)), ]
54 | })
55 | }
56 |
57 | onerow <- subsample(dataset, 0)
58 |
59 | ## ---- definitions
60 |
61 | remoteHost <- "paradoxus.local"
62 | port <- 42170
63 |
64 | bufferBytes <- function(f) function(con, chunk = 65536L) {
65 | buf <- rawConnection(raw(0), open="w")
66 | repeat {
67 | result <- readBin(con = con, what = "raw", n = chunk)
68 | if (length(result) == 0) break
69 | writeBin(result, buf)
70 | }
71 | f(rawConnectionValue(buf))
72 | }
73 |
74 | bufferBytes2 <- function(f) function(con, chunk = 65536L) {
75 | buf <- msgpack:::catenator(raw(0)) # this is faster
76 | repeat {
77 | result <- readBin(con = con, what = "raw", n = chunk)
78 | if (length(result) == 0) break
79 | buf(result)
80 | }
81 | f(buf(action="read"))
82 | }
83 |
84 | bufferRawConn <- function(f) function(con, chunk = 65536L) {
85 | buf <- msgpack:::catenator(raw(0)) # this is faster
86 | repeat {
87 | result <- readBin(con = con, what = "raw", n = chunk)
88 | if (length(result) == 0) break
89 | buf(result)
90 | }
91 | f(rawConnection(buf(action="read"), open="r"))
92 | }
93 |
94 |
95 | bufferLines <- function(reader) function(con) {
96 | lines <- msgpack:::catenator(character(0))
97 | repeat {
98 | l <- readLines(con)
99 | if (length(l) == 0) break()
100 | lines(l)
101 | }
102 | buf <- textConnection(lines(action="read"))
103 | on.exit(close(buf))
104 | reader(buf)
105 | }
106 |
107 | forked <- function(ifParent, ifChild, catch=TRUE) {
108 | # Fork the R process. In the parent process call ifParent(), in
109 | # the child process call ifChild(). Return a list of return value of
110 | # both functions, or error messages.
111 | other <- parallel:::mcfork()
112 | if (inherits(other, "masterProcess")) {
113 | # we are child process
114 | if (catch) {
115 | tryCatch({
116 | result <- ifChild(x)
117 | parallel:::mcexit(0, send = result)
118 | },
119 | error =
120 | function(e) {
121 | parallel:::mcexit(1, send = list(error=e, calls=sys.calls()))
122 | }
123 | )
124 | } else {
125 | result <- ifChild(x)
126 | parallel:::mcexit(0, send = result)
127 | }
128 | } else {
129 | # we are master process
130 | if (catch) {
131 | mine <- tryCatch(ifParent(x),
132 | error = if (catch) function(e) list(error=e, calls = sys.calls()) else NULL)
133 | } else {
134 | mine <- ifParent(x)
135 | }
136 | theirs <- tryCatch(
137 | unserialize(parallel:::readChild(other)),
138 | error = if(catch) function(e) list(error=e, calls=sys.calls()) else NULL)
139 | child_pids <- vapply(parallel:::children(), function(x) x$pid, 0)
140 | if (other$pid %in% child_pids) {
141 | warning("Killing child process ", deparse(other))
142 | parallel:::mckill(other, tools::SIGTERM)
143 | }
144 | c(theirs, mine)
145 | }
146 | }
147 |
148 | return_result <- FALSE
149 | # Function accepts a set of sys.time readings and computes timings
150 | times <- function(start.write, end.write,
151 | start.read, end.read,
152 | bytes, result,
153 | start.parent, end.parent, ...) {
154 | c(list(extra=list(list(...))),
155 | if (missing(end.read) || missing(end.write))
156 | list()
157 | else
158 | c(read.user = end.read[["user.self"]] - start.read[["user.self"]],
159 | read.sys = end.read[["sys.self"]] - start.read[["sys.self"]],
160 | read.elapsed = end.read[["elapsed"]] - start.read[["elapsed"]],
161 | write.user = end.write[["user.self"]] - start.write[["user.self"]],
162 | write.sys = end.write[["sys.self"]] - start.write[["sys.self"]],
163 | write.elapsed = end.write[["elapsed"]] - start.write[["elapsed"]],
164 | total.user =
165 | ( end.read[["user.self"]] + end.write[["user.self"]]
166 | - start.write[["user.self"]] - start.read[["user.self"]]),
167 | total.sys =
168 | ( end.read[["sys.self"]] + end.write[["sys.self"]]
169 | - start.write[["sys.self"]] - start.read[["sys.self"]]),
170 | total.elapsed = max(end.read[["elapsed"]] - start.read[["elapsed"]],
171 | end.write[["elapsed"]] - start.write[["elapsed"]])),
172 | bytes = if (missing(bytes)) list() else bytes,
173 | result = if (missing(result) || !return_result) list() else list(result),
174 | parent = if (missing(start.parent)) list()
175 | else c(user = (end.parent[["user.self"]] - start.parent[["user.self"]]),
176 | sys = (end.parent[["sys.self"]] - start.parent[["sys.self"]]),
177 | elapsed = (end.parent[["elapsed"]] - start.parent[["elapsed"]])))
178 | }
179 |
180 | bytes <- function(x) {
181 | switch(mode(x),
182 | raw = {
183 | length(x)
184 | },
185 | character = {
186 | length(x) + sum(nchar(x))
187 | })
188 | }
189 |
190 | timeConvert <- function(data,
191 | from = unserialize,
192 | to = function(data) serialize(data,NULL),
193 | wrap = identity, ...) {
194 | force(data)
195 | start.write <- proc.time()
196 | enc <- to(data)
197 | end.write <- proc.time()
198 | as.read <- from(enc)
199 | end.read <- proc.time()
200 | bytes <- bytes(enc)
201 | times(start.write, end.write,
202 | end.write, end.read,
203 | bytes(enc), as.read)
204 | }
205 |
206 |
207 | timeConnection <- function(..., raw = TRUE) {
208 | if (raw) {
209 | timeRawConnection(...)
210 | } else {
211 | timeTextConnection(...)
212 | }
213 | }
214 |
215 | timeRawConnection <- function(data,
216 | reader = unserialize,
217 | writer = serialize,
218 | wrap = identity, ...) {
219 | force(data)
220 | conn <- wrap(rawConnection(raw(0), open="wb"))
221 | on.exit(close(conn), add=TRUE)
222 | start.write <- proc.time()
223 | writer(data, conn)
224 | end.write <- proc.time()
225 | bytes <- rawConnectionValue(conn)
226 | conn2 <- wrap(rawConnection(bytes, open="rb"))
227 | on.exit(close(conn2), add=TRUE)
228 | start.read <- proc.time()
229 | as.read <- reader(conn2)
230 | end.read <- proc.time()
231 | times(start.write, end.write,
232 | end.write, end.read,
233 | length(bytes), as.read)
234 | }
235 |
236 | timeTextConnection <- function(data,
237 | reader = function(x) source(x, TRUE),
238 | writer = function(x, conn) dump("x", conn),
239 | wrap = identity, ...) {
240 | force(data)
241 | theText <- character(0)
242 | conn <- textConnection(NULL, open="w")
243 | on.exit(close(conn), add=TRUE)
244 | start.write <- proc.time()
245 | writer(data, conn)
246 | end.write <- proc.time()
247 | theText <- textConnectionValue(conn)
248 | nbytes <- bytes(theText)
249 | conn2 <- wrap(textConnection(theText, open="r"))
250 | on.exit(close(conn2), add=TRUE)
251 | start.read <- proc.time()
252 | as.read <- reader(conn2)
253 | end.read <- proc.time()
254 | times(start.write, end.write,
255 | end.write, end.read,
256 | nbytes, as.read)
257 | }
258 |
259 | timeFileIO <- function(data,
260 | reader = unserialize,
261 | writer = serialize,
262 | raw = TRUE,
263 | wrap = identity, ...) {
264 | force(data)
265 | fnam <- tempfile()
266 | on.exit(unlink(fnam, force=TRUE))
267 | con <- file(fnam, open=paste0("w", if(raw) "b" else ""), raw=raw)
268 | start.write <- proc.time()
269 | writer(data, con)
270 | nbytes <- seek(con)
271 | close(con)
272 | end.write <- proc.time()
273 | con <- wrap(file(fnam, open=paste0("r", if(raw) "b" else ""), raw=raw))
274 | on.exit(close(con), add=TRUE)
275 | start.read <- proc.time()
276 | as.read <- reader(con)
277 | end.read <- proc.time()
278 | times(start.write, end.write,
279 | start.read, end.read,
280 | nbytes, as.read)
281 | }
282 |
283 | timeSocketTransfer <- function(data,
284 | reader = unserialize,
285 | writer = serialize,
286 | wrap = identity,
287 | raw = TRUE,
288 | catch = FALSE, ...) {
289 | force(data)
290 | doRead <- function(other) {
291 | conn <- wrap(socketConnection(port = port,
292 | server = TRUE,
293 | blocking = TRUE,
294 | open = paste0("r", if(raw) "b" else "")))
295 | on.exit(close(conn))
296 | start.read <- proc.time()
297 | as.read <- reader(conn)
298 | end.read <- proc.time()
299 | list(start.read = start.read,
300 | end.read = end.read)
301 | }
302 | doWrite <- function(other) {
303 | Sys.sleep(0.5)
304 | conn <- wrap(socketConnection(port = port, server = FALSE,
305 | blocking = TRUE,
306 | open = paste0("w", if(raw) "b" else "")))
307 | start.write <- proc.time()
308 | on.exit(close(conn))
309 | writer(data, conn)
310 | flush(conn)
311 | end.write <- proc.time()
312 | list(start.write = start.write,
313 | end.write = end.write, bytes=NA)
314 | }
315 | start.parent <- proc.time()
316 | results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch)
317 | end.parent <- proc.time()
318 |
319 | do.call(times, c(results,
320 | list(start.parent = start.parent, end.parent = end.parent)),
321 | quote=TRUE)
322 | }
323 |
324 | timeFifoTransfer <- function(data,
325 | reader = unserialize,
326 | writer = serialize,
327 | wrap = identity,
328 | catch = FALSE, ...) {
329 | force(data)
330 | fnam <- tempfile()
331 | on.exit(unlink(fnam, force = TRUE))
332 | system(paste("mkfifo", fnam))
333 |
334 | doRead <- function(other) {
335 | Sys.sleep(0.5)
336 | conn <- wrap(fifo(fnam, open = "rb", blocking = TRUE))
337 | on.exit({
338 | close(conn)
339 | })
340 | start.read <- proc.time()
341 | as.read <- reader(conn)
342 | end.read <- proc.time()
343 | list(start.read = start.read,
344 | end.read = end.read)
345 | }
346 | doWrite <- function(other) {
347 | conn <- fifo(fnam, open = "wb", blocking = TRUE)
348 | on.exit({
349 | close(conn)
350 | })
351 | start.write <- proc.time()
352 | writer(data, conn)
353 | flush(conn)
354 | end.write <- proc.time()
355 | list(start.write = start.write,
356 | end.write = end.write,
357 | bytes = NA)
358 | }
359 |
360 | start.parent <- proc.time()
361 | results <- forked(ifChild = doWrite, ifParent = doRead, catch = catch)
362 | end.parent <- proc.time()
363 |
364 | do.call(times, c(results,
365 | list(start.parent = start.parent,
366 | end.parent = end.parent)),
367 | quote=TRUE)
368 | }
369 |
370 | clust <<- NULL
371 | startRemote <- function() {
372 | message("starting remote")
373 | ssh.utils::run.remote("killall R", remoteHost)
374 | clust <- makePSOCKcluster(nnodes=1, c(remoteHost), rscript = "Rscript")
375 | rtmp <- clusterCall(clust, tempdir)
376 | message("rtmp is ", rtmp)
377 | message("getwd is ", getwd())
378 |
379 | ssh.utils::cp.remote("", "../inst/benchmarking.R",
380 | remoteHost, paste0(rtmp, "/benchmarking.R"))
381 | clusterCall(clust, options, repos = getOption("repos"))
382 | parallel::clusterCall(clust, source, paste0(rtmp, "/benchmarking.R"))
383 | clust <<- clust
384 | message("started remote")
385 | }
386 |
387 | doRemoteWrite <- function(data, host, port, wrap, raw, writer) {
388 | tryCatch({
389 | Sys.sleep(1)
390 | conn <- wrap(socketConnection(host = host,
391 | port = port,
392 | server = FALSE,
393 | blocking = TRUE,
394 | open = paste0("w", if(raw) "b" else "")))
395 | start.write <- proc.time()
396 | on.exit(close(conn))
397 | writer(data, conn)
398 | flush(conn)
399 | end.write <- proc.time()
400 | list(start.write = start.write,
401 | end.write = end.write, bytes=NA)
402 | },
403 | error = function(e) list(error=e, calls=sys.calls()))
404 | }
405 |
406 | doRemoteRead <- function(port, wrap, raw, reader) {
407 | # place logging statements in here...
408 |
409 | tryCatch({
410 | sock <- socketConnection(port = port,
411 | server = TRUE,
412 | blocking = TRUE,
413 | open = paste0("r", if(raw) "b" else ""),
414 | timeout = 60)
415 | conn <- wrap(sock)
416 | on.exit(close(conn))
417 | start.read <- proc.time()
418 | as.read <- reader(conn)
419 | end.read <- proc.time()
420 | list(start.read = start.read,
421 | end.read = end.read)
422 | },
423 | error = function(e) list(error=e, calls=sys.calls())
424 | )
425 | }
426 |
427 | timeRemoteTransfer <- function(data,
428 | reader = bufferBytes(unserialize),
429 | writer = serialize,
430 | wrap = identity,
431 | raw = TRUE,
432 | ...) {
433 | force(data)
434 | if (is.null(clust)) {
435 | startRemote()
436 | }
437 |
438 | parallel:::sendCall(clust[[1]],
439 | doRemoteRead,
440 | list(port, wrap, raw, reader))
441 | local_results <- tryCatch(
442 | doRemoteWrite(data, remoteHost, port, wrap, raw, writer),
443 | error = function(e) list(error = e, calls = sys.calls()))
444 | remote_results <- parallel:::recvResult(clust[[1]])
445 | do.call(times, c(local_results, remote_results), quote=TRUE)
446 | }
447 |
448 | timeCurve <- function(dataset,
449 | method,
450 | timeout = 60,
451 | start = 0.01,
452 | max = 1, ...
453 | ) {
454 | results <- data_frame()
455 | current <- start
456 | if (missing(method)) {
457 | message("method missing???")
458 | return(data.frame())
459 | }
460 | while (current <= max) {
461 | message(paste0("size = ", current))
462 | data <- subsample(dataset, current)
463 | result <- method(data, ...)
464 | results <- bind_rows(results, as_tibble(c(size = current, result)))
465 | if ("total.elapsed" %in% names(result)) {
466 | if (result$total.elapsed > timeout) break() else NULL
467 | } else {
468 | break()
469 | }
470 | if (current == max) break()
471 | current = min(current * sqrt(2), max)
472 | }
473 | results
474 | }
475 |
476 | arg_df <- function(tests) (tests
477 | # produces an arg data frame: the labels in columns and the argument
478 | # data structure in column "args"
479 | %>% map(names)
480 | %>% cross_df()
481 | %>% pmap_dfr(function(...) {
482 | labels <- list(...)
483 | arglist <- pmap(list(labels, names(labels)),
484 | function(label, column) list(tests[[column]][[label]]))
485 | c(labels, args = list(list(arglist)))
486 | })
487 | %>% mutate(args = (args # unlist the args twice
488 | %>% map(.
489 | %>% unlist(recursive = FALSE, use.names = FALSE)
490 | %>% unlist(recursive = FALSE, use.names = TRUE))))
491 | )
492 |
493 | `%*%` <- intersect
494 |
495 | run_tests <- function(arg_df) {
496 | pmap_dfr(arg_df, function(..., args) {
497 | labels <- list(...)
498 | message(paste0(collapse = "\n", deparse(labels, control=c())))
499 | # what I want is a bind syntax like
500 | ## bind({
501 | ## list(strategy = ?fun, ??args) <- args
502 | ## fun(!!args)
503 | ## })
504 | fun <- args$strategy
505 | args <- args[names(args) != "strategy"]
506 | the_env <- list2env(args, parent = environment())
507 |
508 | # this dance is to avoid calling do.call with a giant unnamed dataset
509 | # (which causes R to spin several minutes on writing a traceback)
510 | call <- as.call(c(quote(fun),
511 | structure(map(names(args), as.name),
512 | names = names(args))))
513 | results <- eval(call, the_env)
514 | as_data_frame(c(labels, results))
515 | })
516 | }
517 |
518 | store <- function(data, dataset) {
519 | # key on all character columns the two tables hav in common
520 | existing.keys <- names(benchmarks)[map_lgl(benchmarks, is.character)]
521 | new.keys <- names(data)[map_lgl(data, is.character)]
522 | keys <- intersect(existing.keys, new.keys)
523 | message(paste("Replacing on keys:", paste0("\"", keys, "\"", collapse=", ")))
524 | if (length(keys) > 0) {
525 | dataset <- (dataset # update benchmarks
526 | %>% anti_join(data, by=keys)
527 | %>% bind_rows(data)
528 | )
529 | } else {
530 | dataset <- data
531 | }
532 | dataset
533 | }
534 |
535 | common.options <- list(
536 | strategy = list(
537 | timeCurve = list(strategy = timeCurve, timeout = 10, start = 0.001)),
538 | dataset = list(
539 | nycflights13 = list(data = dataset)))
540 |
541 | conversion.methods <- list(
542 | method = list(
543 | convert = list(method = timeConvert))
544 | )
545 |
546 | synchronous.methods <- list(
547 | method = list(
548 | conn = list(method = timeConnection),
549 | file = list(method = timeFileIO)
550 | ))
551 |
552 | concurrent.methods <- list(
553 | method = list(
554 | remote = list(method = timeRemoteTransfer),
555 | socket = list(method = timeSocketTransfer),
556 | fifo = list(method = timeFifoTransfer))
557 | )
558 |
559 | connection.methods <- list(
560 | method = c(concurrent.methods$method,
561 | synchronous.methods$method)
562 | )
563 |
564 | convert.common.options <- c(
565 | common.options,
566 | conversion.methods
567 | )
568 |
569 | raw.common.options <- c(
570 | common.options,
571 | conversion.methods,
572 | raw = list('TRUE' = list(raw = TRUE))
573 | )
574 |
575 | text.common.options <- c(
576 | common.options,
577 | conversion.methods,
578 | raw = list('FALSE' = list(raw = FALSE))
579 | )
580 |
581 | all.common.options <- c(
582 | common.options,
583 | list(
584 | method = c(connection.methods$method,
585 | conversion.methods$method))
586 | )
587 |
588 | buffer_read_options <- function(reader = identity,
589 | raw = FALSE,
590 | buffer = (if (raw) bufferBytes2 else bufferLines)) {
591 | c(common.options,
592 | list(raw = structure(list(list(raw = raw)), names = as.character(raw))),
593 | list(method = list(
594 | conn = list(method = timeConnection,
595 | reader = reader),
596 | file = list(method = timeFileIO,
597 | reader = reader),
598 | fifo = list(method = timeFifoTransfer,
599 | reader = buffer(reader)),
600 | socket = list(method = timeSocketTransfer,
601 | reader = buffer(reader)),
602 | remote = list(method = timeRemoteTransfer,
603 | reader = buffer(reader))
604 | ))
605 | )
606 | }
607 |
608 | combine_opts <- function(x, ...) {
609 | for (l in list(...)) {
610 | for (n in names(l)) {
611 | if (n %in% names(x)) {
612 | x[[n]] <- c(x[[n]], l[[n]])
613 | } else {
614 | x[[n]] <- c(x[[n]], l[[n]])
615 | }
616 | }
617 | }
618 | x;
619 | }
620 |
621 | `%but%` <- function(l, r) {
622 | l[names(r)] <- r;
623 | l
624 | }
625 |
626 | showTimings <- function(timings, label, ...) {
627 | stats <- with(timings, c(
628 | "total.elapsed (s)" = total.elapsed,
629 | "read.cpu (s)" = read.user + read.sys,
630 | "read.elapsed (s)" = read.elapsed,
631 | "write.cpu (s)" = write.user + write.sys,
632 | "write.elapsed (s)" = write.elapsed,
633 | "data size (MB)" = bytes / 0x100000
634 | ))
635 | kable(stats, col.names = label, digits=2)
636 | }
637 |
638 | ## list(x = ?first, ??rest) -> list()
639 |
640 | ## f <- function(a, call) bind({
641 | ## list(?first, ??rest) <- callList
642 | ## })
643 |
644 | ## Local Variables:
645 | ## ess-r-package-info: ("msgpack" . "~/msgpack/")
646 | ## End:
647 |
--------------------------------------------------------------------------------
/inst/benchmarks.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/crowding/msgpack-r/c77554d7c5fe0e10bb46d5284d451337813734e8/inst/benchmarks.RData
--------------------------------------------------------------------------------
/inst/comparison.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | title: "Which Data Format is Fastest Data Format?"
3 | output:
4 | "rmarkdown::html_vignette":
5 | fig_width: 6
6 | fig_height: 4
7 | toc: true
8 | df_print: tibble
9 | self_contained: true
10 | dev: "svglite"
11 | vignette: >
12 | %\VignetteIndexEntry{Which Data Format is Fastest Data Format\?}
13 | %\VignetteEngine{R.rsp::asis}
14 | %\VignetteKeyword{IO}
15 | %\VignetteKeyword{file}
16 | %\VignetteKeyword{benchmark}
17 | %\VignetteKeyword{connection}
18 | %\VignetteKeyword{interface}
19 | %\VignetteKeyword{msgpack}
20 | %\VignetteKeyword{vignette}
21 | %\VignetteKeyword{package}
22 | ---
23 |
24 | It's good to get data into and out of R! More compatibility, more
25 | data, more better. But which is fastest?
26 |
27 | Of course, speed might not be your only criterion. Different data
28 | formats have different capabilities and purposes:
29 |
30 | * `serialize()` is your best best for getting R objects out of and
31 | back into R the way you had them, but doesn't do much for you when
32 | communicating with other systems.
33 | * `dump()` is your next best bet, while being a somewhat
34 | human-readable text format.
35 | * Neither of above are good for exchanging data with other entities
36 | you don't trust.
37 | * *JSON* is widely used on the web, but because it is based on
38 | Javascript data structures, it doesn't seamlessly represent R
39 | objects. R attributes like `dims` and `class` don't have equivalents
40 | in Javascript. Different packages also take different approaches in
41 | representing JSON objects in R, and vice versa.
42 | * *CSV* is ubiquitous and can be read by most anything but only represents
43 | tabular data. Its data types are ambiguous.
44 | * *Msgpack* has a data model compatible with JSON but is a binary
45 | format. For web applications, replacing JSON with msgpack is an easy
46 | way to save bandwidth and CPU usage. I'm considering msgpack as a
47 | convenient wire format for embedded devices.
48 |
49 |
50 | ```{R setup, cache=FALSE, echo=FALSE, message=FALSE}
51 | library(knitr)
52 | opts_chunk$set(cache = TRUE, autodep = TRUE, message = FALSE, warning = FALSE)
53 | message("cache path is '", opts_chunk$get("cache.path"), "'")
54 | ```
55 |
56 | All that said.... Which is fastest? Which is most efficient? Under
57 | which set of arbitrarily chosen tests? I'll give more details below,
58 | but first I'll show the results. In both graphs, lower is better.
59 |
60 | ```{R, fig_width=6, fig_height=6, fig.asp = 1, cache=FALSE}
61 | load_cache("timings","timing.plot")
62 | ```
63 |
64 | On the horizontal axis is the normalized size of the object being
65 | transmitted. On the vertical is the time taken to transmit the test
66 | dataset. There are four scenarios corresponding to how the dataset is
67 | transmitted or stored. All tests are using the default options
68 | for each respective library.
69 |
70 | This following graph shows the amount of storage used to encode the
71 | test dataset.
72 |
73 | ```{R, cache=FALSE}
74 | load_cache("datasize","datasize.plot")
75 | ```
76 |
77 | # Benchmarking Data Export / Import
78 |
79 | This document is included as a static vignette, because running
80 | benchmark code would not be kind to package builders.
81 |
82 | I'll try picking a reference dataset and timing how long it takes to
83 | encode and decode it. I'll start with the dataset `nycflights13`,
84 | consisting of five data frames, since it's large enough to take a
85 | measurable time to process.
86 |
87 | The file `inst/benchmarking.R` contains code for test generation and
88 | timing.
89 |
90 | ```{R cache = FALSE, results = "hide"}
91 | source(system.file("benchmarking.R", package = "msgpack"))
92 | ```
93 |
94 | I will test each encoder under the following scenarios:
95 |
96 | * `convert` is for just converting an R object to an R raw or
97 | character object and back, in memory.
98 | * `conn` also does an in-memory conversion but with R `textConnection`
99 | or `rawConnection` objects. (The `rawConnection` objects seem to
100 | work faster, even with text formats.)
101 | * `file` writes to a temp file and then reads it back.
102 | * `fifo` forks the R process and writes from one while reading
103 | from the other over a UNIX named pipe.
104 | * `tcp` forks the R process and writes from one to the other using a
105 | TCP socket.
106 | * `remote` connects to a R process on a remote machine, and sends
107 | data from the local to the remote over a TCP socket.
108 |
109 | Each test will record the OS-reported CPU time, as well as elapsed
110 | clock time for reading, writing and total.
111 |
112 | For `fifo`, `tcp`, and `remote` tests, the reading and writing may
113 | happen concurrently, so that `total.elapsed < read.elapsed +
114 | write.elapsed`. On the other hand, some packages cannot cope with
115 | reading from asynchronous connections (more on this in the
116 | details). In these cases I need to make sure all data is read into a
117 | buffer before calling the decoder.
118 |
119 | These packages turn out to vary in performance by a couple orders of
120 | magnitude. So the test harness has to dynamically vary tne input size
121 | so as to not take forever to run. It starts with a small subsample
122 | of the dataset and increases its size until the encode and decode
123 | takes 10 seconds (or the entire dataset is transmitted). (See function
124 | `testCurve` in `inst/benchmarking.R`.).
125 |
126 | On to the per-package benchmarks.
127 |
128 | ## R serialization
129 |
130 | `serialize` and `unserialize` produce faithful replications of R
131 | objects including R-specific structures like closures, environments,
132 | and attributes. But generally only R code can read it. It is useful
133 | for communicating between R processes.
134 |
135 | For instance:
136 |
137 | ```{R}
138 | unserialize.inmem <- timeConvert(dataset,
139 | unserialize,
140 | function(data) serialize(data, NULL))
141 | showTimings(unserialize.inmem, "R serialization (in memory)")
142 | ```
143 |
144 | This is reasonably speedy, too.
145 |
146 | However, I run into a problem if I try to transfer too large an object over
147 | a fifo or socket and read it with `unserialize().`
148 |
149 | ```{R, error=TRUE}
150 | unserialize.bad <- timeFifoTransfer(dataset, unserialize, serialize, catch=FALSE)
151 | ```
152 |
153 | This appears to be because `unserialize` doesn't operate concurrently,
154 | in the sense that it doesn't recover from finding the end of the line
155 | having only read part of a message. Meanwhile, on my machine `fifo()`
156 | and `socketConnection()` do not seem to block even if `blocking =
157 | TRUE` is set. They always wait for at least one byte, but may return
158 | fewer than requested. So `unserialize` does not work easily with socket
159 | connections.
160 |
161 | One workaround is to exhaustively read the connection before handing
162 | off the data to `unserialize`. I handle that off screen in
163 | `bufferBytes` in `inst/benchmarking.R`.
164 |
165 | ```{R}
166 | unserialize.socket <- timeSocketTransfer(dataset,
167 | bufferBytes(unserialize),
168 | serialize)
169 |
170 | showTimings(unserialize.socket,
171 | c("R serialization (over TCP, same host, blocking read)"))
172 | ```
173 |
174 | But this strategy only works for transmitting one object per
175 | connection, and for one connection at a time.
176 |
177 | Another way you can do it is to wrap R serialization with
178 | `msgpack::msgConnection`. This only adds a few bytes to each message,
179 | and msgpack will handle assembling complete messages. This also allows
180 | you to send several values per connection, or poll several connections
181 | connections until one of them returns a decoded message.
182 |
183 | ```{R}
184 | unserialize.wrapped_socket <- timeSocketTransfer(dataset,
185 | unserialize,
186 | serialize,
187 | wrap = msgpack::msgConnection)
188 | ```
189 |
190 | ```{R}
191 | showTimings(unserialize.wrapped_socket,
192 | "R serialization over msgpack over TCP (same host)")
193 | ```
194 |
195 | Surprisingly, this actually works faster. `R
196 | unserialize.wrapped_socket$total.elapsed <
197 | unserialize.socket$total.elapsed || stop()), include=FALSE`. One
198 | hypothesis that might account for this is that there are fewer write
199 | syscalls if the object is serialized into memory before sending, and
200 | fewer read syscalls if the object is prepended with its length before
201 | reading.
202 |
203 | Now I'll start collecting benchmarks systematically. Since the methods
204 | we will explore have such a wide variance of performance, we will test
205 | them with successively larger datasets, until they exceed a
206 | timeout. The `serialize.spec` data structure annotated below specifies
207 | how to test R serialization and how to label the results. See the code
208 | file `inst/benchmarking.R` for more details.
209 |
210 | ```{R serialization}
211 | # A spec is a named list of test factors.
212 | # A test factor is a named list of arguments that will be passed to the
213 | # test function at each factor level.
214 | # (Actually go look at the simpler test spec for msgpack before trying to
215 | # grok this)
216 | serialize.spec <- combine_opts( # options of the same name concatenated
217 | list(
218 | method = list( # the factor "method"
219 | convert = list(method = timeConvert)), # has a level "convert"
220 | # that passes timeConvert
221 | # to the "method"
222 | # argument of the test
223 | # function
224 | encoder = list( # the factor "encoder"
225 | serialize = list( # has a level named "serialize"
226 | writer = serialize, # with these arguments specifying, reader, writer, to, from
227 | from = unserialize,
228 | to = function(data) serialize(data, NULL)))),
229 | # Combined with these common options (that include connection-based
230 | # test methods
231 | buffer_read_options(reader = unserialize, raw = TRUE)
232 | )
233 |
234 | # arg_df takes the above spec and takes its outer product, generating
235 | # a data frame with case labels and arguments used
236 | serialize.calls <- arg_df(serialize.spec)
237 | ```
238 |
239 | ```{R}
240 | serialize.timings <- run_tests(serialize.calls)
241 | ```
242 |
243 | ```{R include=FALSE, results="hide"}
244 | benchmarks <- data.frame()
245 | ```
246 |
247 | ```{R include=FALSE, results="hide"}
248 | benchmarks <- store(serialize.timings, benchmarks)
249 | ```
250 |
251 | ```{R}
252 | test.plot <- (ggplot(
253 | filter(benchmarks, encoder=="serialize"))
254 | + aes(x = size, y = total.elapsed, color = method)
255 | + geom_line()
256 | + scale_x_continuous(limits = c(0, 1), name = "Fraction of nycflights13 transmitted")
257 | + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)")
258 | )
259 | test.plot + labs(title="R serialization performance")
260 | ```
261 |
262 | Here the horizontal axis is the size of the test dataset, and the
263 | vertical axis is the time taken to transmit and receive.
264 |
265 | R's implementation of fifo connections seem to be consistently slower
266 | than other methods on the test system (OS X). The next slowest, in the
267 | case of R serialization, is the test case where data is transmitted
268 | over a gigabit link (if it matters, the reading computer is an Ubuntu
269 | box slower than the writing computer.)
270 |
271 | ## Dput/source
272 |
273 | The `dput` and `deparse` functions render R data objects to an ASCII
274 | connection in R-like syntax. The idea is that the text output "looks
275 | like" the code it takes to construct the object, to the extent that
276 | the mechanism for reading objects back in is to `eval` or `source` the
277 | text. (Hopefully one does not do this with untrusted data. A better
278 | technique may be to evaluate the data in a limited environment that
279 | just contains the needed constructors like `structure`, `list` and `c`
280 | etc.)
281 |
282 | ```{R}
283 | ## Annoyingly, the behavior of "dump" and "dput" depend on this global option.
284 | options(deparse.max.lines = NULL)
285 |
286 | #the output mimics the input
287 | dput(control=c(),
288 | list(1, "2", verb=quote(buckle), my=c("s", "h", "o", "e")))
289 | ```
290 |
291 | (Note how `dput` fails on transmitting language objects; if we try to
292 | eval the above we will try to evaluate "buckle" instead of getting
293 | just the name object. `as.name("buckle")`.)
294 |
295 | Performance-wise, `dput` and `source` should not be used for large
296 | datasets, because they display an O(n^2) characteristic in terms of
297 | the data size.
298 |
299 | ```{R dput}
300 | options(deparse.max.lines = NULL)
301 |
302 | dput.timings <- run_tests(arg_df(c(
303 | all.common.options,
304 | list(
305 | encoder = list(
306 | dput = list(
307 | from = function(t) eval(parse(text=t)),
308 | to = deparse,
309 | reader = function(c) eval(parse(c)),
310 | writer = function(x, c) dput(x, file=c)))))))
311 | ```
312 |
313 | ```{R, include = FALSE, results = "hide"}
314 | benchmarks <- store(dput.timings, benchmarks)
315 | ```
316 |
317 | ```{R}
318 | (test.plot + labs(title="dput performance")) %+% filter(benchmarks, encoder == "dput")
319 | ```
320 |
321 | It's interesting that `deparse` (which is used for the "in-memory
322 | conversion" test method labeled `conn`) is much slower than the
323 | connection-based `dput`.
324 |
325 | ## jsonlite
326 |
327 | The `jsonlite` package includes a fromJSON and toJSON implementation. It also
328 | supports streaming reads and write, but only of records consisting of
329 | one data frame per message. Data frames are sent row-wise.
330 |
331 | Since the test dataset consists of several data frames, I will send
332 | one after the other over the lifetime of one connection.
333 |
334 | ```{R jsonlite}
335 | jsonlite_reader <- function(conn) {
336 | append <- msgpack:::catenator()
337 | jsonlite::stream_in(conn, verbose = FALSE, handler = function(x) append(list(x)))
338 | append(action="read")
339 | }
340 |
341 | jsonlite_writer <- function(l, conn) {
342 | lapply(l, function(x) jsonlite::stream_out(x, verbose=FALSE, conn))
343 | }
344 |
345 | jsonlite.spec <- c(
346 | all.common.options,
347 | list(
348 | encoder = list(
349 | jsonlite = list(
350 | to = jsonlite::toJSON,
351 | from = jsonlite::fromJSON,
352 | reader = jsonlite_reader,
353 | writer = jsonlite_writer,
354 | raw = FALSE))))
355 |
356 | jsonlite.timings <- run_tests(arg_df(jsonlite.spec))
357 | ```
358 |
359 | ```{R echo=FALSE, results="hide"}
360 | getElapsed <- function(d) filter(d, size == 1, method=="convert")$total.elapsed[[1]]
361 | getRemote <- function(d) filter(d, size == 1, method=="remote")$total.elapsed[[1]]
362 | `%digits%` <- function(x, y) format(x, digits=y)
363 | ```
364 | jsonlite performs reasonably well, but is several times slower than
365 | serialization.
366 |
367 | ```{R, results = "hide"}
368 | benchmarks <- store(jsonlite.timings, benchmarks)
369 | ```
370 |
371 | ```{R}
372 | (test.plot + labs(title="jsonlite performance")) %+% filter(benchmarks, encoder == "jsonlite")
373 | ```
374 |
375 | ## Msgpack
376 |
377 | `msgpack` is the package this vignette is written for.
378 |
379 | ```{R msgpack_remote}
380 | msgpack_remote.spec <- c(
381 | common.options,
382 | list(method = list(remote = list(method=timeRemoteTransfer)),
383 | encoder = list(
384 | msgpack = list(
385 | reader = msgpack::readMsg,
386 | writer = msgpack::writeMsg,
387 | wrap = msgpack::msgConnection))))
388 |
389 | msgpackR_remote.timings <- run_tests(arg_df(msgpack_remote.spec))
390 | (test.plot + labs(title="msgpack")) %+% msgpackR_remote.timings
391 |
392 | ```
393 |
394 | ```{R msgpack}
395 | msgpack.spec <- c(
396 | all.common.options,
397 | list(
398 | encoder = list(
399 | msgpack = list(
400 | wrap = msgpack::msgConnection,
401 | reader = msgpack::readMsg,
402 | writer = msgpack::writeMsg,
403 | to = msgpack::packMsg,
404 | from = msgpack::unpackMsg))))
405 | ```
406 | ```{R}
407 | msgpack.timings <- run_tests(arg_df(msgpack.spec))
408 | ```
409 |
410 | ```{R, results = "hide"}
411 | benchmarks <- store(msgpack.timings, benchmarks)
412 | ```
413 |
414 | ```{R}
415 | (test.plot + labs(title="msgpack performance")) %+% filter(benchmarks, encoder == "msgpack")
416 | ```
417 |
418 | Implementing the streaming-mode callbacks has helped a lot, but there
419 | is still a quadratic characteristic going on here. Need to do some
420 | profiling of memory allocation.
421 |
422 | Interestingly, msgpack is already faster than serialize for the remote
423 | use case (modulo some network glitches affecting one or two
424 | datapoints.)
425 |
426 | ## msgpackR
427 |
428 | There is an older pure-R implementation of msgpack on CRAN. One quirk is that
429 | it doesn't accept `NA` in R vectors.
430 |
431 | ```{R, error = TRUE}
432 | msgpackR::pack(c(1, 2, 3))
433 | msgpackR::pack(c(1, 2, 3, NA))
434 | ```
435 |
436 | As a workaround I'll substitute out all the NA values in the dataset.
437 |
438 | ```{R}
439 | dataset_mungenull <- map(dataset, map_dfc,
440 | function(col) ifelse(is.na(col), 9999, col))
441 | ```
442 |
443 | ```{R msgpackR}
444 | msgpackR.spec <- c(
445 | all.common.options %but% list(
446 | dataset = list(nycflights13 = list(data = dataset_mungenull)),
447 | note = list("no NAs" = list())),
448 | list(
449 | encoder = list(
450 | msgpackR = list(
451 | from = msgpackR::unpack,
452 | to = msgpackR::pack,
453 | reader = bufferBytes(msgpackR::unpack),
454 | writer = function(data, conn) writeBin(msgpackR::pack(data), conn)))))
455 | ```
456 |
457 | ```{R, results = "hide"}
458 | msgpackR.timings <- run_tests(arg_df(msgpackR.spec))
459 | ```
460 |
461 | ```{R, include=FALSE, results="hide"}
462 | benchmarks <- store(msgpackR.timings, benchmarks)
463 | ```
464 |
465 | ```{R}
466 | (test.plot + labs(title="msgpackR performance")) %+% filter(benchmarks, encoder == "msgpackR")
467 | ```
468 |
469 | Performance-wise, it is quite slow, but a deeper concern is that if I
470 | send larger objects and inspect the results it sometimes looks garbled, and there
471 | are intermittent errors like:
472 |
473 | ```{R error=TRUE}
474 | str(subsample(dataset_mungenull, .000044) %>% (msgpackR::pack) %>% (msgpackR::unpack))
475 | ```
476 |
477 | ## rjson
478 |
479 | ```{R rjson}
480 | rjson.spec <- combine_opts(
481 | list(
482 | method = list(
483 | convert = list(method = timeConvert)),
484 | encoder = list(
485 | rjson = list(
486 | from = rjson::fromJSON,
487 | to = rjson::toJSON,
488 | writer = function(data, con) writeChar(rjson::toJSON(data), con)))),
489 | buffer_read_options(reader = function(con) rjson::fromJSON(file=con),
490 | raw = TRUE,
491 | buffer = bufferRawConn))
492 | rjson.timings <- run_tests(arg_df(rjson.spec))
493 | ```
494 |
495 | ```{R include=FALSE, echo=FALSE}
496 | inmemory.factor <- (getElapsed(rjson.timings) / getElapsed(msgpack.timings))
497 | remote.factor <- (getRemote(rjson.timings) / getRemote(msgpack.timings))
498 | ```
499 |
500 | `rjson` is the fastest JSON implementation -- only `r inmemory.factor %digits% 2`
501 | times slower than `msgpack`, in memory, and `r remote.factor %digits% 2`
502 | times slower across the wire. It does not
503 | support streaming reads, so we must byte-buffer to read from
504 | connections. But that turns out to be quite fast as well.
505 |
506 | ```{R results="hide"}
507 | benchmarks <- store(rjson.timings, benchmarks)
508 | ```
509 |
510 | ```{R}
511 | (test.plot + labs(title="rjson performance")) %+% filter(benchmarks, encoder == "rjson")
512 | ```
513 |
514 | ## RJSONIO
515 |
516 | I am getting the following intermittent error in RJSONIO (i.e. this
517 | document fails to render once in a while.)
518 |
519 | ```
520 | Error in RJSONIO::readJSONStream(con) : failed to parse json at 10240
521 | ```
522 |
523 | ```{R RJSONIO}
524 | RJSONIO.spec <- c(
525 | all.common.options,
526 | list(
527 | encoder = list(
528 | RJSONIO = list(
529 | from = RJSONIO::fromJSON,
530 | to = RJSONIO::toJSON,
531 | writer = function(x, con) writeBin(RJSONIO::toJSON(x), con),
532 | reader = RJSONIO::readJSONStream,
533 | raw = TRUE))))
534 |
535 | RJSONIO.timings <- run_tests(arg_df(RJSONIO.spec))
536 | ```
537 | ```{R, include = FALSE, results = "hide"}
538 | benchmarks <- store(RJSONIO.timings, benchmarks)
539 | ```
540 |
541 | RJSONIO offers a function to do streaming reads from connection, but
542 | it has much overhead compared with in-memory conversion.
543 |
544 | ```{R}
545 | (test.plot + labs(title="RJSONIO performance")) %+% filter(benchmarks, encoder == "RJSONIO")
546 | ```
547 |
548 | ## YAML
549 |
550 | YAML is kind of "JSON, but more like Markdown." Allegedly easier to
551 | read but with a more complex grammar. It's popular for config files.
552 |
553 | ```{R}
554 | cat(yaml::as.yaml(list(compact=TRUE, schema = 0)))
555 | ```
556 |
557 | Unfortunately, the YAML package produces a protection stack overflow
558 | when decoding too large a message.
559 |
560 | ```{R, error = TRUE}
561 | oops <- yaml::yaml.load(yaml::as.yaml(subsample(dataset, 0.15)))
562 | ```
563 |
564 | ```{R}
565 | yaml.timings <- run_tests(arg_df(c(
566 | all.common.options,
567 | list(
568 | note = list("Max 0.14" = list(max = 0.14)),
569 | encoder = list(
570 | yaml = list(
571 | reader = yaml::yaml.load_file,
572 | writer = function(data, conn) writeChar(yaml::as.yaml(data), conn),
573 | to = yaml::as.yaml,
574 | from = yaml::yaml.load,
575 | raw = TRUE))))))
576 | ```
577 |
578 | ```{R, results = "hide"}
579 | benchmarks <- store(yaml.timings, benchmarks)
580 | ```
581 |
582 | ```{R}
583 | (test.plot + labs(title="yaml performance")) %+% filter(benchmarks, encoder == "yaml")
584 | ```
585 |
586 | ## write.csv
587 |
588 | We're going here aren't we. Let's see if we can send messages with
589 | CSV. To send several CSV tables over one connection, I'll prepend to
590 | each message a header saying how many rows to read following.
591 |
592 | ```{R}
593 | writeCsvs <- function(data, con) {
594 | for (nm in names(data)) {
595 | write.csv(as_tibble(list(name = nm, nrows = nrow(data[[nm]]))), con)
596 | write.csv(data[[nm]], con, row.names = FALSE)
597 | }
598 | }
599 |
600 | readCsvs <- function(data, con) {
601 | output <- list()
602 | tryCatch(
603 | repeat {
604 | header <- read.csv(con2, nrows=1, stringsAsFactors = FALSE)
605 | output[[header$name]] <- read.csv(con2, nrows=header$nrows, stringsAsFactors = FALSE)
606 | },
607 | error = force)
608 | output
609 | }
610 | ```
611 |
612 | I have just had a sinking feeling that a lot of people actually build
613 | web services that talk in CSV.
614 |
615 | Unfortunately `read.table` can't cope with non-blocking connections, so
616 | I have to buffer the read into memory when reading from a fifo or socket.
617 |
618 | ```{R, results="hide"}
619 | csv.timings <- run_tests(
620 | arg_df(c(
621 | list(encoder = list(csv = list(writer = writeCsvs))),
622 | buffer_read_options(reader = readCsvs, raw = TRUE))))
623 | ```
624 |
625 | ```{R, results = "hide"}
626 | benchmarks <- store(csv.timings, benchmarks)
627 | ```
628 |
629 | ```{R}
630 | (test.plot + labs(title="R csv performance")) %+% filter(benchmarks, encoder == "csv")
631 | ```
632 |
633 | Surprisingly fast at writing to files.
634 |
635 | ## Comparison of Encoder Speed
636 |
637 | ```{R results="hide", include=FALSE}
638 | ## Derive some kind of ordering from fastest to slowest (for choosing our
639 | ## color palette):
640 | library(gnm)
641 | library(broom)
642 | library(stringr)
643 |
644 | # order the factors from high to low by ad-hoc regression model...
645 | m <- gnm(data = benchmarks,
646 | total.elapsed ~ Mult(size, method, encoder))
647 |
648 | getEncoder <- function(term) {
649 | match <- str_match(term, "\\.(size|method|encoder)(\\w*)")
650 | match <- match[,-1]
651 | dimnames(match) <- list(NULL, list("param", "value"))
652 | as_data_frame(match)
653 | }
654 |
655 | coefs <- (m %>% tidy %>% bind_cols(., getEncoder(.$term)))
656 |
657 | encoder_order <- (coefs
658 | %>% filter(param == "encoder")
659 | %>% arrange(desc(abs(estimate)))
660 | %>% .$value
661 | )
662 |
663 | method_order <- (coefs
664 | %>% filter(param == "method")
665 | %>% arrange(desc(abs(estimate)))
666 | %>% .$value
667 | )
668 | ```
669 |
670 | The most importent scenarios are conversion in memory, writing to
671 | file, writing to another process on the same host, and writing to
672 | remote host.
673 |
674 | ```{R timings, fig_width=6, fig_height=6, fig.asp = 1}
675 | library(directlabels)
676 | timing.plot <- ( benchmarks
677 | %>% subset(method %in% c("convert", "socket", "remote", "file"))
678 | %>% mutate(method = c(convert="Conversion in memory", socket="TCP (same host)",
679 | remote="TCP (over LAN)", file = "File I/O")[method])
680 | %>% mutate(encoder = factor(encoder, levels=encoder_order))
681 | %>% { (ggplot(.)
682 | + aes(y = total.elapsed, x = size, color = encoder, group = encoder)
683 | + facet_wrap(~method)
684 | + geom_point()
685 | + geom_line()
686 | + theme(aspect.ratio=1)
687 | + scale_x_continuous(name = "Fraction of nycflights13 transmitted")
688 | + scale_y_continuous(limits = c(0, NA), name = "Elapsed time (s)")
689 | + labs(title="Elapsed time to encode, write, read, decode, by package")
690 | + expand_limits(x=1.5, y=30)
691 | + geom_dl(aes(label=encoder),
692 | debug=FALSE,
693 | method = list(
694 | "last.points",
695 | rot=45,
696 | "bumpup",
697 | dl.trans(x=x+0.2, y=y+0.2)))
698 | + guides(color=FALSE)
699 | )})
700 | timing.plot
701 | ```
702 |
703 | ## Comparison of Encoder Data Usage
704 |
705 | ```{R datasize}
706 | # We didn't have time to test each encoder with the full
707 | # dataset, so we'll extrapolate from the largest data set tested
708 |
709 | needed <- (benchmarks
710 | %>% filter(!is.na(bytes))
711 | %>% select(., encoder, method, size, bytes)
712 | %>% group_by(encoder)
713 | %>% filter(row_number() == 1)
714 | %>% arrange(desc(size), .by_group=TRUE)
715 | %>% slice(1)
716 | %>% select(encoder, method)
717 | )
718 |
719 | model <- ( needed
720 | %>% inner_join(benchmarks)
721 | %>% select(bytes, encoder, size)
722 | %>% lm(formula = bytes ~ encoder * size))
723 |
724 | predicted <- (needed
725 | %>% mutate(size = 1)
726 | %>% augment(model, newdata=.)
727 | %>% rename(bytes = .fitted)
728 | )
729 |
730 | datasize.plot <- (predicted
731 | %>% mutate(encoder = factor(encoder, levels=encoder_order))
732 | %>% ggplot
733 | %>% +aes(x = reorder(encoder, bytes), y=bytes/2^20, fill=encoder)
734 | %>% +geom_col()
735 | %>% +labs(y = "MiB", x = NULL,
736 | title = "Space used to encode nycflights13 (shorter is better)")
737 | %>% +geom_dl(aes(label=encoder, group=encoder),
738 | method=c(dl.trans(y=y+0.1), "top.bumpup"))
739 | %>% +guides(fill = FALSE)
740 | %>% +theme(axis.title.x=element_blank(),
741 | axis.text.x=element_blank(),
742 | axis.ticks.x=element_blank()
743 | )
744 | )
745 | datasize.plot
746 | ```
747 |
748 | The difference in size between different JSON and msgpack
749 | implementations could use some investigation. It may down to
750 | whitespace, sending data row-wise vs. col-wise, differences in the
751 | mapping between R and data format types, or bugs.
752 |
753 | ```{R}
754 | save(benchmarks, file="../inst/benchmarks.RData")
755 | ```
756 |
--------------------------------------------------------------------------------
/man/msgConnection.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/conn.R
3 | \name{msgConnection}
4 | \alias{msgConnection}
5 | \alias{close.msgConnection}
6 | \alias{partial}
7 | \alias{partial.msgConnection}
8 | \alias{readMsgs}
9 | \alias{status}
10 | \alias{status.msgConnection}
11 | \alias{seek.msgConnection}
12 | \alias{readMsg}
13 | \alias{writeMsg}
14 | \alias{writeMsgs}
15 | \title{Read and write msgpack formatted messages over R connections.}
16 | \usage{
17 | msgConnection(con, read_size = 2^16, max_size = NA, ...)
18 |
19 | \method{close}{msgConnection}(con, ...)
20 |
21 | partial(con)
22 |
23 | \method{partial}{msgConnection}(con)
24 |
25 | readMsgs(con, n = NA, ...)
26 |
27 | status(con)
28 |
29 | \method{status}{msgConnection}(con)
30 |
31 | \method{seek}{msgConnection}(con, rw = summary(con)$mode, ...)
32 |
33 | readMsg(con, ...)
34 |
35 | writeMsg(obj, con, ...)
36 |
37 | writeMsgs(objs, con, ...)
38 | }
39 | \arguments{
40 | \item{con}{A \link{connection} object open in binary mode.}
41 |
42 | \item{read_size}{How many bytes to read at a time.}
43 |
44 | \item{max_size}{The largest partial message to store, in
45 | bytes. \code{NA} means do not enforce a limit.}
46 |
47 | \item{...}{Unpacking options (see \link{unpackMsg}).}
48 |
49 | \item{n}{The maximum number of messages to read. A value of NA
50 | means to parse all available messages until end of input.}
51 |
52 | \item{rw}{See \code{\link[=seek]{seek()}}.}
53 |
54 | \item{obj}{An R object.}
55 |
56 | \item{objs}{A list of R objects.}
57 | }
58 | \value{
59 | \code{msgConnection()} returns an object of class
60 | \code{msgConnection}.
61 |
62 | \code{partial(con)} returns any data that has been read ahead of
63 | the last decoded message.
64 |
65 | \code{readMsgs(con, n)} returns a list of up to \code{n} decoded messages.
66 |
67 | \code{status(con)} returns the status of msgpack decoding on the
68 | connection. A value of \code{"ok"} indicates all requested messages
69 | were read, \code{"buffer underflow"} for a non-blocking connection
70 | indicates that only part of a message has been received, and
71 | \code{"end of input"} means the last available message has been read.
72 | Other values indicate errors encountered in decoding, which will
73 | effectively halt reading.
74 |
75 | \code{seek(con)} returns the number of bytes that have been
76 | successfully read or written, depending on the mode of the
77 | connection. (Repositioning is not supported.)
78 |
79 | \code{readMsg(con)} returns one decoded message.
80 | }
81 | \description{
82 | A \code{msgConnection} object decodes msgpack messages from an
83 | underlying R raw connection.
84 |
85 | \code{writeMsg} will work with any R connection in raw mode, but reading
86 | requires a msgConnection object.
87 | }
88 | \details{
89 | Because msgpack messages have unpredictable length, the decoder
90 | reads ahead in chunks, then finds the boundaries between messages.
91 | Therefore when reading over a socket or a fifo it is best to use a
92 | nonblocking connection, and it will not work to mix readMsg and
93 | readBin on the same connection.
94 |
95 | If you are reading data from a not completely trusted source you
96 | should specify options \code{max_size} and \code{max_depth} (see
97 | \link{unpackOpts}). Without it, some deeply nested or cleverly designed
98 | messages can cause a stack overflow or out-of-memory error. With
99 | these options set, you will get an R exception instead.
100 | }
101 | \examples{
102 | out <- rawConnection(raw(0), open="wb")
103 | apply(quakes, 1, function(x) writeMsg(x, out)) # one message for each row
104 | length(rawConnectionValue(out))
105 | inn <- msgConnection(rawConnection(rawConnectionValue(out), open="rb"))
106 | close(out)
107 | readMsg(inn)
108 | readMsgs(inn, 3)
109 | length(readMsgs(inn))
110 | close(inn)
111 | }
112 |
--------------------------------------------------------------------------------
/man/packMsg.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/encode.R
3 | \name{packMsg}
4 | \alias{packMsg}
5 | \alias{packMsgs}
6 | \alias{packOpts}
7 | \alias{prepack}
8 | \alias{prepack.default}
9 | \alias{prepack.data.frame}
10 | \title{Convert R objects to msgpack format.}
11 | \usage{
12 | packMsg(x, ...)
13 |
14 | packMsgs(xs, ...)
15 |
16 | packOpts(compatible = FALSE, as_is = FALSE, use_dict = TRUE,
17 | max_size = NA, buf_size = 512)
18 |
19 | prepack(x)
20 |
21 | \method{prepack}{default}(x)
22 |
23 | \method{prepack}{data.frame}(x)
24 | }
25 | \arguments{
26 | \item{x}{An R object, which can be null, a vector, list,
27 | environment, raw, or any combinations thereof.}
28 |
29 | \item{...}{Options passed to \code{\link[=packOpts]{packOpts()}}}
30 |
31 | \item{xs}{a list of objects to pack.}
32 |
33 | \item{compatible}{If TRUE, emitted bytes conform to version 1.0 of
34 | msgpack encoding. This means that msgpack strings are used for
35 | raw objects.}
36 |
37 | \item{as_is}{If TRUE, singletons (R primitive vectors of length 1
38 | having no names attribute) are encoded as msgpack arrays of
39 | length 1. Otherwise singletons are simplified to msgpack scalars.}
40 |
41 | \item{use_dict}{If TRUE, vectors having a "names" attribute are
42 | encoded as dicts. If false, they are encoded as arrays and the
43 | names are discarded.}
44 |
45 | \item{max_size}{The largest buffer that will be allocated.}
46 |
47 | \item{buf_size}{The initial amount of memory, in bytes, to allocate
48 | for packing each message. This will be dynamically grown if a
49 | larger message is passed, so there is little reason to change
50 | this.}
51 | }
52 | \value{
53 | An object of class "raw".
54 | }
55 | \description{
56 | Convert R objects to msgpack format.
57 |
58 | \code{\link[=packOpts]{packOpts()}} interprets the \code{...} argument of packMsg and
59 | packMsgs. it is not exported.
60 | }
61 | \details{
62 | Strings are re-encoded to UTF-8 if necessary. Real numbers
63 | taking integral values may be emitted as integers to save space.
64 |
65 | Normally an R vector of length 1 will be unboxed, e.g. packMsg(1)
66 | will make a msgpack integer, but packMsg(c(1,2)) will make a
67 | msgpack list. To prevent this and produce a list of length 1 in the
68 | first case, specify \code{as_is = TRUE}. Objects of class \code{AsIs} or
69 | \code{data.frame} will always be encoded as-is.
70 |
71 | A hook for pre-processing R objects before packing is supported, by
72 | giving the object an S3 \link{class} and implementing a method
73 | \code{prepack}. For instance, \code{prepack.data.frame(x)} simply adds the
74 | \code{"AsIs"} class to \code{x}.
75 |
76 | Environment objects are written out with the keys in sorted order,
77 | but named vectors are written in the order which the entries
78 | appear.
79 |
80 | Object attributes other than \code{names} and \code{class} are ignored.
81 | }
82 | \examples{
83 | packMsg( list(compact=TRUE, schema=0) )
84 | x <- packMsgs(list("one", "two", "three"))
85 | unpackMsgs(x, 2)
86 | }
87 |
--------------------------------------------------------------------------------
/man/unpackMsg.Rd:
--------------------------------------------------------------------------------
1 | % Generated by roxygen2: do not edit by hand
2 | % Please edit documentation in R/decode.R
3 | \name{unpackMsg}
4 | \alias{unpackMsg}
5 | \alias{unpackMsgs}
6 | \alias{unpackOpts}
7 | \title{Decode msgpack messages.}
8 | \usage{
9 | unpackMsg(x, ...)
10 |
11 | unpackMsgs(x, n = NA, reader = NULL, ...)
12 |
13 | unpackOpts(parent = NULL, df = TRUE, simplify = TRUE,
14 | max_size = NA, max_depth = NA, underflow_handler = NULL)
15 | }
16 | \arguments{
17 | \item{x}{A \code{\link[=raw]{raw()}} object, perhaps read from a file or socket.}
18 |
19 | \item{...}{Options passed to \link{unpackOpts}.}
20 |
21 | \item{n}{How many messages to read. An "NA" here means to read as
22 | much as possible.}
23 |
24 | \item{reader}{For implementing connections; a function that takes
25 | no arguments and returns a raw containing more data.}
26 |
27 | \item{parent}{When an environment is given, (such as \code{\link[=emptyenv]{emptyenv()}}),
28 | unpack msgpack dicts into environment objects, with the given
29 | value as parent. This option overrides \code{use_df=TRUE}. Otherwise,
30 | unpack dicts into named vectors / lists.}
31 |
32 | \item{df}{When \code{TRUE}, convert msgpack dicts, whose elements are
33 | all arrays having the same length, into \code{\link[=data.frame]{data.frame()}}s.}
34 |
35 | \item{simplify}{If \code{TRUE}, simplify msgpack lists into primitive
36 | vectors.}
37 |
38 | \item{max_size}{The maximum length of message to decode.}
39 |
40 | \item{max_depth}{The maximum degree of nesting to support.}
41 |
42 | \item{underflow_handler}{Used internally.}
43 | }
44 | \value{
45 | \code{unpackMsg(x)} returns one decoded message (which might be
46 | shorter than the input raw), or throws an error.
47 |
48 | \code{unpackMsgs(r, n)} returns a list \code{X} with four elements:
49 | \itemize{
50 | \item \code{X$msgs} is a list of the messages unpacked.
51 | \item \code{X$remaining} is data remaining to be parsed.
52 | \item \code{X$status} is a status message, typically "ok", "end of input",
53 | or "buffer underflow".
54 | \item \code{x$bytes_read} the number of bytes consumed.
55 | }
56 | }
57 | \description{
58 | \code{unpackMsg} converts a raw array containing one message in msgpack
59 | format into the corresponding R data structure.
60 | }
61 | \details{
62 | The msgpack format does not have typed arrays, so all msgpack
63 | arrays are effectively lists from the R perspective. However, if an
64 | array containing compatibly typed elements is read, \code{unpack} will
65 | return a logical, integer, real or string vector as
66 | appropriate. This behavior is disabled with \code{simplify=FALSE}. The
67 | coercion used is more conservative than R's coercion: Integer
68 | values may be converted to real, but boolean values will not be
69 | cast to numeric, nor any types to string. If conversion from a
70 | large integer to real loses precision, a warning is printed.
71 |
72 | Msgpack also does not distinguish between \code{NA} and \code{NULL}. All nils
73 | will be decoded as NA.
74 |
75 | Strings are assumed to be UTF-8 encoded. If a msgpack string does
76 | not appear to be valid UTF-8, a warning is printed and a raw object
77 | is produced instead.
78 |
79 | Msgpack allows any type to be the key of a dict, but R only
80 | supports strings. If a non-string appears as key in a msgpack dict,
81 | it will be converted to string with \code{\link[=deparse]{deparse()}}.
82 |
83 | Extension types will be decoded as raw objects with a class like
84 | \code{"ext120"} and a warning.
85 | }
86 | \examples{
87 | msg <- as.raw(c(0x82, 0xa7, 0x63, 0x6f, 0x6d, 0x70, 0x61, 0x63, 0x74, 0xc3,
88 | 0xa6, 0x73, 0x63, 0x68, 0x65, 0x6d, 0x61, 0x00))
89 | unpackMsg(msg)
90 | x <- packMsgs(list("one", "two", "three"))
91 | unpackMsgs(x, 2)
92 | }
93 |
--------------------------------------------------------------------------------
/src/cwpack.c:
--------------------------------------------------------------------------------
1 | /* CWPack - cwpack.c */
2 | /*
3 | The MIT License (MIT)
4 |
5 | Copyright (c) 2017 Claes Wihlborg
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
8 | software and associated documentation files (the "Software"), to deal in the Software
9 | without restriction, including without limitation the rights to use, copy, modify,
10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
11 | persons to whom the Software is furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all copies or
14 | substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | */
22 |
23 | #include
24 |
25 | #include "cwpack.h"
26 | #include "cwpack_defines.h"
27 |
28 |
29 |
30 | /************************* C S Y S T E M L I B R A R Y ****************/
31 |
32 | #ifdef FORCE_NO_LIBRARY
33 |
34 | static void *memcpy(void *dst, const void *src, size_t n)
35 | {
36 | unsigned int i;
37 | uint8_t *d=(uint8_t*)dst, *s=(uint8_t*)src;
38 | for (i=0; istart = pack_context->current = (uint8_t*)data;
77 | pack_context->end = pack_context->start + length;
78 | pack_context->be_compatible = false;
79 | pack_context->err_no = 0;
80 | pack_context->handle_pack_overflow = hpo;
81 | pack_context->return_code = test_byte_order();
82 | return pack_context->return_code;
83 | }
84 |
85 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible)
86 | {
87 | pack_context->be_compatible = be_compatible;
88 | }
89 |
90 |
91 |
92 | /* Packing routines -------------------------------------------------------------------------------- */
93 |
94 |
95 | void cw_pack_unsigned(cw_pack_context* pack_context, uint64_t i)
96 | {
97 | if (pack_context->return_code)
98 | return;
99 |
100 | if (i < 128)
101 | tryMove0(i);
102 |
103 | if (i < 256)
104 | tryMove1(0xcc, i);
105 |
106 | if (i < 0x10000L)
107 | {
108 | tryMove2(0xcd, i);
109 | }
110 | if (i < 0x100000000LL)
111 | tryMove4(0xce, i);
112 |
113 | tryMove8(0xcf,i);
114 | }
115 |
116 |
117 | void cw_pack_signed(cw_pack_context* pack_context, int64_t i)
118 | {
119 | if (pack_context->return_code)
120 | return;
121 |
122 | if (i >127)
123 | {
124 | if (i < 256)
125 | tryMove1(0xcc, i);
126 |
127 | if (i < 0x10000L)
128 | tryMove2(0xcd, i);
129 |
130 | if (i < 0x100000000LL)
131 | tryMove4(0xce, i);
132 |
133 | tryMove8(0xcf,i);
134 | }
135 |
136 | if (i >= -32)
137 | tryMove0(i);
138 |
139 | if (i >= -128)
140 | tryMove1(0xd0, i);
141 |
142 | if (i >= -32768)
143 | tryMove2(0xd1,i);
144 |
145 | if (i >= (int64_t)0xffffffff80000000LL)
146 | tryMove4(0xd2,i);
147 |
148 | tryMove8(0xd3,i);
149 | }
150 |
151 |
152 | void cw_pack_float(cw_pack_context* pack_context, float f)
153 | {
154 | if (pack_context->return_code)
155 | return;
156 |
157 | uint32_t tmp = *((uint32_t*)&f);
158 | tryMove4(0xca,tmp);
159 | }
160 |
161 |
162 | void cw_pack_double(cw_pack_context* pack_context, double d)
163 | {
164 | if (pack_context->return_code)
165 | return;
166 |
167 | uint64_t tmp = *((uint64_t*)&d);
168 | tryMove8(0xcb,tmp);
169 | }
170 |
171 |
172 | void cw_pack_real (cw_pack_context* pack_context, double d)
173 | {
174 | float f = (float)d;
175 | double df = f;
176 | if (df == d)
177 | cw_pack_float (pack_context, f);
178 | else
179 | cw_pack_double (pack_context, d);
180 | }
181 |
182 |
183 | void cw_pack_nil(cw_pack_context* pack_context)
184 | {
185 | if (pack_context->return_code)
186 | return;
187 |
188 | tryMove0(0xc0);
189 | }
190 |
191 |
192 | void cw_pack_true (cw_pack_context* pack_context)
193 | {
194 | if (pack_context->return_code)
195 | return;
196 |
197 | tryMove0(0xc3);
198 | }
199 |
200 |
201 | void cw_pack_false (cw_pack_context* pack_context)
202 | {
203 | if (pack_context->return_code)
204 | return;
205 |
206 | tryMove0(0xc2);
207 | }
208 |
209 |
210 | void cw_pack_boolean(cw_pack_context* pack_context, bool b)
211 | {
212 | if (pack_context->return_code)
213 | return;
214 |
215 | tryMove0(b? 0xc3: 0xc2);
216 | }
217 |
218 |
219 | void cw_pack_array_size(cw_pack_context* pack_context, uint32_t n)
220 | {
221 | if (pack_context->return_code)
222 | return;
223 |
224 | if (n < 16)
225 | tryMove0(0x90 | n);
226 |
227 | if (n < 65536)
228 | tryMove2(0xdc, n);
229 |
230 | tryMove4(0xdd, n);
231 | }
232 |
233 |
234 | void cw_pack_map_size(cw_pack_context* pack_context, uint32_t n)
235 | {
236 | if (pack_context->return_code)
237 | return;
238 |
239 | if (n < 16)
240 | tryMove0(0x80 | n);
241 |
242 | if (n < 65536)
243 | tryMove2(0xde, n);
244 |
245 | tryMove4(0xdf, n);
246 | }
247 |
248 |
249 | void cw_pack_str(cw_pack_context* pack_context, const char* v, uint32_t l)
250 | {
251 | if (pack_context->return_code)
252 | return;
253 |
254 | uint8_t *p;
255 |
256 | if (l < 32) // Fixstr
257 | {
258 | cw_pack_reserve_space(l+1);
259 | *p = (uint8_t)(0xa0 + l);
260 | memcpy(p+1,v,l);
261 | return;
262 | }
263 | if (l < 256 && !pack_context->be_compatible) // Str 8
264 | {
265 | cw_pack_reserve_space(l+2);
266 | *p++ = (uint8_t)(0xd9);
267 | *p = (uint8_t)(l);
268 | memcpy(p+1,v,l);
269 | return;
270 | }
271 | if (l < 65536) // Str 16
272 | {
273 | cw_pack_reserve_space(l+3)
274 | *p++ = (uint8_t)0xda;
275 | cw_store16(l);
276 | memcpy(p+2,v,l);
277 | return;
278 | }
279 | // Str 32
280 | cw_pack_reserve_space(l+5)
281 | *p++ = (uint8_t)0xdb;
282 | cw_store32(l);
283 | memcpy(p+4,v,l);
284 | return;
285 | }
286 |
287 |
288 | void cw_pack_bin(cw_pack_context* pack_context, const void* v, uint32_t l)
289 | {
290 | if (pack_context->return_code)
291 | return;
292 |
293 | if (pack_context->be_compatible)
294 | {
295 | cw_pack_str( pack_context, v, l);
296 | return;
297 | }
298 |
299 | uint8_t *p;
300 |
301 | if (l < 256) // Bin 8
302 | {
303 | cw_pack_reserve_space(l+2);
304 | *p++ = (uint8_t)(0xc4);
305 | *p = (uint8_t)(l);
306 | memcpy(p+1,v,l);
307 | return;
308 | }
309 | if (l < 65536) // Bin 16
310 | {
311 | cw_pack_reserve_space(l+3)
312 | *p++ = (uint8_t)0xc5;
313 | cw_store16(l);
314 | memcpy(p+2,v,l);
315 | return;
316 | }
317 | // Bin 32
318 | cw_pack_reserve_space(l+5)
319 | *p++ = (uint8_t)0xc6;
320 | cw_store32(l);
321 | memcpy(p+4,v,l);
322 | return;
323 | }
324 |
325 |
326 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l)
327 | {
328 | if (pack_context->return_code)
329 | return;
330 |
331 | if (pack_context->be_compatible)
332 | PACK_ERROR(CWP_RC_ILLEGAL_CALL);
333 |
334 | uint8_t *p;
335 |
336 | switch (l)
337 | {
338 | case 1: // Fixext 1
339 | cw_pack_reserve_space(3);
340 | *p++ = (uint8_t)0xd4;
341 | *p++ = (uint8_t)type;
342 | *p++ = *(uint8_t*)v;
343 | return;
344 | case 2: // Fixext 2
345 | cw_pack_reserve_space(4);
346 | *p++ = (uint8_t)0xd5;
347 | break;
348 | case 4: // Fixext 4
349 | cw_pack_reserve_space(6);
350 | *p++ = (uint8_t)0xd6;
351 | break;
352 | case 8: // Fixext 8
353 | cw_pack_reserve_space(10);
354 | *p++ = (uint8_t)0xd7;
355 | break;
356 | case 16: // Fixext16
357 | cw_pack_reserve_space(18);
358 | *p++ = (uint8_t)0xd8;
359 | break;
360 | default:
361 | if (l < 256) // Ext 8
362 | {
363 | cw_pack_reserve_space(l+3);
364 | *p++ = (uint8_t)0xc7;
365 | *p++ = (uint8_t)(l);
366 | }
367 | else if (l < 65536) // Ext 16
368 | {
369 | cw_pack_reserve_space(l+4)
370 | *p++ = (uint8_t)0xc8;
371 | cw_store16(l);
372 | p += 2;
373 | }
374 | else // Ext 32
375 | {
376 | cw_pack_reserve_space(l+6)
377 | *p++ = (uint8_t)0xc9;
378 | cw_store32(l);
379 | p += 4;
380 | }
381 | }
382 | *p++ = (uint8_t)type;
383 | memcpy(p,v,l);
384 | }
385 |
386 |
387 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l)
388 | {
389 | uint8_t *p;
390 | cw_pack_reserve_space(l);
391 | memcpy(p,v,l);
392 | }
393 |
394 | /******************************* U N P A C K **********************************/
395 |
396 |
397 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu)
398 | {
399 | unpack_context->start = unpack_context->current = (uint8_t*)data;
400 | unpack_context->end = unpack_context->start + length;
401 | unpack_context->return_code = test_byte_order();
402 | unpack_context->err_no = 0;
403 | unpack_context->handle_unpack_underflow = huu;
404 | return unpack_context->return_code;
405 | }
406 |
407 |
408 | /* Unpacking routines ---------------------------------------------------------- */
409 |
410 |
411 |
412 | void cw_unpack_next (cw_unpack_context* unpack_context)
413 | {
414 | if (unpack_context->return_code)
415 | return;
416 |
417 | uint64_t tmpu64;
418 | uint32_t tmpu32;
419 | uint16_t tmpu16;
420 | uint8_t* p;
421 |
422 | #define buffer_end_return_code CWP_RC_END_OF_INPUT;
423 | cw_unpack_assert_space(1);
424 | uint8_t c = *p;
425 | #undef buffer_end_return_code
426 | #define buffer_end_return_code CWP_RC_BUFFER_UNDERFLOW;
427 | switch (c)
428 | {
429 | case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
430 | case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f:
431 | case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
432 | case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f:
433 | case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27:
434 | case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f:
435 | case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
436 | case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f:
437 | case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
438 | case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f:
439 | case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
440 | case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f:
441 | case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
442 | case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f:
443 | case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
444 | case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f:
445 | getDDItem(CWP_ITEM_POSITIVE_INTEGER, i64, c); return; // positive fixnum
446 | case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
447 | case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f:
448 | getDDItem(CWP_ITEM_MAP, map.size, c & 0x0f); return; // fixmap
449 | case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
450 | case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f:
451 | getDDItem(CWP_ITEM_ARRAY, array.size, c & 0x0f); return; // fixarray
452 | case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7:
453 | case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf:
454 | case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7:
455 | case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf:
456 | getDDItem(CWP_ITEM_STR, str.length, c & 0x1f); // fixstr
457 | cw_unpack_assert_blob(str);
458 | case 0xc0: unpack_context->item.type = CWP_ITEM_NIL; return; // nil
459 | case 0xc2: getDDItem(CWP_ITEM_BOOLEAN, boolean, false); return; // false
460 | case 0xc3: getDDItem(CWP_ITEM_BOOLEAN, boolean, true); return; // true
461 | case 0xc4: getDDItem1(CWP_ITEM_BIN, bin.length, uint8_t); // bin 8
462 | cw_unpack_assert_blob(bin);
463 | case 0xc5: getDDItem2(CWP_ITEM_BIN, bin.length, uint16_t); // bin 16
464 | cw_unpack_assert_blob(bin);
465 | case 0xc6: getDDItem4(CWP_ITEM_BIN, bin.length, uint32_t); // bin 32
466 | cw_unpack_assert_blob(bin);
467 | case 0xc7: getDDItem1(CWP_ITEM_EXT, ext.length, uint8_t); // ext 8
468 | cw_unpack_assert_space(1);
469 | unpack_context->item.type = *(int8_t*)p;
470 | cw_unpack_assert_blob(ext);
471 | case 0xc8: getDDItem2(CWP_ITEM_EXT, ext.length, uint16_t); // ext 16
472 | cw_unpack_assert_space(1);
473 | unpack_context->item.type = *(int8_t*)p;
474 | cw_unpack_assert_blob(ext);
475 | case 0xc9: getDDItem4(CWP_ITEM_EXT, ext.length, uint32_t); // ext 32
476 | cw_unpack_assert_space(1);
477 | unpack_context->item.type = *(int8_t*)p;
478 | cw_unpack_assert_blob(ext);
479 | case 0xca: unpack_context->item.type = CWP_ITEM_FLOAT; // float
480 | cw_unpack_assert_space(4);
481 | cw_load32(p);
482 | unpack_context->item.as.real = *(float*)&tmpu32; return;
483 | case 0xcb: getDDItem8(CWP_ITEM_DOUBLE); return; // double
484 | case 0xcc: getDDItem1(CWP_ITEM_POSITIVE_INTEGER, u64, uint8_t); return; // unsigned int 8
485 | case 0xcd: getDDItem2(CWP_ITEM_POSITIVE_INTEGER, u64, uint16_t); return; // unsigned int 16
486 | case 0xce: getDDItem4(CWP_ITEM_POSITIVE_INTEGER, u64, uint32_t); return; // unsigned int 32
487 | case 0xcf: getDDItem8(CWP_ITEM_POSITIVE_INTEGER); return; // unsigned int 64
488 | case 0xd0: getDDItem1(CWP_ITEM_NEGATIVE_INTEGER, i64, int8_t); // signed int 8
489 | if (unpack_context->item.as.i64 >= 0)
490 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
491 | return;
492 | case 0xd1: getDDItem2(CWP_ITEM_NEGATIVE_INTEGER, i64, int16_t); // signed int 16
493 | if (unpack_context->item.as.i64 >= 0)
494 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
495 | return;
496 | case 0xd2: getDDItem4(CWP_ITEM_NEGATIVE_INTEGER, i64, int32_t); // signed int 32
497 | if (unpack_context->item.as.i64 >= 0)
498 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
499 | return;
500 | case 0xd3: getDDItem8(CWP_ITEM_NEGATIVE_INTEGER); // signed int 64
501 | if (unpack_context->item.as.i64 >= 0)
502 | unpack_context->item.type = CWP_ITEM_POSITIVE_INTEGER;
503 | return;
504 | case 0xd4: getDDItemFix(1); // fixext 1
505 | case 0xd5: getDDItemFix(2); // fixext 2
506 | case 0xd6: getDDItemFix(4); // fixext 4
507 | case 0xd7: getDDItemFix(8); // fixext 8
508 | case 0xd8: getDDItemFix(16); // fixext 16
509 | case 0xd9: getDDItem1(CWP_ITEM_STR, str.length, uint8_t); // str 8
510 | cw_unpack_assert_blob(str);
511 | case 0xda: getDDItem2(CWP_ITEM_STR, str.length, uint16_t); // str 16
512 | cw_unpack_assert_blob(str);
513 | case 0xdb: getDDItem4(CWP_ITEM_STR, str.length, uint32_t); // str 32
514 | cw_unpack_assert_blob(str);
515 | case 0xdc: getDDItem2(CWP_ITEM_ARRAY, array.size, uint16_t); return; // array 16
516 | case 0xdd: getDDItem4(CWP_ITEM_ARRAY, array.size, uint32_t); return; // array 32
517 | case 0xde: getDDItem2(CWP_ITEM_MAP, map.size, uint16_t); return; // map 16
518 | case 0xdf: getDDItem4(CWP_ITEM_MAP, map.size, uint32_t); return; // map 32
519 | case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7:
520 | case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef:
521 | case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7:
522 | case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff:
523 | getDDItem(CWP_ITEM_NEGATIVE_INTEGER, i64, (int8_t)c); return; // negative fixnum
524 | default:
525 | UNPACK_ERROR(CWP_RC_MALFORMED_INPUT)
526 | }
527 |
528 | return;
529 | }
530 |
531 | #define cw_skip_bytes(n) \
532 | cw_unpack_assert_space((n)); \
533 | break;
534 |
535 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count)
536 | {
537 | if (unpack_context->return_code)
538 | return;
539 |
540 | uint32_t tmpu32;
541 | uint16_t tmpu16;
542 | uint8_t* p;
543 |
544 | while (item_count-- > 0)
545 | {
546 | #undef buffer_end_return_code
547 | #define buffer_end_return_code CWP_RC_END_OF_INPUT;
548 | cw_unpack_assert_space(1);
549 | uint8_t c = *p;
550 |
551 | #undef buffer_end_return_code
552 | #define buffer_end_return_code CWP_RC_BUFFER_UNDERFLOW;
553 | switch (c)
554 | {
555 | case 0x00: case 0x01: case 0x02: case 0x03: case 0x04: case 0x05: case 0x06: case 0x07:
556 | case 0x08: case 0x09: case 0x0a: case 0x0b: case 0x0c: case 0x0d: case 0x0e: case 0x0f:
557 | case 0x10: case 0x11: case 0x12: case 0x13: case 0x14: case 0x15: case 0x16: case 0x17:
558 | case 0x18: case 0x19: case 0x1a: case 0x1b: case 0x1c: case 0x1d: case 0x1e: case 0x1f:
559 | case 0x20: case 0x21: case 0x22: case 0x23: case 0x24: case 0x25: case 0x26: case 0x27:
560 | case 0x28: case 0x29: case 0x2a: case 0x2b: case 0x2c: case 0x2d: case 0x2e: case 0x2f:
561 | case 0x30: case 0x31: case 0x32: case 0x33: case 0x34: case 0x35: case 0x36: case 0x37:
562 | case 0x38: case 0x39: case 0x3a: case 0x3b: case 0x3c: case 0x3d: case 0x3e: case 0x3f:
563 | case 0x40: case 0x41: case 0x42: case 0x43: case 0x44: case 0x45: case 0x46: case 0x47:
564 | case 0x48: case 0x49: case 0x4a: case 0x4b: case 0x4c: case 0x4d: case 0x4e: case 0x4f:
565 | case 0x50: case 0x51: case 0x52: case 0x53: case 0x54: case 0x55: case 0x56: case 0x57:
566 | case 0x58: case 0x59: case 0x5a: case 0x5b: case 0x5c: case 0x5d: case 0x5e: case 0x5f:
567 | case 0x60: case 0x61: case 0x62: case 0x63: case 0x64: case 0x65: case 0x66: case 0x67:
568 | case 0x68: case 0x69: case 0x6a: case 0x6b: case 0x6c: case 0x6d: case 0x6e: case 0x6f:
569 | case 0x70: case 0x71: case 0x72: case 0x73: case 0x74: case 0x75: case 0x76: case 0x77:
570 | case 0x78: case 0x79: case 0x7a: case 0x7b: case 0x7c: case 0x7d: case 0x7e: case 0x7f:
571 | // unsigned fixint
572 | case 0xe0: case 0xe1: case 0xe2: case 0xe3: case 0xe4: case 0xe5: case 0xe6: case 0xe7:
573 | case 0xe8: case 0xe9: case 0xea: case 0xeb: case 0xec: case 0xed: case 0xee: case 0xef:
574 | case 0xf0: case 0xf1: case 0xf2: case 0xf3: case 0xf4: case 0xf5: case 0xf6: case 0xf7:
575 | case 0xf8: case 0xf9: case 0xfa: case 0xfb: case 0xfc: case 0xfd: case 0xfe: case 0xff:
576 | // signed fixint
577 | case 0xc0: // nil
578 | case 0xc2: // false
579 | case 0xc3: break; // true
580 | case 0xcc: // unsigned int 8
581 | case 0xd0: cw_skip_bytes(1); // signed int 8
582 | case 0xcd: // unsigned int 16
583 | case 0xd1: // signed int 16
584 | case 0xd4: cw_skip_bytes(2); // fixext 1
585 | case 0xd5: cw_skip_bytes(3); // fixext 2
586 | case 0xca: // float
587 | case 0xce: // unsigned int 32
588 | case 0xd2: cw_skip_bytes(4); // signed int 32
589 | case 0xd6: cw_skip_bytes(5); // fixext 4
590 | case 0xcb: // double
591 | case 0xcf: // unsigned int 64
592 | case 0xd3: cw_skip_bytes(8); // signed int 64
593 | case 0xd7: cw_skip_bytes(9); // fixext 8
594 | case 0xd8: cw_skip_bytes(17); // fixext 16
595 | case 0xa0: case 0xa1: case 0xa2: case 0xa3: case 0xa4: case 0xa5: case 0xa6: case 0xa7:
596 | case 0xa8: case 0xa9: case 0xaa: case 0xab: case 0xac: case 0xad: case 0xae: case 0xaf:
597 | case 0xb0: case 0xb1: case 0xb2: case 0xb3: case 0xb4: case 0xb5: case 0xb6: case 0xb7:
598 | case 0xb8: case 0xb9: case 0xba: case 0xbb: case 0xbc: case 0xbd: case 0xbe: case 0xbf:
599 | cw_skip_bytes(c & 0x1f); // fixstr
600 | case 0xd9: // str 8
601 | case 0xc4: // bin 8
602 | cw_unpack_assert_space(1);
603 | tmpu32 = *p;
604 | cw_skip_bytes(tmpu32);
605 |
606 | case 0xda: // str 16
607 | case 0xc5: // bin 16
608 | cw_unpack_assert_space(2);
609 | cw_load16(p);
610 | cw_skip_bytes(tmpu16);
611 |
612 | case 0xdb: // str 32
613 | case 0xc6: // bin 32
614 | cw_unpack_assert_space(4);
615 | cw_load32(p);
616 | cw_skip_bytes(tmpu32);
617 |
618 | case 0x80: case 0x81: case 0x82: case 0x83: case 0x84: case 0x85: case 0x86: case 0x87:
619 | case 0x88: case 0x89: case 0x8a: case 0x8b: case 0x8c: case 0x8d: case 0x8e: case 0x8f:
620 | item_count += 2*(c & 15); // FixMap
621 | break;
622 |
623 | case 0x90: case 0x91: case 0x92: case 0x93: case 0x94: case 0x95: case 0x96: case 0x97:
624 | case 0x98: case 0x99: case 0x9a: case 0x9b: case 0x9c: case 0x9d: case 0x9e: case 0x9f:
625 | item_count += c & 15; // FixArray
626 | break;
627 |
628 | case 0xdc: // array 16
629 | cw_unpack_assert_space(2);
630 | cw_load16(p);
631 | item_count += tmpu16;
632 | break;
633 |
634 | case 0xde: // map 16
635 | cw_unpack_assert_space(2);
636 | cw_load16(p);
637 | item_count += 2*tmpu16;
638 | break;
639 |
640 | case 0xdd: // array 32
641 | cw_unpack_assert_space(4);
642 | cw_load32(p);
643 | item_count += tmpu32;
644 | break;
645 |
646 | case 0xdf: // map 32
647 | cw_unpack_assert_space(4);
648 | cw_load32(p);
649 | item_count += 2*tmpu32;
650 | break;
651 |
652 | case 0xc7: // ext 8
653 | cw_unpack_assert_space(1);
654 | tmpu32 = *p;
655 | cw_skip_bytes(tmpu32 +1);
656 |
657 | case 0xc8: // ext 16
658 | cw_unpack_assert_space(2);
659 | cw_load16(p);
660 | cw_skip_bytes(tmpu16 +1);
661 |
662 | case 0xc9: // ext 32
663 | cw_unpack_assert_space(4);
664 | cw_load32(p);
665 | cw_skip_bytes(tmpu32 +1);
666 |
667 | default: // illegal
668 | UNPACK_ERROR(CWP_RC_MALFORMED_INPUT)
669 | }
670 | }
671 | return;
672 | }
673 |
674 | /* end cwpack.c */
675 |
--------------------------------------------------------------------------------
/src/cwpack.h:
--------------------------------------------------------------------------------
1 | /* CWPack - cwpack.h */
2 | /*
3 | The MIT License (MIT)
4 |
5 | Copyright (c) 2017 Claes Wihlborg
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
8 | software and associated documentation files (the "Software"), to deal in the Software
9 | without restriction, including without limitation the rights to use, copy, modify,
10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
11 | persons to whom the Software is furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all copies or
14 | substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | */
22 |
23 | #ifndef CWPack_H__
24 | #define CWPack_H__
25 |
26 |
27 | #include
28 | #include
29 | #include "opts.h"
30 |
31 |
32 |
33 | /******************************* Return Codes *****************************/
34 |
35 | #define CWP_RC_OK 0
36 | #define CWP_RC_END_OF_INPUT -1
37 | #define CWP_RC_BUFFER_OVERFLOW -2
38 | #define CWP_RC_BUFFER_UNDERFLOW -3
39 | #define CWP_RC_MALFORMED_INPUT -4
40 | #define CWP_RC_WRONG_BYTE_ORDER -5
41 | #define CWP_RC_ERROR_IN_HANDLER -6
42 | #define CWP_RC_ILLEGAL_CALL -7
43 | #define CWP_RC_MALLOC_ERROR -8
44 | #define CWP_RC_STOPPED -9
45 |
46 |
47 |
48 | /******************************* P A C K **********************************/
49 |
50 |
51 | struct cw_pack_context;
52 |
53 | typedef int (*pack_overflow_handler)(struct cw_pack_context*, unsigned long);
54 |
55 | typedef struct cw_pack_context {
56 | uint8_t* start;
57 | uint8_t* current;
58 | uint8_t* end;
59 | bool be_compatible;
60 | int return_code;
61 | int err_no; /* handlers can save error here */
62 | pack_overflow_handler handle_pack_overflow;
63 | pack_opts *opts;
64 | } cw_pack_context;
65 |
66 |
67 | int cw_pack_context_init (cw_pack_context* pack_context, void* data, unsigned long length, pack_overflow_handler hpo);
68 | void cw_pack_set_compatibility (cw_pack_context* pack_context, bool be_compatible);
69 |
70 | void cw_pack_nil (cw_pack_context* pack_context);
71 | void cw_pack_true (cw_pack_context* pack_context);
72 | void cw_pack_false (cw_pack_context* pack_context);
73 | void cw_pack_boolean (cw_pack_context* pack_context, bool b);
74 |
75 | void cw_pack_signed (cw_pack_context* pack_context, int64_t i);
76 | void cw_pack_unsigned (cw_pack_context* pack_context, uint64_t i);
77 |
78 | void cw_pack_float (cw_pack_context* pack_context, float f);
79 | void cw_pack_double (cw_pack_context* pack_context, double d);
80 | void cw_pack_real (cw_pack_context* pack_context, double d); /* Pack as float if precision isn't destroyed */
81 |
82 | void cw_pack_array_size (cw_pack_context* pack_context, uint32_t n);
83 | void cw_pack_map_size (cw_pack_context* pack_context, uint32_t n);
84 | void cw_pack_str (cw_pack_context* pack_context, const char* v, uint32_t l);
85 | void cw_pack_bin (cw_pack_context* pack_context, const void* v, uint32_t l);
86 | void cw_pack_ext (cw_pack_context* pack_context, int8_t type, const void* v, uint32_t l);
87 |
88 | void cw_pack_insert (cw_pack_context* pack_context, const void* v, uint32_t l);
89 |
90 |
91 | /***************************** U N P A C K ********************************/
92 |
93 |
94 | typedef enum
95 | {
96 | CWP_ITEM_MIN_RESERVED_EXT = -128,
97 | CWP_ITEM_MAX_RESERVED_EXT = -1,
98 | CWP_ITEM_MIN_USER_EXT = 0,
99 | CWP_ITEM_MAX_USER_EXT = 127,
100 | CWP_ITEM_NIL = 300,
101 | CWP_ITEM_BOOLEAN = 301,
102 | CWP_ITEM_POSITIVE_INTEGER = 302,
103 | CWP_ITEM_NEGATIVE_INTEGER = 303,
104 | CWP_ITEM_FLOAT = 304,
105 | CWP_ITEM_DOUBLE = 305,
106 | CWP_ITEM_STR = 306,
107 | CWP_ITEM_BIN = 307,
108 | CWP_ITEM_ARRAY = 308,
109 | CWP_ITEM_MAP = 309,
110 | CWP_ITEM_EXT = 310,
111 | CWP_NOT_AN_ITEM = 999,
112 | } cwpack_item_types;
113 |
114 |
115 | typedef struct {
116 | const void* start;
117 | uint32_t length;
118 | } cwpack_blob;
119 |
120 |
121 | typedef struct {
122 | uint32_t size;
123 | } cwpack_container;
124 |
125 |
126 | typedef struct {
127 | cwpack_item_types type;
128 | union
129 | {
130 | bool boolean;
131 | uint64_t u64;
132 | int64_t i64;
133 | float real;
134 | double long_real;
135 | cwpack_container array;
136 | cwpack_container map;
137 | cwpack_blob str;
138 | cwpack_blob bin;
139 | cwpack_blob ext;
140 | } as;
141 | } cwpack_item;
142 |
143 | struct cw_unpack_context;
144 |
145 | typedef int (*unpack_underflow_handler)(struct cw_unpack_context*, unsigned long);
146 |
147 | typedef struct cw_unpack_context {
148 | cwpack_item item;
149 | uint8_t* start;
150 | uint8_t* current;
151 | uint8_t* end; /* logical end of buffer */
152 | int return_code;
153 | int err_no; /* handlers can save error here */
154 | unpack_underflow_handler handle_unpack_underflow;
155 | unpack_opts *opts;
156 | } cw_unpack_context;
157 |
158 |
159 |
160 | int cw_unpack_context_init (cw_unpack_context* unpack_context, void* data, unsigned long length, unpack_underflow_handler huu);
161 |
162 | void cw_unpack_next (cw_unpack_context* unpack_context);
163 | void cw_skip_items (cw_unpack_context* unpack_context, long item_count);
164 |
165 |
166 | #endif /* CWPack_H__ */
167 |
--------------------------------------------------------------------------------
/src/cwpack_defines.h:
--------------------------------------------------------------------------------
1 | /* CWPack - cwpack_defines.h */
2 | /*
3 | The MIT License (MIT)
4 |
5 | Copyright (c) 2017 Claes Wihlborg
6 |
7 | Permission is hereby granted, free of charge, to any person obtaining a copy of this
8 | software and associated documentation files (the "Software"), to deal in the Software
9 | without restriction, including without limitation the rights to use, copy, modify,
10 | merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit
11 | persons to whom the Software is furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all copies or
14 | substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING
17 | BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM,
19 | DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
21 | */
22 |
23 |
24 | #ifndef cwpack_defines_h
25 | #define cwpack_defines_h
26 |
27 |
28 |
29 | /************************* A L I G N M E N T ******************************/
30 |
31 | /*
32 | * Sometime the processor demands that integer access is to an even memory address.
33 | * In that case define FORCE_ALIGNMENT
34 | */
35 |
36 | /* #define FORCE_ALIGNMENT */
37 |
38 |
39 | /************************* C S Y S T E M L I B R A R Y ****************/
40 |
41 | /*
42 | * The packer uses "memcpy" to move blobs. If you dont want to load C system library
43 | * for just that, define FORCE_NO_LIBRARY and CWPack will use an internal "memcpy"
44 | */
45 |
46 | /* #define FORCE_NO_LIBRARY */
47 |
48 |
49 |
50 | /************************* B Y T E O R D E R ****************************/
51 |
52 | /*
53 | * The pack/unpack routines are written in three versions: for big endian, for
54 | * little endian and insensitive to byte order. As you can get some speed gain
55 | * if the byte order is known, we try that when we can certainly detect it.
56 | * Define COMPILE_FOR_BIG_ENDIAN or COMPILE_FOR_LITTLE_ENDIAN if you know.
57 | */
58 |
59 | #ifndef FORCE_ALIGNMENT
60 | #if defined(__BYTE_ORDER__) && defined(__ORDER_LITTLE_ENDIAN__) && defined(__ORDER_BIG_ENDIAN__)
61 |
62 | #if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
63 | #define COMPILE_FOR_BIG_ENDIAN
64 | #elif __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
65 | #define COMPILE_FOR_LITTLE_ENDIAN
66 | #endif
67 |
68 | #elif defined(__BYTE_ORDER) && defined(__LITTLE_ENDIAN) && defined(__BIG_ENDIAN)
69 |
70 | #if __BYTE_ORDER == __BIG_ENDIAN
71 | #define COMPILE_FOR_BIG_ENDIAN
72 | #elif __BYTE_ORDER == __LITTLE_ENDIAN
73 | #define COMPILE_FOR_LITTLE_ENDIAN
74 | #endif
75 |
76 | #elif defined(__BIG_ENDIAN__)
77 | #define COMPILE_FOR_BIG_ENDIAN
78 |
79 | #elif defined(__LITTLE_ENDIAN__)
80 | #define COMPILE_FOR_LITTLE_ENDIAN
81 |
82 | #elif defined(__i386__) || defined(__x86_64__)
83 | #define COMPILE_FOR_LITTLE_ENDIAN
84 |
85 | #endif
86 | #endif
87 |
88 | //#undef COMPILE_FOR_LITTLE_ENDIAN
89 |
90 |
91 | /******************************* P A C K **********************************/
92 |
93 |
94 |
95 | #define PACK_ERROR(error_code) \
96 | { \
97 | pack_context->return_code = error_code; \
98 | return; \
99 | }
100 |
101 |
102 |
103 | #ifdef COMPILE_FOR_BIG_ENDIAN
104 |
105 | #define cw_store16(x) *(uint16_t*)p = *(uint16_t*)&x;
106 | #define cw_store32(x) *(uint32_t*)p = *(uint32_t*)&x;
107 | #define cw_store64(x) *(uint64_t*)p = *(uint64_t*)&x;
108 |
109 | #else /* Byte order little endian or undetermined */
110 |
111 | #ifdef COMPILE_FOR_LITTLE_ENDIAN
112 |
113 | #define cw_store16(d) \
114 | *(uint16_t*)p = (uint16_t)((((d) >> 8) & 0x0ff) | (d) << 8)
115 |
116 | #define cw_store32(x) \
117 | *(uint32_t*)p = \
118 | ((uint32_t)((((uint32_t)(x)) >> 24) | \
119 | (((uint32_t)(x) & 0x00ff0000) >> 8) | \
120 | (((uint32_t)(x) & 0x0000ff00) << 8) | \
121 | (((uint32_t)(x)) << 24))); \
122 |
123 | #define cw_store64(x) \
124 | *(uint64_t*)p = \
125 | ((uint64_t)( \
126 | (((((uint64_t)(x)) >> 40) | \
127 | (((uint64_t)(x)) << 24)) & 0x0000ff000000ff00ULL) | \
128 | (((((uint64_t)(x)) >> 24) | \
129 | (((uint64_t)(x)) << 40)) & 0x00ff000000ff0000ULL) | \
130 | (((uint64_t)(x) & 0x000000ff00000000ULL) >> 8) | \
131 | (((uint64_t)(x) & 0x00000000ff000000ULL) << 8) | \
132 | (((uint64_t)(x)) >> 56) | \
133 | (((uint64_t)(x)) << 56))); \
134 |
135 | #else /* Byte order undetermined */
136 |
137 | #define cw_store16(d) \
138 | *p = (uint8_t)(d >> 8); \
139 | p[1] = (uint8_t)d;
140 |
141 | #define cw_store32(d) \
142 | *p = (uint8_t)(d >> 24); \
143 | p[1] = (uint8_t)(d >> 16); \
144 | p[2] = (uint8_t)(d >> 8); \
145 | p[3] = (uint8_t)d;
146 |
147 | #define cw_store64(z) \
148 | *p = (uint8_t)(z >> 56); \
149 | p[1] = (uint8_t)(z >> 48); \
150 | p[2] = (uint8_t)(z >> 40); \
151 | p[3] = (uint8_t)(z >> 32); \
152 | p[4] = (uint8_t)(z >> 24); \
153 | p[5] = (uint8_t)(z >> 16); \
154 | p[6] = (uint8_t)(z >> 8); \
155 | p[7] = (uint8_t)z;
156 | #endif
157 | #endif
158 |
159 |
160 |
161 | #define cw_pack_reserve_space(more) \
162 | { \
163 | p = pack_context->current; \
164 | uint8_t* nyp = p + more; \
165 | if (nyp > pack_context->end) \
166 | { \
167 | if (!pack_context->handle_pack_overflow) \
168 | PACK_ERROR(CWP_RC_BUFFER_OVERFLOW) \
169 | int rc = pack_context->handle_pack_overflow (pack_context, (unsigned long)(more)); \
170 | if (rc) \
171 | PACK_ERROR(rc) \
172 | p = pack_context->current; \
173 | nyp = p + more; \
174 | } \
175 | pack_context->current = nyp; \
176 | }
177 |
178 |
179 | #define tryMove0(t) \
180 | { \
181 | uint8_t *p; \
182 | cw_pack_reserve_space(1) \
183 | *p = (uint8_t)(t); \
184 | return; \
185 | }
186 |
187 | #define tryMove1(t,d) \
188 | { \
189 | uint8_t *p; \
190 | cw_pack_reserve_space(2) \
191 | *p++ = (uint8_t)t; \
192 | *p = (uint8_t)d; \
193 | return; \
194 | }
195 |
196 | #define tryMove2(t,d) \
197 | { \
198 | uint8_t *p; \
199 | cw_pack_reserve_space(3) \
200 | *p++ = (uint8_t)t; \
201 | cw_store16(d); \
202 | return; \
203 | }
204 |
205 | #define tryMove4(t,d) \
206 | { \
207 | uint8_t *p; \
208 | cw_pack_reserve_space(5) \
209 | *p++ = (uint8_t)t; \
210 | cw_store32(d); \
211 | return; \
212 | }
213 |
214 | #define tryMove8(t,d) \
215 | { \
216 | uint8_t *p; \
217 | cw_pack_reserve_space(9) \
218 | *p++ = (uint8_t)t; \
219 | cw_store64(d); \
220 | return; \
221 | }
222 |
223 |
224 |
225 |
226 | /******************************* U N P A C K **********************************/
227 |
228 |
229 |
230 | #define UNPACK_ERROR(error_code) \
231 | { \
232 | unpack_context->item.type = CWP_NOT_AN_ITEM; \
233 | unpack_context->return_code = error_code; \
234 | return; \
235 | }
236 |
237 |
238 |
239 | #ifdef COMPILE_FOR_BIG_ENDIAN
240 |
241 | #define cw_load16(ptr) tmpu16 = *(uint16_t*)ptr;
242 | #define cw_load32(ptr) tmpu32 = *(uint32_t*)ptr;
243 | #define cw_load64(ptr) tmpu64 = *(uint64_t*)ptr;
244 |
245 | #else /* Byte order little endian or undetermined */
246 |
247 | #ifdef COMPILE_FOR_LITTLE_ENDIAN
248 |
249 | #define cw_load16(ptr) \
250 | tmpu16 = *(uint16_t*)ptr; \
251 | tmpu16 = (uint16_t)((tmpu16<<8) | (tmpu16>>8))
252 |
253 | #define cw_load32(ptr) \
254 | tmpu32 = *(uint32_t*)ptr; \
255 | tmpu32 = (tmpu32<<24) | ((tmpu32 & 0xff00)<<8) | \
256 | ((tmpu32 & 0xff0000)>>8) | (tmpu32>>24)
257 |
258 | #define cw_load64(ptr) \
259 | tmpu64 = *((uint64_t*)ptr); \
260 | tmpu64 = ( \
261 | (((tmpu64 >> 40) | \
262 | (tmpu64 << 24)) & 0x0000ff000000ff00ULL) | \
263 | (((tmpu64 >> 24) | \
264 | (tmpu64 << 40)) & 0x00ff000000ff0000ULL) | \
265 | ((tmpu64 & 0x000000ff00000000ULL) >> 8) | \
266 | ((tmpu64 & 0x00000000ff000000ULL) << 8) | \
267 | (tmpu64 >> 56) | \
268 | (tmpu64 << 56) )
269 |
270 | #else /* Byte order undetermined */
271 |
272 | #define cw_load16(ptr) \
273 | tmpu16 = (uint16_t)((*ptr++) << 8); \
274 | tmpu16 |= (uint16_t)(*ptr++)
275 |
276 | #define cw_load32(ptr) \
277 | tmpu32 = (uint32_t)(*ptr++ << 24); \
278 | tmpu32 |= (uint32_t)(*ptr++ << 16); \
279 | tmpu32 |= (uint32_t)(*ptr++ << 8); \
280 | tmpu32 |= (uint32_t)(*ptr++)
281 |
282 | #define cw_load64(ptr) \
283 | tmpu64 = ((uint64_t)*ptr++) << 56; \
284 | tmpu64 |= ((uint64_t)*ptr++) << 48; \
285 | tmpu64 |= ((uint64_t)*ptr++) << 40; \
286 | tmpu64 |= ((uint64_t)*ptr++) << 32; \
287 | tmpu64 |= ((uint64_t)*ptr++) << 24; \
288 | tmpu64 |= ((uint64_t)*ptr++) << 16; \
289 | tmpu64 |= ((uint64_t)*ptr++) << 8; \
290 | tmpu64 |= (uint64_t)*ptr++
291 |
292 | #endif
293 | #endif
294 |
295 |
296 |
297 | #define cw_unpack_assert_space(more) \
298 | { \
299 | p = unpack_context->current; \
300 | uint8_t* nyp = p + more; \
301 | if (nyp > unpack_context->end) \
302 | { \
303 | if (!unpack_context->handle_unpack_underflow) \
304 | UNPACK_ERROR(buffer_end_return_code) \
305 | int rc = unpack_context->handle_unpack_underflow (unpack_context, (unsigned long)(more)); \
306 | if (rc != CWP_RC_OK) \
307 | { \
308 | if (rc != CWP_RC_END_OF_INPUT) \
309 | UNPACK_ERROR(rc) \
310 | else \
311 | UNPACK_ERROR(buffer_end_return_code) \
312 | } \
313 | p = unpack_context->current; \
314 | nyp = p + more; \
315 | } \
316 | unpack_context->current = nyp; \
317 | }
318 |
319 |
320 | #define cw_unpack_assert_blob(blob) \
321 | cw_unpack_assert_space(unpack_context->item.as.blob.length); \
322 | unpack_context->item.as.blob.start = p; \
323 | return;
324 |
325 |
326 | #define getDDItem(typ,var,val) \
327 | unpack_context->item.type = typ; \
328 | unpack_context->item.as.var = val;
329 |
330 | #define getDDItem1(typ,var,cast) \
331 | unpack_context->item.type = typ; \
332 | cw_unpack_assert_space(1); \
333 | unpack_context->item.as.var = (cast)*p;
334 |
335 | #define getDDItem2(typ,var,cast) \
336 | unpack_context->item.type = typ; \
337 | cw_unpack_assert_space(2); \
338 | cw_load16(p); \
339 | unpack_context->item.as.var = (cast)tmpu16;
340 |
341 | #define getDDItem4(typ,var,cast) \
342 | unpack_context->item.type = typ; \
343 | cw_unpack_assert_space(4); \
344 | cw_load32(p); \
345 | unpack_context->item.as.var = (cast)tmpu32;
346 |
347 | #define getDDItem8(typ) \
348 | unpack_context->item.type = typ; \
349 | cw_unpack_assert_space(8); \
350 | cw_load64(p); \
351 | unpack_context->item.as.u64 = tmpu64;
352 |
353 | #define getDDItemFix(len) \
354 | cw_unpack_assert_space(1); \
355 | unpack_context->item.type = *(int8_t*)p; \
356 | unpack_context->item.as.ext.length = len; \
357 | cw_unpack_assert_blob(ext);
358 |
359 |
360 |
361 |
362 | #endif /* cwpack_defines_h */
363 |
--------------------------------------------------------------------------------
/src/decode.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include "decode.h"
3 |
4 | #include
5 | #include "utf8.h"
6 |
7 | #ifdef DEBUG
8 | #define LOGD(FMT, ...) \
9 | LOG("%.*s " FMT, \
10 | MIN (cxt->opts->depth, 40), \
11 | ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>!", \
12 | ##__VA_ARGS__);
13 | #else
14 | #define LOGD(...) NULL
15 | #endif
16 |
17 | SEXP extract_sexp(cw_unpack_context *);
18 | int handle_unpack_underflow(cw_unpack_context *, unsigned long);
19 | void cw_unpack_next_or_fail(cw_unpack_context *);
20 | SEXP make_sexp_from_context(cw_unpack_context *);
21 |
22 | SEXP extract_simplified_vector(cw_unpack_context *);
23 | SEXP fill_vector(cw_unpack_context *, SEXP, uint32_t, PROTECT_INDEX, SEXP);
24 | SEXP coerce(SEXP, SEXPTYPE);
25 |
26 | SEXP extract_env(cw_unpack_context *);
27 | SEXP new_env(SEXP parent);
28 |
29 | SEXP make_charsxp_or_null(cw_unpack_context *);
30 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *);
31 | SEXP make_raw_from_context(cw_unpack_context *);
32 | SEXP make_ext_from_context(cw_unpack_context *);
33 |
34 | SEXP always_make_charsxp_from_context(cw_unpack_context *);
35 |
36 | double i64_to_double(cw_unpack_context *, int64_t);
37 | double u64_to_double(cw_unpack_context *, uint64_t);
38 |
39 | SEXPTYPE type_to_sexptype(int t);
40 |
41 | const char *decode_item_type(cwpack_item_types);
42 |
43 | /* used to make WARN_ONCE work: once per call */
44 | static long calls = 0;
45 |
46 | void * dataptr_or_null(SEXP x) {
47 | switch(TYPEOF(x)) {
48 | case VECSXP: return VECTOR_PTR(x);
49 | case RAWSXP: return RAW(x);
50 | case LGLSXP: return LOGICAL(x);
51 | case INTSXP: return INTEGER(x);
52 | case REALSXP: return REAL(x);
53 | case CPLXSXP: return COMPLEX(x);
54 | case STRSXP: return STRING_PTR(x);
55 | default: return 0;
56 | }
57 | }
58 |
59 |
60 |
61 | SEXP _unpack_opts(SEXP dict,
62 | SEXP use_df,
63 | SEXP simplify,
64 | SEXP package,
65 | SEXP max_size,
66 | SEXP max_depth,
67 | SEXP underflow_handler) {
68 | SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(unpack_opts)));
69 | unpack_opts *opts = (unpack_opts*)(RAW(optsxp));
70 | opts->use_df = asLogical(use_df);
71 | opts->dict = dict;
72 | opts->simplify = asLogical(simplify);
73 | opts->package = package;
74 |
75 | opts->underflow_handler = underflow_handler;
76 | opts->buf = R_NilValue; /* the buf field is not filled out
77 | here. Whoever fills it out is responsible
78 | for managing protection. */
79 | opts->buf_index = -1;
80 | if (!R_finite(asReal(max_size))) {
81 | opts->max_pending = ULONG_MAX;
82 | LOG("max_size passed is not finite [%s: 0x%02lx]", type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size)));
83 | } else {
84 | LOG("max_size passed is finite [%s: 0x%02lx]", type2char(TYPEOF(max_size)), *((unsigned long *) dataptr_or_null(max_size)));
85 | opts->max_pending = asReal(max_size); /* as real since it may be > 2^32 */
86 | }
87 | if (!R_finite(asReal(max_depth))) {
88 | opts->max_depth = UINT_MAX;
89 | } else {
90 | opts->max_depth = asInteger(max_depth);
91 | }
92 | LOG("max_size = %f -> %lu; max_depth = %f -> %lu", asReal(max_size), opts->max_pending, asReal(max_depth), opts->max_depth);
93 |
94 | /* To make sure that R does not finalize the objects I'm using,
95 | also hang the object references off of it.
96 |
97 | The decode buffer is handled in init_unpack_context. */
98 | SEXP refs = PROTECT(CONS(dict,
99 | CONS(package,
100 | CONS(underflow_handler,
101 | R_NilValue))));
102 | setAttrib(optsxp,
103 | install("refs"),
104 | refs);
105 | UNPROTECT(2);
106 | return optsxp;
107 | }
108 |
109 |
110 | int init_unpack_context(cw_unpack_context *cxt,
111 | SEXP optsxp,
112 | SEXP buf,
113 | unsigned long start,
114 | unsigned long end) {
115 | assert_type(optsxp, RAWSXP);
116 | ASSERT(LENGTH(optsxp) == sizeof(unpack_opts));
117 |
118 | unpack_opts *opts = (unpack_opts *)RAW(optsxp);
119 | opts->depth = 0;
120 | opts->pending = 0;
121 |
122 | unpack_underflow_handler huu;
123 | if (opts->underflow_handler != R_NilValue) {
124 | huu = &handle_unpack_underflow;
125 | } else {
126 | huu = NULL;
127 | }
128 |
129 | int r = cw_unpack_context_init(cxt, RAW(buf) + start, end - start, huu);
130 | if (r)
131 | error("Could not initialize cw_unpack, %s", decode_return_code(r));
132 |
133 | cxt->opts = opts;
134 |
135 | /* Here is where we start holding a reference to the
136 | buffer... needss to be released on returning to R */
137 | if (cxt->opts->buf_index != -1) {
138 | LOGD("optsxp previously held a buffer iwth protextion index %d",
139 | cxt->opts->buf_index);
140 | }
141 | PROTECT_WITH_INDEX(cxt->opts->buf = buf, &(cxt->opts->buf_index));
142 | LOGD("First buffer is, ( 0x%x[%d:%d][%d] )",
143 | RAW(cxt->opts->buf),
144 | cxt->start - RAW(cxt->opts->buf),
145 | cxt->end - RAW(cxt->opts->buf),
146 | cxt->current - RAW(cxt->opts->buf));
147 | LOGD("buffer protected at index %d", cxt->opts->buf_index);
148 |
149 | return 1;
150 | }
151 |
152 |
153 |
154 | SEXP _unpack_msg(SEXP buf, SEXP optsxp) {
155 | calls++;
156 | cw_unpack_context cxt;
157 | int protections = init_unpack_context(&cxt, optsxp, buf, 0, LENGTH(buf));
158 | LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending);
159 | SEXP out = PROTECT(extract_sexp(&cxt));
160 | protections++;
161 | LOG("depth = %u, pending = %lu", cxt.opts->depth, cxt.opts->pending);
162 | ASSERT(cxt.opts->depth == 0);
163 | ASSERT(cxt.opts->pending == 0);
164 | UNPROTECT(protections);
165 | return out;
166 | }
167 |
168 |
169 | int handle_unpack_underflow(cw_unpack_context *cxt, unsigned long x) {
170 | unsigned long need = x - ((cxt->end - cxt->current) / sizeof(*(cxt->end)));
171 | /* bounce back to R to read X bytes */
172 |
173 | if (cxt->opts->underflow_handler != R_NilValue) {
174 | unsigned long had = 0;
175 | while(TRUE) {
176 | assert_type(cxt->opts->underflow_handler, CLOSXP);
177 | LOGD("Asking R to read %d bytes", need - had);
178 | SEXP scurrent = PROTECT(ScalarInteger(cxt->current - RAW(cxt->opts->buf)));
179 | SEXP sneed = PROTECT(ScalarInteger(need - had));
180 | SEXP call = PROTECT(lang3(cxt->opts->underflow_handler,
181 | scurrent,
182 | sneed));
183 | /* result should be a list( rawsxp, start, end, current ) */
184 | SEXP result = PROTECT(eval(call, cxt->opts->package));
185 | assert_type(result, VECSXP);
186 | ASSERT(LENGTH(result) == 4);
187 | SEXP buf = VECTOR_ELT(result, 0);
188 | assert_type(buf, RAWSXP);
189 | unsigned long start = asInteger(VECTOR_ELT(result, 1));
190 | unsigned long end = asInteger(VECTOR_ELT(result, 2));
191 | unsigned long current = asInteger(VECTOR_ELT(result, 3));
192 | unsigned long have = (end - current) - (cxt->end - cxt->current);
193 | if (have >= need) {
194 | LOGD("Swapping buffers, ( 0x%x[%d:%d][%d] -> 0x%x[%d:%d][%d] )",
195 | RAW(cxt->opts->buf),
196 | cxt->start - RAW(cxt->opts->buf),
197 | cxt->end - RAW(cxt->opts->buf),
198 | cxt->current - RAW(cxt->opts->buf),
199 | RAW(buf), start, end, current);
200 |
201 | ASSERT(cxt->opts->buf_index != -1);
202 | REPROTECT(cxt->opts->buf = buf, cxt->opts->buf_index);
203 | cxt->start = RAW(buf) + start;
204 | cxt->end = RAW(buf) + end;
205 | cxt->current = RAW(buf) + current;
206 | UNPROTECT(4);
207 | return CWP_RC_OK;
208 | } else if (have > had) {
209 | LOGD("Not enough new data read, have %d, need %d", have, need);
210 | /* try again */
211 | had = have;
212 | UNPROTECT(4);
213 | } else {
214 | LOGD("No new data read");
215 | /* give up */
216 | UNPROTECT(4);
217 | return CWP_RC_END_OF_INPUT;
218 | }
219 | }
220 | } else {
221 | LOGD("No underflow handler");
222 | return CWP_RC_END_OF_INPUT;
223 | }
224 | }
225 |
226 |
227 |
228 | SEXP _unpack_msg_partial(SEXP buf, SEXP startx, SEXP endx, SEXP optsxp) {
229 | /* Note that this must be able to cope with underflow handlers. */
230 | calls++; /* reset WARN_ONCE */
231 |
232 | cw_unpack_context cxt;
233 | assert_type(optsxp, RAWSXP);
234 |
235 | unsigned long start = asInteger(startx);
236 | unsigned long end = asInteger(endx);
237 | int protections = init_unpack_context(&cxt, optsxp, buf, start, end);
238 | LOG("Before: buf = 0x%x[%d:%d][%d], status = '%s'",
239 | RAW(cxt.opts->buf),
240 | cxt.start - RAW(cxt.opts->buf),
241 | cxt.end - RAW(cxt.opts->buf),
242 | cxt.current - RAW(cxt.opts->buf),
243 | decode_return_code(cxt.return_code));
244 |
245 | SEXP msg = PROTECT(extract_sexp(&cxt));
246 | protections++;
247 |
248 | LOG("After: buf = 0x%x[%d:%d][%d], status = '%s'",
249 | RAW(cxt.opts->buf),
250 | cxt.start - RAW(cxt.opts->buf),
251 | cxt.end - RAW(cxt.opts->buf),
252 | cxt.current - RAW(cxt.opts->buf),
253 | decode_return_code(cxt.return_code));
254 |
255 | ASSERT(cxt.opts->depth == 0);
256 | ASSERT(cxt.opts->pending == 0);
257 |
258 | SEXP out = list3(
259 | PROTECT(ScalarString(mkChar(decode_return_code(cxt.return_code)))),
260 | msg,
261 | PROTECT(ScalarInteger((cxt.current - RAW(cxt.opts->buf)) / sizeof(Rbyte))));
262 |
263 | protections += 2;
264 |
265 | UNPROTECT(protections);
266 | return out;
267 | }
268 |
269 |
270 | SEXP extract_sexp(cw_unpack_context *cxt) {
271 | cw_unpack_next_or_fail(cxt);
272 | return make_sexp_from_context(cxt);
273 | }
274 |
275 |
276 | void cw_unpack_next_or_fail(cw_unpack_context *cxt) {
277 | cw_unpack_next(cxt);
278 | if (cxt->return_code != CWP_RC_OK) {
279 | error("%s",
280 | decode_return_code(cxt->return_code));
281 | }
282 | }
283 |
284 |
285 | void add_pending(cw_unpack_context *cxt, unsigned long howmany) {
286 | if (cxt->opts->depth >= cxt->opts->max_depth) {
287 | error("Message too deeply nested");
288 | }
289 | cxt->opts->pending += howmany;
290 | if (cxt->opts->pending >= cxt->opts->max_pending) {
291 | error("Pending message of %lu bytes too long, max is %lu", cxt->opts->pending, cxt->opts->max_pending);
292 | }
293 | LOGD("depth = %u, pending = %lu", cxt->opts->depth, cxt->opts->pending);
294 | }
295 |
296 |
297 | SEXP alloc_vector(cw_unpack_context *cxt, SEXPTYPE type, unsigned long len) {
298 | add_pending(cxt, len);
299 | return allocVector(type, len);
300 | }
301 |
302 |
303 | SEXP make_sexp_from_context(cw_unpack_context *cxt) {
304 | LOGD("Making sexp from a %s", decode_item_type(cxt->item.type));
305 | cwpack_item_types x = cxt->item.type;
306 | switch (x) {
307 | case CWP_ITEM_NIL:
308 | return ScalarLogical(NA_LOGICAL);
309 |
310 | case CWP_ITEM_BOOLEAN:
311 | return ScalarLogical(cxt->item.as.boolean);
312 |
313 | case CWP_ITEM_ARRAY:
314 | return extract_simplified_vector(cxt);
315 |
316 | case CWP_ITEM_BIN:
317 | return make_raw_from_context(cxt);
318 |
319 | case CWP_ITEM_STR: {
320 | SEXP x = PROTECT(make_charsxp_or_raw_from_context(cxt));
321 | LOGD("x is a %s", type2char(TYPEOF(x)));
322 | if (TYPEOF(x) == CHARSXP) {
323 | UNPROTECT(1);
324 | return ScalarString(x);
325 | } else {
326 | UNPROTECT(1);
327 | return x;
328 | }
329 | }
330 |
331 | case CWP_ITEM_NEGATIVE_INTEGER:
332 | case CWP_ITEM_POSITIVE_INTEGER:
333 | case CWP_ITEM_FLOAT:
334 | case CWP_ITEM_DOUBLE: {
335 | SEXP buf;
336 | PROTECT_INDEX ix;
337 | PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type_to_sexptype(cxt->item.type), 1),
338 | &ix);
339 | buf = fill_vector(cxt, buf, 1, ix, R_NilValue);
340 | UNPROTECT(1);
341 | return buf;
342 | }
343 |
344 | case CWP_ITEM_MAP:
345 | if (TYPEOF(cxt->opts->dict) == ENVSXP) {
346 | return extract_env(cxt);
347 | } else {
348 | return extract_simplified_vector(cxt);
349 | }
350 |
351 | default:
352 | if (x <= CWP_ITEM_MAX_USER_EXT
353 | && x >= CWP_ITEM_MIN_USER_EXT) {
354 | return make_ext_from_context(cxt);
355 | } else if (x <= CWP_ITEM_MAX_RESERVED_EXT
356 | && x >= CWP_ITEM_MIN_RESERVED_EXT) {
357 | return make_ext_from_context(cxt);
358 | } else {
359 | error("Unsupported item: %s", decode_item_type(cxt->item.type));
360 | }
361 | }
362 | }
363 |
364 |
365 | SEXP extract_simplified_vector(cw_unpack_context *cxt) {
366 | int has_names = 0;
367 | switch(cxt->item.type) {
368 | case CWP_ITEM_ARRAY:
369 | has_names = 0; break;
370 | case CWP_ITEM_MAP:
371 | has_names = 1; break;
372 | default:
373 | error("Can't extract vector from a %s", decode_item_type(cxt->item.type));
374 | }
375 |
376 | uint32_t len = cxt->item.as.array.size;
377 | LOGD("Extracting array of %d elements, simplifying", len);
378 | SEXP buf;
379 | PROTECT_INDEX ix;
380 | int protections = 0;
381 |
382 | SEXP names = R_NilValue;
383 | if(has_names) {
384 | LOGD("Allocating names[%d]", len);
385 | PROTECT(names = alloc_vector(cxt, STRSXP, len));
386 | protections++;
387 | }
388 |
389 | /* Peek at the first item, allocate a buffer */
390 | if (len > 0) {
391 | if(has_names) {
392 | cw_unpack_next_or_fail(cxt);
393 | SEXP name = PROTECT(always_make_charsxp_from_context(cxt));
394 | SET_STRING_ELT(names, 0, name);
395 | UNPROTECT(1);
396 | }
397 | cw_unpack_next_or_fail(cxt);
398 | LOGD("first is a %s", decode_item_type(cxt->item.type));
399 |
400 | SEXPTYPE type;
401 | if (cxt->opts->simplify) {
402 | type = type_to_sexptype(cxt->item.type);
403 | } else {
404 | type = VECSXP;
405 | }
406 | PROTECT_WITH_INDEX(buf = alloc_vector(cxt, type, len),
407 | &ix);
408 | protections++;
409 | LOGD("Allocated %s vector of length %d, protection id %d",
410 | type2char(type), len, ix);
411 |
412 | buf = fill_vector(cxt, buf, len, ix, names);
413 |
414 | } else { /* len == 0 */
415 | buf = allocVector(LGLSXP, 0);
416 | }
417 |
418 | UNPROTECT(protections);
419 | return buf;
420 | }
421 |
422 |
423 | SEXP fill_vector(cw_unpack_context *cxt, SEXP buf, uint32_t len,
424 | PROTECT_INDEX ix, SEXP names) {
425 | cxt->opts->depth++;
426 |
427 | /* when called, we have the first item "primed" for us in cxt. Also
428 | buf has been protected. */
429 |
430 | int non_null_seen = 0;
431 | int output_data_frame = ((names != R_NilValue)
432 | && cxt->opts->use_df
433 | && cxt->item.type == CWP_ITEM_ARRAY
434 | && len > 0);
435 | long df_rows = 0;
436 | if (output_data_frame) {
437 | LOGD("Might be a data frame");
438 | df_rows = cxt->item.as.array.size;
439 | }
440 |
441 | for (int i = 0; i < len; i++) { /* loop once per item read */
442 | if (i > 0) { /* skip what we did when we peeked at first item */
443 | if (names != R_NilValue) {
444 | LOGD("Reading name %d", i);
445 | cw_unpack_next_or_fail(cxt);
446 | SEXP c = PROTECT(always_make_charsxp_from_context(cxt));
447 | SET_STRING_ELT(names, i, c);
448 | UNPROTECT(1);
449 | }
450 |
451 | LOGD("Unpacking item %d", i);
452 | cw_unpack_next_or_fail(cxt);
453 |
454 | if (output_data_frame
455 | && (cxt->item.type != CWP_ITEM_ARRAY
456 | || cxt->item.as.array.size != df_rows)) {
457 | LOGD("Not a data frame after all");
458 | output_data_frame = 0;
459 | }
460 | }
461 |
462 | cxt->opts->pending -= (names == R_NilValue ? 1 : 2);
463 |
464 | UNPACK_VALUE: /* goto here to "try again" after coercion */
465 | LOGD("Storing item %d, a %s, into a %s",
466 | i, decode_item_type(cxt->item.type), type2char(TYPEOF(buf)));
467 |
468 | switch (TYPEOF(buf)) {
469 |
470 | case LGLSXP:
471 |
472 | switch(cxt->item.type) {
473 |
474 | case CWP_ITEM_NIL:
475 | LOGICAL(buf)[i] = NA_LOGICAL;
476 | break;
477 |
478 | case CWP_ITEM_BOOLEAN:
479 | non_null_seen = 1;
480 | LOGICAL(buf)[i] = (cxt->item.as.boolean ? TRUE : FALSE);
481 | break;
482 |
483 | default:
484 | if (non_null_seen) {
485 | REPROTECT(buf = coerce(buf, VECSXP), ix);
486 | } else {
487 | non_null_seen = 1;
488 | REPROTECT(buf = coerce(buf, type_to_sexptype(cxt->item.type)), ix);
489 | }
490 | goto UNPACK_VALUE;
491 | }
492 | break;
493 |
494 |
495 | case INTSXP:
496 |
497 | switch(cxt->item.type) {
498 |
499 | case CWP_ITEM_NIL:
500 | INTEGER(buf)[i] = NA_INTEGER;
501 | break;
502 |
503 |
504 | case CWP_ITEM_NEGATIVE_INTEGER:
505 | if (cxt->item.as.i64 == NA_INTEGER
506 | || cxt->item.as.i64 < INT_MIN) {
507 | REPROTECT(buf = coerce(buf, REALSXP), ix);
508 | goto UNPACK_VALUE;
509 | } else {
510 | INTEGER(buf)[i] = cxt->item.as.i64;
511 | }
512 | break;
513 |
514 | case CWP_ITEM_POSITIVE_INTEGER:
515 | if (cxt->item.as.u64 > INT_MAX) {
516 | REPROTECT(buf = coerce(buf, REALSXP), ix);
517 | goto UNPACK_VALUE;
518 | } else {
519 | INTEGER(buf)[i] = cxt->item.as.u64;
520 | }
521 | break;
522 |
523 | case CWP_ITEM_FLOAT:
524 | case CWP_ITEM_DOUBLE:
525 | REPROTECT(buf = coerce(buf, REALSXP), ix);
526 | goto UNPACK_VALUE;
527 |
528 | default:
529 | REPROTECT(buf = coerce(buf, VECSXP), ix);
530 | goto UNPACK_VALUE;
531 | }
532 |
533 | break;
534 |
535 |
536 | case REALSXP:
537 | switch(cxt->item.type) {
538 |
539 | case CWP_ITEM_NIL:
540 | REAL(buf)[i] = NA_REAL;
541 | break;
542 |
543 | case CWP_ITEM_NEGATIVE_INTEGER:
544 | REAL(buf)[i] = i64_to_double(cxt, cxt->item.as.i64);
545 | break;
546 |
547 | case CWP_ITEM_POSITIVE_INTEGER:
548 | REAL(buf)[i] = u64_to_double(cxt, cxt->item.as.u64);
549 | break;
550 |
551 | case CWP_ITEM_FLOAT:
552 | REAL(buf)[i] = cxt->item.as.real;
553 | break;
554 |
555 | case CWP_ITEM_DOUBLE:
556 | REAL(buf)[i] = cxt->item.as.long_real;
557 | break;
558 |
559 | default:
560 | REPROTECT(buf = coerce(buf, VECSXP), ix);
561 | goto UNPACK_VALUE;
562 | }
563 |
564 | break;
565 |
566 |
567 | case STRSXP:
568 | switch(cxt->item.type) {
569 |
570 | case CWP_ITEM_NIL:
571 | SET_STRING_ELT(buf, i, NA_STRING);
572 | break;
573 |
574 | case CWP_ITEM_STR:
575 | {
576 | SEXP s = PROTECT(make_charsxp_or_null(cxt));
577 | if (s == R_NilValue) {
578 | REPROTECT(buf = coerce(buf, VECSXP), ix);
579 | UNPROTECT(1);
580 | goto UNPACK_VALUE;
581 | } else {
582 | SET_STRING_ELT(buf, i, s);
583 | }
584 | UNPROTECT(1);
585 | }
586 | break;
587 |
588 | default:
589 | REPROTECT(buf = coerce(buf, VECSXP), ix);
590 | goto UNPACK_VALUE;
591 | }
592 |
593 | break;
594 |
595 |
596 | case VECSXP:
597 | SET_VECTOR_ELT(buf, i, make_sexp_from_context(cxt));
598 |
599 | break;
600 |
601 |
602 | default:
603 | error("Don't know how to fill out a %s (this shouldn't happen)",
604 | type2char(TYPEOF(buf)));
605 |
606 | } /* switch(TYPEOF(buf)) */
607 |
608 | } /* for(i = ... */
609 |
610 | if (names != R_NilValue) {
611 | setAttrib(buf, R_NamesSymbol, names);
612 | }
613 |
614 | if (output_data_frame) {
615 | LOGD("Making a data frame");
616 | SEXP nrows = PROTECT(ScalarInteger(df_rows));
617 | SEXP call = PROTECT(lang2(install(".set_row_names"), nrows));
618 | SEXP rn = PROTECT(eval(call, R_BaseEnv));
619 | setAttrib(buf, R_RowNamesSymbol, rn);
620 | setAttrib(buf, R_ClassSymbol, ScalarString(mkChar("data.frame")));
621 | UNPROTECT(3);
622 | }
623 |
624 | LOGD("Returning a %s", type2char(TYPEOF(buf)));
625 | cxt->opts->depth--;
626 | return buf;
627 | } /* SEXP fill_vector(... */
628 |
629 |
630 | SEXP coerce(SEXP buf, SEXPTYPE type) {
631 | LOG("Coercing %s to %s", type2char(TYPEOF(buf)), type2char(type));
632 | return coerceVector(buf, type);
633 | }
634 |
635 |
636 | SEXP extract_env(cw_unpack_context *cxt) {
637 | ASSERT(cxt->item.type == CWP_ITEM_MAP);
638 | ASSERT(TYPEOF(cxt->opts->dict) == ENVSXP);
639 | int nkeys = cxt->item.as.map.size;
640 | add_pending(cxt, 2*nkeys);
641 | SEXP env = PROTECT(new_env(cxt->opts->dict));
642 | for (int i = 0; i < nkeys; i++) {
643 | cw_unpack_next_or_fail(cxt);
644 | cxt->opts->pending--;
645 | SEXP key = PROTECT(always_make_charsxp_from_context(cxt));
646 | SEXP value = PROTECT(extract_sexp(cxt));
647 | cxt->opts->pending--;
648 | if (LENGTH(key) == 0) {
649 | WARN_ONCE("Item with empty name was discarded");
650 | } else {
651 | SEXP sym = PROTECT(installChar(key));
652 |
653 | if (sym == R_MissingArg || sym == R_DotsSymbol || DDVAL(sym)) {
654 | WARN_ONCE("Item with key `%s` was discarded", CHAR(PRINTNAME(sym)));
655 | } else {
656 | LOGD("storing variable `%s` into a %s",
657 | CHAR(PRINTNAME(sym)), type2char(TYPEOF(env)));
658 | defineVar(sym, value, env);
659 | }
660 | UNPROTECT(1);
661 | }
662 | UNPROTECT(2);
663 | }
664 | UNPROTECT(1);
665 | return env;
666 | }
667 |
668 |
669 | SEXP new_env(SEXP parent) {
670 | ASSERT(TYPEOF(parent) == ENVSXP);
671 | SEXP call = PROTECT(lang3(install("new.env"), ScalarLogical(TRUE), parent));
672 | SEXP x = eval(call, R_BaseEnv);
673 | UNPROTECT(1);
674 | return x;
675 | }
676 |
677 |
678 | SEXP make_charsxp_or_raw_from_context(cw_unpack_context *cxt) {
679 | ASSERT(cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN);
680 | SEXP c = PROTECT(make_charsxp_or_null(cxt));
681 | if (c == R_NilValue) {
682 | UNPROTECT(1);
683 | return make_raw_from_context(cxt);
684 | } else {
685 | UNPROTECT(1);
686 | return c;
687 | }
688 | }
689 |
690 |
691 | SEXP make_raw_from_context(cw_unpack_context *cxt) {
692 | uint32_t len = cxt->item.as.str.length;
693 | SEXP out = PROTECT(allocVector(RAWSXP, len));
694 | memcpy(RAW(out), cxt->item.as.str.start, len * sizeof(uint8_t));
695 | UNPROTECT(1);
696 | return out;
697 | }
698 |
699 |
700 | SEXP make_ext_from_context(cw_unpack_context *cxt) {
701 | char typename[32];
702 | WARN_ONCE("Extension type %d decoded as raw", cxt->item.type);
703 | snprintf(typename, sizeof(typename)/sizeof(char), "ext%d", cxt->item.type);
704 | SEXP out = PROTECT(make_raw_from_context(cxt));
705 | setAttrib(out, R_ClassSymbol, ScalarString(mkChar(typename)));
706 | UNPROTECT(1);
707 | return out;
708 | }
709 |
710 |
711 | SEXP make_charsxp_or_null(cw_unpack_context *cxt) {
712 | if (cxt->item.type == CWP_ITEM_STR || cxt->item.type == CWP_ITEM_BIN) {
713 | const char *buf = cxt->item.as.str.start;
714 | int len = cxt->item.as.str.length;
715 | int non_ascii = 0;
716 |
717 | for (int i = 0; i < len; i++) {
718 | if (buf[i] == 0) {
719 | WARN_ONCE("Embedded null in string, returning raw instead.");
720 | return R_NilValue;
721 | }
722 | if (buf[i] & 0x80) {
723 | non_ascii = 1;
724 | }
725 | }
726 |
727 | if (non_ascii) {
728 | /* maybe overly paranoid but I didn't see mkCharLenCE doing such
729 | verification */
730 | if (!verify_utf8(buf, len)) {
731 | WARN_ONCE("String is not valid UTF-8, returning raw instead.");
732 | return R_NilValue;
733 | }
734 | }
735 |
736 | return mkCharLenCE(buf, len, CE_UTF8);
737 | } else if (cxt->item.type == CWP_ITEM_NIL) {
738 | return NA_STRING;
739 | } else {
740 | return R_NilValue;
741 | }
742 | }
743 |
744 |
745 | SEXP always_make_charsxp_from_context(cw_unpack_context *cxt) {
746 | SEXP x = PROTECT(make_charsxp_or_null(cxt));
747 | if (x == R_NilValue) {
748 | WARN_ONCE("Non-string used as key was coerced to string");
749 | SEXP item = PROTECT(make_sexp_from_context(cxt));
750 | SEXP call = PROTECT(lang2(install("repr"), item));
751 | SEXP convert = PROTECT(eval(call, cxt->opts->package));
752 | LOG("repr returned a %s", type2char(TYPEOF(convert)));
753 | SEXP chr = STRING_ELT(convert, 0);
754 | UNPROTECT(4);
755 | return chr;
756 | } else {
757 | UNPROTECT(1);
758 | return x;
759 | }
760 | }
761 |
762 |
763 | double i64_to_double(cw_unpack_context *cxt, int64_t x) {
764 | double xx = x;
765 | if ((int64_t)xx != x) {
766 | WARN_ONCE("Cast of integer %" PRId64 " to double loses precision", x);
767 | };
768 | return xx;
769 | }
770 |
771 |
772 | double u64_to_double(cw_unpack_context *cxt, uint64_t x) {
773 | double xx = x;
774 | if ((uint64_t)xx != x) {
775 | WARN_ONCE("Cast of integer %" PRIu64 " to double loses precision", x);
776 | }
777 | return xx;
778 | }
779 |
780 |
781 | SEXPTYPE type_to_sexptype(int t) {
782 | switch(t) {
783 | case CWP_ITEM_NIL:
784 | case CWP_ITEM_BOOLEAN: return LGLSXP;
785 | case CWP_ITEM_POSITIVE_INTEGER:
786 | case CWP_ITEM_NEGATIVE_INTEGER: return INTSXP;
787 | case CWP_ITEM_FLOAT:
788 | case CWP_ITEM_DOUBLE: return REALSXP;
789 | case CWP_ITEM_STR: return STRSXP;
790 | default:
791 | {
792 | if (t <= CWP_ITEM_MAX_USER_EXT
793 | && t >= CWP_ITEM_MIN_USER_EXT) {
794 | return RAWSXP;
795 | } else if (t <= CWP_ITEM_MAX_RESERVED_EXT
796 | && t >= CWP_ITEM_MIN_RESERVED_EXT) {
797 | return RAWSXP;
798 | } else {
799 | return VECSXP;
800 | }
801 | }
802 | }
803 | }
804 |
805 |
806 | const char *decode_item_type(cwpack_item_types x) {
807 | switch(x) {
808 | case CWP_ITEM_NIL: return "nil";
809 | case CWP_ITEM_BOOLEAN: return "boolean";
810 | case CWP_ITEM_POSITIVE_INTEGER: return "positive integer";
811 | case CWP_ITEM_NEGATIVE_INTEGER: return "negative integer";
812 | case CWP_ITEM_FLOAT: return "float";
813 | case CWP_ITEM_DOUBLE: return "double";
814 | case CWP_ITEM_STR: return "string";
815 | case CWP_ITEM_BIN: return "binary";
816 | case CWP_ITEM_ARRAY: return "array";
817 | case CWP_ITEM_MAP: return "map";
818 | case CWP_NOT_AN_ITEM: return "not an item";
819 | default:
820 | if (x <= CWP_ITEM_MAX_USER_EXT
821 | && x >= CWP_ITEM_MIN_USER_EXT) {
822 | return "CWP_ITEM_USER_EXT(n)";
823 | } else if (x <= CWP_ITEM_MAX_RESERVED_EXT
824 | && x >= CWP_ITEM_MIN_RESERVED_EXT) {
825 | return "CWP_ITEM_RESERVED_EXT(n)";
826 | } else {
827 | return "???";
828 | }
829 | }
830 | }
831 |
--------------------------------------------------------------------------------
/src/decode.h:
--------------------------------------------------------------------------------
1 | #ifndef DECODE_H
2 | #define DECODE_H
3 |
4 | #include "vadr.h"
5 | #include "cwpack.h"
6 |
7 | SEXP extract_sexp(cw_unpack_context *);
8 | int init_unpack_context(cw_unpack_context *, SEXP, SEXP, unsigned long, unsigned long);
9 |
10 | SEXP _unpack_msg(SEXP, SEXP);
11 | SEXP _unpack_msg_partial(SEXP, SEXP, SEXP, SEXP);
12 | SEXP _unpack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
13 |
14 | #endif
15 |
--------------------------------------------------------------------------------
/src/encode.c:
--------------------------------------------------------------------------------
1 | #include "cwpack.h"
2 | #include "vadr.h"
3 |
4 | int handle_overflow(cw_pack_context *, unsigned long);
5 | SEXP copy_to_new_raw(SEXP, unsigned long, unsigned long);
6 |
7 | void pack_sexp(cw_pack_context *, SEXP);
8 |
9 | void pack_singleton(cw_pack_context *, SEXP);
10 | void pack_vector(cw_pack_context*, SEXP);
11 | void pack_named_vector(cw_pack_context*, SEXP, SEXP);
12 |
13 | void pack_logical(cw_pack_context *, int);
14 | void pack_integer(cw_pack_context *, int);
15 | void pack_real(cw_pack_context *, double);
16 | void pack_string(cw_pack_context *, SEXP);
17 | void pack_raw(cw_pack_context *, SEXP);
18 |
19 | static long calls = 0;
20 |
21 | SEXP _pack_opts(SEXP compatible,
22 | SEXP as_is,
23 | SEXP use_dict,
24 | SEXP max_size,
25 | SEXP buf_size,
26 | SEXP package) {
27 | SEXP optsxp = PROTECT(allocVector(RAWSXP, sizeof(pack_opts)));
28 | pack_opts *opts = (pack_opts*)RAW(optsxp);
29 | opts->buf = R_NilValue;
30 | opts->buf_index = -1;
31 | opts->as_is = asLogical(as_is);
32 | opts->use_dict = asLogical(use_dict);
33 | opts->buf_size = asInteger(buf_size);
34 | double max_sized = asInteger(max_size);
35 | if (isinf(max_sized)) {
36 | opts->max_size = LONG_MAX;
37 | } else {
38 | opts->max_size = max_sized;
39 | }
40 | opts->compatible = asLogical(compatible);
41 |
42 | opts->package = package;
43 | /* also hang onto explicit refs for the SEXP values*/
44 | SEXP pkg = PROTECT(CONS(package, R_NilValue));
45 | setAttrib(optsxp, install("refs"), pkg);
46 | UNPROTECT(2);
47 | return optsxp;
48 | }
49 |
50 |
51 | int init_pack_context(cw_pack_context *cxt, SEXP opts) {
52 | ASSERT(TYPEOF(opts) == RAWSXP && LENGTH(opts) == sizeof(pack_opts));
53 | cxt->opts = (pack_opts *) RAW(opts);
54 |
55 | /* allocate a buffer and protect it. This is unprotected by caller */
56 | PROTECT_WITH_INDEX(cxt->opts->buf = allocVector(RAWSXP, cxt->opts->buf_size),
57 | &cxt->opts->buf_index);
58 | cw_pack_context_init(cxt,
59 | RAW(cxt->opts->buf),
60 | LENGTH(cxt->opts->buf),
61 | &handle_overflow);
62 | cw_pack_set_compatibility(cxt, cxt->opts->compatible);
63 | return 1; /* return protections */
64 | }
65 |
66 |
67 | SEXP _pack_msg(SEXP input, SEXP opts) {
68 | calls++;
69 | cw_pack_context cxt;
70 | int protections = init_pack_context(&cxt, opts);
71 |
72 | pack_sexp(&cxt, input);
73 |
74 | if (cxt.return_code != CWP_RC_OK) {
75 | error("%s", decode_return_code(cxt.return_code));
76 | }
77 | unsigned long newlength = (cxt.current - cxt.start) / sizeof(Rbyte);
78 | SETLENGTH(cxt.opts->buf, newlength);
79 | UNPROTECT(protections);
80 | return cxt.opts->buf;
81 | }
82 |
83 |
84 | int handle_overflow(cw_pack_context *cxt, unsigned long more) {
85 | /* allocate a vector twice as big, copy data into the new vector,
86 | and update context. */
87 | unsigned long newlen = LENGTH(cxt->opts->buf);
88 | unsigned long req = LENGTH(cxt->opts->buf) + more;
89 |
90 | if (req > cxt->opts->max_size) return CWP_RC_BUFFER_OVERFLOW;
91 |
92 | while (newlen < req) newlen *= 2;
93 | if (newlen > cxt->opts->max_size) {
94 | newlen = cxt->opts->max_size;
95 | }
96 |
97 | LOG("%d / %d bytes used, need %d more, resizing to %d\n",
98 | cxt->current - cxt->start,
99 | LENGTH(cxt->opts->buf),
100 | more,
101 | newlen);
102 |
103 | REPROTECT(cxt->opts->buf =
104 | copy_to_new_raw(cxt->opts->buf,
105 | newlen,
106 | LENGTH(cxt->opts->buf)),
107 | cxt->opts->buf_index);
108 |
109 | // update the context structure to point to the new buf
110 | cxt->current = cxt->current - cxt->start + RAW(cxt->opts->buf);
111 | cxt->end = RAW(cxt->opts->buf) + LENGTH(cxt->opts->buf) * sizeof(Rbyte);
112 | cxt->start = RAW(cxt->opts->buf);
113 | return 0;
114 | }
115 |
116 |
117 | SEXP copy_to_new_raw(SEXP from, unsigned long new_len, unsigned long copy_len) {
118 | assert_type(from, RAWSXP);
119 | SEXP to = PROTECT(allocVector(RAWSXP, new_len));
120 | memcpy(RAW(to), RAW(from), MIN(new_len, copy_len) * sizeof(Rbyte));
121 | UNPROTECT(1);
122 | return to;
123 | }
124 |
125 |
126 | int containsString(SEXP cl, const char *ch) {
127 | assert_type(cl, STRSXP);
128 | SEXP cmp = PROTECT(mkChar(ch));
129 | int found = 0;
130 | for (int i = 0; i < LENGTH(cl); i++) {
131 | if (STRING_ELT(cl, i) == cmp) {
132 | found = 1; break;
133 | }
134 | }
135 | UNPROTECT(1);
136 | return found;
137 | }
138 |
139 |
140 | void pack_sexp(cw_pack_context* cxt, SEXP dat) {
141 | int as_is_sto = cxt->opts->as_is;
142 | int unp = 0;
143 |
144 | /* check for asIs, classes */
145 | SEXP cl = getAttrib(dat, R_ClassSymbol);
146 | if ((cl) != R_NilValue) {
147 | if (containsString(cl, "AsIs")) {
148 | cxt->opts->as_is = TRUE;
149 | } else {
150 | /* Preprocess (unless in the middle of an AsIs) */
151 | LOG("Preprocessing a %s of class '%s'\n",
152 | type2char(TYPEOF(cxt->opts->package)),
153 | CHAR(STRING_ELT(cl, 0)));
154 | SEXP call = PROTECT(lang2(install("prepack"), dat));
155 | dat = PROTECT(eval(call, cxt->opts->package));
156 | unp += 2;
157 | /* Check if the preprocessor gave us an AsIs, but don't
158 | preprocess again */
159 | cl = getAttrib(dat, R_ClassSymbol);
160 | if (cl != R_NilValue && containsString(cl, "AsIs")) {
161 | cxt->opts->as_is = TRUE;
162 | LOG("Preprocessor returned an AsIs!");
163 | }
164 | }
165 | }
166 |
167 | if (isVector(dat)) {
168 | SEXP names = getAttrib(dat, R_NamesSymbol);
169 | if (names != R_NilValue && cxt->opts->use_dict) {
170 | PROTECT(names); unp += 1;
171 | pack_named_vector(cxt, dat, names);
172 | } else if (LENGTH(dat) == 1 && !cxt->opts->as_is) {
173 | pack_singleton(cxt, dat);
174 | } else {
175 | pack_vector(cxt, dat);
176 | }
177 | } else {
178 | switch (TYPEOF(dat)) {
179 |
180 | case NILSXP:
181 | cw_pack_nil(cxt); break;
182 |
183 | case ENVSXP:
184 | {
185 | SEXP args = PROTECT(lang4(install("as.list.environment"),
186 | dat,
187 | ScalarLogical(1),
188 | ScalarLogical(1)));
189 | SEXP list = PROTECT(eval(args, R_BaseEnv));
190 | unp += 2;
191 | SEXP names = getAttrib(list, R_NamesSymbol);
192 | pack_named_vector(cxt, list, names);
193 | break;
194 | }
195 |
196 | default:
197 | cxt->opts->buf = NULL;
198 | error("can't pack a %s", type2char(TYPEOF(dat)));
199 | }
200 | }
201 | UNPROTECT(unp);
202 | cxt->opts->as_is = as_is_sto;
203 | }
204 |
205 |
206 | void pack_singleton(cw_pack_context *cxt, SEXP dat) {
207 | switch (TYPEOF(dat)) {
208 | case LGLSXP:
209 | pack_logical(cxt, LOGICAL(dat)[0]);
210 | break;
211 |
212 | case INTSXP:
213 | pack_integer(cxt, INTEGER(dat)[0]);
214 | break;
215 |
216 | case REALSXP:
217 | pack_real(cxt, REAL(dat)[0]);
218 | break;
219 |
220 | case STRSXP:
221 | pack_string(cxt, STRING_ELT(dat, 0));
222 | break;
223 |
224 | case RAWSXP:
225 | pack_raw(cxt, dat);
226 | break;
227 |
228 | case VECSXP:
229 | pack_vector(cxt, dat);
230 | break;
231 |
232 | default:
233 | cxt->opts->buf = NULL;
234 | error("can't pack a singleton %s", type2char(TYPEOF(dat)));
235 | }
236 | }
237 |
238 | // generic for-loop macro
239 | #define PACK_VECTOR(CXT, X, ACCESSOR, STORE) { \
240 | int len = LENGTH(X); \
241 | cw_pack_array_size(cxt, len); \
242 | for (int i = 0; i < len; i++) { \
243 | STORE(CXT, ACCESSOR(X, i)); \
244 | } \
245 | }
246 |
247 | void pack_vector(cw_pack_context *cxt, SEXP x) {
248 | ASSERT(isVector(x));
249 |
250 | switch(TYPEOF(x)) {
251 |
252 | case RAWSXP:
253 | pack_raw(cxt, x);
254 | break;
255 |
256 | case LGLSXP:
257 | PACK_VECTOR(cxt, x, LOGICAL_ELT, pack_logical);
258 | break;
259 |
260 | case INTSXP:
261 | PACK_VECTOR(cxt, x, INTEGER_ELT, pack_integer);
262 | break;
263 |
264 | case REALSXP:
265 | PACK_VECTOR(cxt, x, REAL_ELT, pack_real);
266 | break;
267 |
268 | case VECSXP:
269 | PACK_VECTOR(cxt, x, VECTOR_ELT, pack_sexp);
270 | break;
271 |
272 | case STRSXP:
273 | PACK_VECTOR(cxt, x, STRING_ELT, pack_string);
274 | break;
275 |
276 | default:
277 | cxt->opts->buf = NULL;
278 | error("Don't know how to pack a %s vector", type2char(TYPEOF(x)));
279 | }
280 | }
281 |
282 | // generic for-loop macro
283 | #define PACK_NAMED_VECTOR(CXT, X, NAMES, ACCESSOR, STORE) { \
284 | int len = LENGTH(X); \
285 | cw_pack_map_size(cxt, len); \
286 | for (int i = 0; i < len; i++) { \
287 | pack_string(CXT, STRING_ELT(NAMES, i)); \
288 | STORE(CXT, ACCESSOR(X, i)); \
289 | } \
290 | }
291 |
292 | void pack_named_vector(cw_pack_context *cxt, SEXP x, SEXP names) {
293 | ASSERT(isVector(x));
294 |
295 | switch(TYPEOF(x)) {
296 |
297 | case RAWSXP:
298 | WARN_ONCE("Names discarded from raw object");
299 | pack_raw(cxt, x);
300 | break;
301 |
302 | case LGLSXP:
303 | PACK_NAMED_VECTOR(cxt, x, names, LOGICAL_ELT, pack_logical);
304 | break;
305 |
306 | case INTSXP:
307 | PACK_NAMED_VECTOR(cxt, x, names, INTEGER_ELT, pack_integer);
308 | break;
309 |
310 | case REALSXP:
311 | PACK_NAMED_VECTOR(cxt, x, names, REAL_ELT, pack_real);
312 | break;
313 |
314 | case VECSXP:
315 | PACK_NAMED_VECTOR(cxt, x, names, VECTOR_ELT, pack_sexp);
316 | break;
317 |
318 | case STRSXP:
319 | PACK_NAMED_VECTOR(cxt, x, names, STRING_ELT, pack_string);
320 | break;
321 |
322 | default:
323 | cxt->opts->buf = NULL;
324 | error("Don't know how to pack a %s vector", type2char(TYPEOF(x)));
325 | }
326 | }
327 |
328 |
329 | void pack_logical(cw_pack_context *cxt, int x) {
330 | if (x == NA_LOGICAL) {
331 | cw_pack_nil(cxt);
332 | } else if (x) {
333 | cw_pack_true(cxt);
334 | } else {
335 | cw_pack_false(cxt);
336 | }
337 | }
338 |
339 | void pack_integer(cw_pack_context *cxt, int x) {
340 | if (x == NA_INTEGER) {
341 | cw_pack_nil(cxt);
342 | } else {
343 | cw_pack_signed(cxt, x);
344 | }
345 | }
346 |
347 | void pack_real(cw_pack_context *cxt, double x) {
348 | /* save bytes by packing integer if possible. */
349 | if (ISNA(x)) {
350 | cw_pack_nil(cxt);
351 | } else if (ceil(x) == x) {
352 | if (x >= 0 && x <= UINT32_MAX) {
353 | cw_pack_unsigned(cxt, x);
354 | } else if (x < 0 && x >= INT32_MIN) {
355 | cw_pack_signed(cxt, x);
356 | } else {
357 | /* NaN's end up here. */
358 | cw_pack_real(cxt, x);
359 | }
360 | } else {
361 | cw_pack_real(cxt, x);
362 | }
363 | }
364 |
365 | void pack_string(cw_pack_context *cxt, SEXP x) {
366 | assert_type(x, CHARSXP);
367 | if (x == NA_STRING) {
368 | cw_pack_nil(cxt);
369 | } else {
370 | if (getCharCE(x) != CE_UTF8) {
371 | /* reEnc allocates temp memory */
372 | void *vmax = vmaxget();
373 | const char *newbuf = reEnc(CHAR(x), getCharCE(x), CE_UTF8, 1);
374 | x = PROTECT(mkCharCE(newbuf, CE_UTF8));
375 | int len = R_nchar(x, Bytes, 0, 0, "");
376 | cw_pack_str(cxt, CHAR(x), len);
377 | UNPROTECT(1);
378 | /* free temp memory */
379 | vmaxset(vmax);
380 | } else {
381 | int len = R_nchar(x, Bytes, 0, 0, "");
382 | cw_pack_str(cxt, CHAR(x), len);
383 | }
384 | }
385 | }
386 |
387 | void pack_raw(cw_pack_context *cxt, SEXP x) {
388 | assert_type(x, RAWSXP);
389 | cw_pack_bin(cxt, RAW(x), LENGTH(x));
390 | }
391 |
--------------------------------------------------------------------------------
/src/encode.h:
--------------------------------------------------------------------------------
1 | #ifndef ENCODE_H
2 | #define ENCODE_H
3 |
4 | #include "vadr.h"
5 | #include "cwpack.h"
6 |
7 | SEXP _pack_opts(SEXP, SEXP, SEXP, SEXP, SEXP, SEXP);
8 | SEXP _pack_msg(SEXP, SEXP);
9 |
10 | void pack_sexp(cw_pack_context *, SEXP);
11 | int init_pack_context(cw_pack_context *, SEXP);
12 |
13 | #endif
14 |
15 |
--------------------------------------------------------------------------------
/src/init.c:
--------------------------------------------------------------------------------
1 | #include
2 | #include
3 | #include // for NULL
4 | #include
5 | #include "encode.h"
6 | #include "decode.h"
7 |
8 | /* FIXME:
9 | Check these declarations against the C/Fortran source code.
10 | */
11 |
12 | static const R_CallMethodDef CallEntries[] = {
13 | {"_pack_msg", (DL_FUNC) &_pack_msg, 2},
14 | {"_pack_opts", (DL_FUNC) &_pack_opts, 6},
15 | {"_unpack_msg", (DL_FUNC) &_unpack_msg, 2},
16 | {"_unpack_msg_partial", (DL_FUNC) &_unpack_msg_partial, 4},
17 | {"_unpack_opts", (DL_FUNC) &_unpack_opts, 7},
18 | {NULL, NULL, 0}
19 | };
20 |
21 | void R_init_msgpack(DllInfo *dll)
22 | {
23 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
24 | R_useDynamicSymbols(dll, FALSE);
25 | }
26 |
--------------------------------------------------------------------------------
/src/opts.h:
--------------------------------------------------------------------------------
1 | #ifndef OPTS_H
2 | #define OPTS_H
3 |
4 | #include
5 | #include
6 |
7 | /* To hand options down from R code into the callbacks, I attach this
8 | options structure to the CWpack state structures.
9 |
10 | These structures exist at R level as the contents of some RAWSXP.
11 | This seems safe enough. But part of these structures are also
12 | pointers to SEXPs. I cover for this by also holding the SEXP
13 | pointers as an attrubute. But this feels tacky.
14 | */
15 |
16 | typedef struct unpack_opts {
17 | SEXP dict;
18 | int use_df;
19 | int simplify;
20 | SEXP package;
21 | unsigned int max_depth;
22 | unsigned long max_pending;
23 |
24 | /* state variables used in unpack */
25 | SEXP buf;
26 | PROTECT_INDEX buf_index;
27 | unsigned int depth;
28 | unsigned long pending;
29 | SEXP underflow_handler;
30 |
31 | } unpack_opts;
32 |
33 |
34 | typedef struct pack_opts {
35 | int as_is;
36 | int compatible;
37 | int use_dict;
38 | long max_size;
39 | long buf_size;
40 |
41 | /* state used by pack / write */
42 | SEXP buf;
43 | PROTECT_INDEX buf_index;
44 | SEXP package;
45 |
46 | /* state used by write */
47 | SEXP conn;
48 |
49 | } pack_opts;
50 |
51 | #endif
52 |
--------------------------------------------------------------------------------
/src/utf8.c:
--------------------------------------------------------------------------------
1 | #include "utf8.h"
2 | #include "vadr.h"
3 |
4 | /* Copyright (c) 2008-2009 Bjoern Hoehrmann
5 | See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details. */
6 |
7 | #define UTF8_ACCEPT 0
8 | #define UTF8_REJECT 1
9 |
10 | static const uint8_t utf8d[] = {
11 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
12 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
13 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
14 | 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
15 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
16 | 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
17 | 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
18 | 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
19 | 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
20 | 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
21 | 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
22 | 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
23 | 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
24 | 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
25 | };
26 |
27 | uint32_t
28 | decode_utf8(uint32_t* state, uint32_t* codep, uint32_t byte) {
29 | uint32_t type = utf8d[byte];
30 |
31 | *codep = (*state != UTF8_ACCEPT) ?
32 | (byte & 0x3fu) | (*codep << 6) :
33 | (0xff >> type) & (byte);
34 |
35 | *state = utf8d[256 + *state*16 + type];
36 | return *state;
37 | }
38 |
39 | /* LICENSE for above code: */
40 |
41 | /* Permission is hereby granted, free of charge, to any person
42 | obtaining a copy of this software and associated documentation
43 | files (the "Software"), to deal in the Software without
44 | restriction, including without limitation the rights to use, copy,
45 | modify, merge, publish, distribute, sublicense, and/or sell copies
46 | of the Software, and to permit persons to whom the Software is
47 | furnished to do so, subject to the following conditions: */
48 |
49 | /* The above copyright notice and this permission notice shall be
50 | included in all copies or substantial portions of the Software. */
51 |
52 | /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
53 | EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
54 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
55 | NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
56 | BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
57 | ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
58 | CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
59 | SOFTWARE. */
60 |
61 |
62 | int verify_utf8(const char *x, uint32_t len) {
63 | uint32_t codepoint = 0;
64 | uint32_t state = 0;
65 |
66 | for (uint32_t i = 0; i < len; i++) {
67 | decode_utf8(&state, &codepoint, (unsigned char) x[i]);
68 | }
69 |
70 | return state == UTF8_ACCEPT;
71 | }
72 |
--------------------------------------------------------------------------------
/src/utf8.h:
--------------------------------------------------------------------------------
1 | #ifndef UTF8_H
2 | #define UTF8_H
3 |
4 | #include
5 |
6 | int verify_utf8(const char *, uint32_t len);
7 |
8 | #endif
9 |
--------------------------------------------------------------------------------
/src/vadr.c:
--------------------------------------------------------------------------------
1 | #include "vadr.h"
2 | #include "cwpack.h"
3 | #include
4 |
5 | void assert_type5(SEXP x, SEXPTYPE type, const char *where,
6 | const char *file, int line) {
7 | if (TYPEOF(x) != type) {
8 | error("Expected %s in %s, got %s (%s:%d)",
9 | type2char(type), where, type2char(TYPEOF(x)), file, line);
10 | }
11 | }
12 |
13 | const char *decode_return_code(int x) {
14 | switch(x) {
15 | case CWP_RC_OK: return "ok";
16 | case CWP_RC_END_OF_INPUT: return "end of input";
17 | case CWP_RC_BUFFER_OVERFLOW: return "buffer overflow";
18 | case CWP_RC_BUFFER_UNDERFLOW: return "buffer underflow";
19 | case CWP_RC_MALFORMED_INPUT: return "malformed input";
20 | case CWP_RC_WRONG_BYTE_ORDER: return "wrong byte order";
21 | case CWP_RC_ERROR_IN_HANDLER: return "error in handler";
22 | case CWP_RC_ILLEGAL_CALL: return "illegal call";
23 | case CWP_RC_MALLOC_ERROR: return "malloc error";
24 | case CWP_RC_STOPPED: return "stopped";
25 | default: return "unknown error";
26 | }
27 | }
28 |
29 |
--------------------------------------------------------------------------------
/src/vadr.h:
--------------------------------------------------------------------------------
1 | #ifndef _VADR_H
2 | #define _VADR_H
3 |
4 | #include
5 | #include
6 |
7 | #define MIN(x,y) ((x) < (y) ? (x) : (y))
8 | #define MAX(x,y) ((x) > (y) ? (x) : (y))
9 | #define ASSERT(X) {if (! (X)) { error("%s: expected that (%s) @%s:%d\n", __func__, #X, __FILE__, __LINE__); }}
10 |
11 | #define LOGICAL_ELT(O, I) (LOGICAL(O)[I])
12 | #define INTEGER_ELT(O, I) (INTEGER(O)[I])
13 | #define REAL_ELT(O, I) (REAL(O)[I])
14 |
15 | #define WARN_ONCE(...) { \
16 | static long last_warned = 0; \
17 | if (last_warned < calls) { \
18 | last_warned = calls; \
19 | warning(__VA_ARGS__); \
20 | } \
21 | }
22 |
23 | #undef DEBUG
24 |
25 | #ifdef DEBUG
26 | #define LOG(FMT, ...) Rprintf("%s: " FMT " @%s:%d\n", \
27 | __func__, ##__VA_ARGS__, __FILE__, __LINE__)
28 | #else
29 | #define LOG(...) NULL
30 | #endif
31 |
32 | #define assert_type(x, type) assert_type3(x, type, __func__)
33 | #define assert_type3(x, type, where) assert_type5(x, type, where, __FILE__, __LINE__)
34 | void assert_type5(SEXP x, SEXPTYPE type, const char *where, const char *file, int line);
35 | const char *decode_return_code(int);
36 |
37 | #endif
38 |
39 |
--------------------------------------------------------------------------------
/tests/testthat.R:
--------------------------------------------------------------------------------
1 | library(testthat)
2 | library(msgpack)
3 |
4 | test_check("msgpack")
5 |
--------------------------------------------------------------------------------
/tests/testthat/test-static.R:
--------------------------------------------------------------------------------
1 | context("msgpackr static API")
2 |
3 | `%is%` <- expect_equal
4 |
5 |
6 | roundtrip <- function(start) {
7 | bin <- packMsg(start)
8 | end <- unpackMsg(bin)
9 | expect_equal(start, end)
10 | bin
11 | }
12 |
13 |
14 | pack_rt <- function (start, cmp) {
15 | bin <- packMsg(start)
16 | expect_equal(bin, cmp)
17 | end <- unpackMsg(bin)
18 | expect_equivalent(start, end)
19 | }
20 |
21 | test_that("pack singletons", {
22 | #null
23 | pack_rt(NA, as.raw(0xc0))
24 | packMsg(NULL) %is% as.raw(0xc0)
25 |
26 | #logical
27 | pack_rt(FALSE, as.raw(0xc2))
28 | pack_rt(TRUE, as.raw(0xc3))
29 |
30 | #small ints
31 | pack_rt(12L, as.raw(0x0c))
32 | pack_rt(-4L, as.raw(0xfc))
33 |
34 | #32 bit ints
35 | pack_rt(2147483647L, as.raw(c(0xCE, 0x7f, 0xff, 0xff, 0xff)))
36 |
37 | # cwpack will use 32 bit float if precision is preserved. For example, Inf:
38 | pack_rt(Inf, as.raw(c(0xca, 0x7f, 0x80, 0x00, 0x00)))
39 |
40 | # and a float64:
41 | x <- 1.7976931348623157e308 # .Machine$double.xmax
42 | # 0 11111111110 1111111111111111111111111111111111111111111111111111
43 | pack_rt(x, as.raw(c(0xCB, 0x7F, 0xEF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF)))
44 |
45 | # character
46 | pack_rt("hello",
47 | as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f)))
48 |
49 | # raw bytes
50 | pack_rt(as.raw(0xab),
51 | as.raw(c(0xc4, 0x01, 0xab)))
52 |
53 | #NAs and NULL all collapse to nil
54 | packMsg(NA_character_) %is% as.raw(0xc0)
55 | packMsg(NA_real_) %is% as.raw(0xc0)
56 | packMsg(NA_integer_) %is% as.raw(0xc0)
57 | packMsg(NULL) %is% as.raw(0xc0)
58 | })
59 |
60 | test_that("unpack large ints to float", {
61 | bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01))
62 | negint = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x7f, 0xff, 0xff, 0xff))
63 | not_na = as.raw(c(0xd3, 0xff, 0xff, 0xff, 0xff, 0x80, 0x00, 0x00, 0x00))
64 | bigfloat = 9007199254740993 #read as float loses precision
65 | negfloat = -2147483649
66 | not_na_float = -2147483648
67 | expect_warning(unpackMsg(bigint) %is% bigfloat, "precision")
68 | unpackMsg(negint) %is% negfloat
69 | unpackMsg(not_na) %is% not_na_float
70 | #
71 | # and all in a vector...
72 | expect_warning(
73 | unpackMsg(c(as.raw(0x93), bigint, negint, not_na)) %is%
74 | c(bigfloat, negfloat, not_na_float),
75 | "precision")
76 | })
77 |
78 | test_that("nice errors from unpack", {
79 | expect_error(unpackMsg(as.raw(c(0x92, 0xc0))),
80 | "end of input")
81 | })
82 |
83 |
84 | test_that("Pack raws", {
85 | roundtrip(as.raw(c(0xab, 0xbc, 0x00)))
86 | })
87 |
88 |
89 | test_that("Pack lists", {
90 | roundtrip(list(1, "what"))
91 | roundtrip(list("a", list("b", 4)))
92 | })
93 |
94 |
95 | test_that("unpack simplified vectors", {
96 | roundtrip(c(FALSE, NA)) #bool
97 | roundtrip(list(FALSE, 3L)) #list; don't coerce logicals (that aren't all NA)
98 | roundtrip(c(1L, NA)) #integer
99 | roundtrip(c(1L, NA, 1.0)) #real
100 | roundtrip(c("hello", NA)) #string
101 | roundtrip(list(1L, 2L, "hi")) #list, don't coerce to char
102 | roundtrip(list(c(1,2), c("hi", "bye"))) #list
103 | })
104 |
105 |
106 | test_that("unpack simplified vectors starting with NA", {
107 | roundtrip(c(NA, FALSE, TRUE))
108 | roundtrip(c(NA, 1L, 2L))
109 | roundtrip(c(NA, exp(0), pi))
110 | roundtrip(c(NA, "hi", "bye"))
111 | })
112 |
113 |
114 | test_that("pack zero length vectors", {
115 | roundtrip(logical(0))
116 | })
117 |
118 |
119 | test_that("packing overflow handler works", {
120 | expect_true(length(packMsg(1:10000)) > 1000)
121 | })
122 |
123 | test_that("extension mechanism", {
124 | obj <- c(1, 2, 3)
125 | class(obj) <- c("reverse")
126 | assign(envir = globalenv(), "prepack.reverse", function(x) rev(x))
127 | unpackMsg(packMsg(obj)) %is% c(3, 2, 1)
128 | })
129 |
130 |
131 | test_that("recursive use of msgpack works", {
132 | assign(envir = globalenv(), "prepack.blob", function(x) packMsg(unclass(x)))
133 | obj <- "hello"
134 | class(obj) <- "blob"
135 | typeof(unpackMsg(packMsg(obj))) %is% "raw"
136 | })
137 |
138 |
139 | test_that("Max buffer size", {
140 | packMsg(300:400, max_size=306, buf_size=10)
141 | expect_error(packMsg(300:401, max_size=306, buf_size = 10), "overflow")
142 | })
143 |
144 |
145 | test_that("NA and NaN are distinct doubles,", {
146 | roundtrip(c(NA, NaN))
147 | })
148 |
149 |
150 | test_that("compatibility mode", {
151 | packMsg(as.raw(c(1, 2, 3))) %is% as.raw(c(0xc4, 0x03, 0x01, 0x02, 0x03))
152 | packMsg(as.raw(c(1, 2, 3)), compatible=TRUE) %is% as.raw(c(0xa3, 0x01, 0x02, 0x03))
153 | })
154 |
155 |
156 | test_that("UnpackMsg: detect bad strings, warn, and return raw", {
157 |
158 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x00, 0x62, 0x63))),
159 | as.raw(c(0x00, 0x62, 0x63))),
160 | "nul")
161 |
162 | #and check for malformed UTF8
163 | #3 byte sequence with last continuation byte missing
164 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa2, 0x30, 0x80))),
165 | as.raw(c(0x30, 0x80))),
166 | "UTF")
167 |
168 | #2 bytes of 3 byte sequence followed by space
169 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa3, 0x30, 0x80, 0x20))),
170 | as.raw(c(0x30, 0x80, 0x20))),
171 | "UTF")
172 |
173 | # illegal byte
174 | expect_warning(expect_equal(unpackMsg(as.raw(c(0xa1, 0xff))),
175 | as.raw(c(0xff))),
176 | "UTF")
177 | #also for malformed UTF8?
178 |
179 | })
180 |
181 | test_that("always emit strings in UTF8,", {
182 | x <- "fa\xE7ile"
183 | Encoding(x) <- "latin1"
184 | packMsg(x) %is% as.raw(c(0xa7, 0x66, 0x61, 0xc3, 0xa7, 0x69, 0x6c, 0x65))
185 | })
186 |
187 | test_that("use ints for integral floats under 32 bits", {
188 | packMsg(1) %is% packMsg(1L)
189 | length(packMsg(2^32)) %is% 5
190 | length(packMsg(2^32+1)) %is% 9
191 | length(packMsg(-2^31)) %is% 5
192 | length(packMsg(-2^31-1)) %is% 9
193 | })
194 |
195 |
196 | test_that("as_is uses arrays even for singletons", {
197 | length(packMsg(1)) %is% 1
198 | length(packMsg(1, as_is=TRUE)) %is% 2
199 | length(packMsg(list(1, 2, 3))) %is% 4
200 | length(packMsg(list(1, 2, 3), as_is = TRUE)) %is% 7
201 | unpackMsg(packMsg(list(1, 2, 3))) %is% c(1, 2, 3)
202 | unpackMsg(packMsg(list(1, 2, 3), as_is = TRUE)) %is% list(1, 2, 3)
203 | length(packMsg(I(1), as_is=FALSE)) %is% 2
204 | })
205 |
206 |
207 | test_that( "single row data frames also pack with asIs", {
208 | expect_true( length(packMsg(data.frame( a=1, b=2)))
209 | > length(packMsg(list(a=1, b=2))))
210 | })
211 |
212 |
213 | test_that("pack named vectors into dicts", {
214 | unpackMsg(packMsg(list(a=1, b=NULL))) %is% c(a=1, b=NA)
215 | unpackMsg(packMsg(list(a=1, b=NULL), use_dict=FALSE)) %is% c(1, NA)
216 | })
217 |
218 |
219 | test_that("Unpack dicts into envs", {
220 | unpackMsg(packMsg(list2env(list(a=1, b=2)))) %is% c(a=1, b=2)
221 | x <- new.env()
222 | e <- unpackMsg(packMsg(list(a = 1, b = NA)), parent = x)
223 | typeof(e) %is% "environment"
224 | as.list(e) %is% list(a = 1, b = NA)
225 | parent.env(e) %is% x
226 | unpackMsg(packMsg(emptyenv()), parent=emptyenv())
227 | })
228 |
229 |
230 | test_that("non-string dict keys", {
231 | d = as.raw(c(0x82, 0xa1, 0x61, 0x01, 0x02, 0x02))
232 | expect_warning(unpackMsg(d) %is% c(a=1, `2`=2), "string")
233 | d2 = as.raw(c(as.raw(c(0x82, 0xa1, 0x61, 0x92, 0x01, 0x04, 0x92, 0x02, 0x04, 0x00))))
234 | expect_warning(unpackMsg(d2) %is% list(a=c(1, 4), `c(2, 4)` = 0), "string")
235 | })
236 |
237 |
238 | test_that("pack envs into sorted dicts", {
239 | e <- list2env(list(c=3, b=1, d=4, a=2))
240 | unpackMsg(packMsg(e)) %is% c(a=2, b=1, c=3, d=4)
241 | })
242 |
243 |
244 |
245 | test_that("Unpack dicts into envs", {
246 | x <- unpackMsg(packMsg(c(a=2, b=1, c=3, d=4)), parent=environment())
247 | expect_equal(as.list.environment(x, sorted = TRUE),
248 | list(a=2, b=1, c=3, d=4))
249 | })
250 |
251 |
252 | test_that("warn bad var names and discard", {
253 | b <- packMsg(c(a=1, 3, b=4))
254 | expect_warning(e <- unpackMsg(b, parent=environment()), "empty")
255 | as.list.environment(e, all.names=TRUE, sorted=TRUE) %is% list(a=1, b=4)
256 | })
257 |
258 |
259 | test_that("NA names, dots names...", {
260 | x <- list(2, "four", rep(1,6), c(), list())
261 | n <- c("two", NA, "...", "..5", "")
262 | names(x) <- n
263 | names(unpackMsg(packMsg(x))) %is% n
264 | expect_warning(e <- unpackMsg(packMsg(x), parent=environment()))
265 | ls(e, all.names=TRUE) %is% c("NA", "two")
266 | })
267 |
268 |
269 | test_that("detect data frames", {
270 | expect_false(is.data.frame(unpackMsg(packMsg(list(a=1, b=2)))))
271 | expect_true(is.data.frame(unpackMsg(packMsg(list(a=1, b=2), as_is=TRUE))))
272 | expect_true(is.data.frame(unpackMsg(packMsg(list(a=numeric(0))))))
273 | expect_false(is.data.frame(unpackMsg(packMsg(list(a=c(1, 2, 3), b=c(2, 3))))))
274 | })
275 |
276 |
277 | test_that("Raw with names", {
278 | expect_warning(packMsg(structure(as.raw(c(1,2)), names=c("a", "b"))))
279 | })
280 |
281 |
282 | test_that("unpackMsg refusing to simplify", {
283 | unpackMsg(packMsg(list(1, 2))) %is% c(1L, 2L)
284 | unpackMsg(packMsg(list(1, 2)), simplify=FALSE) %is% list(1L, 2L)
285 | })
286 |
287 |
288 | test_that("Homepage example", {
289 | pack_rt(list(compact=TRUE, schema=0),
290 | c(as.raw(c(0x82, 0xa7)),
291 | charToRaw("compact"),
292 | as.raw(c(0xc3, 0xa6)),
293 | charToRaw("schema"),
294 | as.raw(00)))
295 | })
296 |
297 |
298 | test_that("warnings trigger once per message", {
299 | bigint = as.raw(c(0xcf, 0x00, 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01))
300 | length(capture_warnings(unpackMsg(c(as.raw(0x92), bigint, bigint)))) %is% 1
301 | })
302 |
303 |
304 | test_that("limit pending size", {
305 | # the idea is that you may get a message like,
306 | # which if not defended against, will make
307 | # you malloc up all your memory with a small message. So we need to
308 | # limit the number of items we'va allocated, as well. E.G. each
309 | # array we are in the middle of, treat as a promise for at least one
310 | # byte per item in the message.
311 | bad <- packMsg(c(1:5, list(1:10), 6:10))
312 | unpackMsg(bad, max_size = 16)
313 | expect_error(unpackMsg(bad, max_size=15), "long")
314 | })
315 |
316 |
317 | test_that("prevent stack overflows", {
318 | # another attack may be to try to overflow the stack with an indefinitely
319 | # nested array, e.g. 0x919191919191919191.....
320 | bad <- packMsg(list(list(list(1), list(list(2, list(3))))))
321 | expect_error(unpackMsg(bad, max_depth=4), "nest")
322 | unpackMsg(bad, max_depth=5)
323 | })
324 |
325 |
326 | test_that("extension types unpacked as raw with a class attr", {
327 | x <- as.raw(c(0xd4, 0xfe, 0xdd))
328 | expect_warning(d <- unpackMsg(x), "raw")
329 | expect_equal(d, structure(as.raw(0xdd), class="ext-2"))
330 |
331 | x <- as.raw(c(0xd4, 0x7f, 0xdd))
332 | expect_warning(d <- unpackMsg(x), "raw")
333 | expect_equal(d, structure(as.raw(0xdd), class="ext127"))
334 | })
335 |
--------------------------------------------------------------------------------
/tests/testthat/test-streams.R:
--------------------------------------------------------------------------------
1 | context("unpack from streams")
2 | #most of these tests to be implemented using a fifo connection.
3 |
4 | `%is%` <- expect_equal
5 |
6 | msgs <- as.raw(c(0xa5, 0x68, 0x65, 0x6c, 0x6c, 0x6f,
7 | 0xa3, 0x61, 0x6e, 0x64,
8 | 0xa7, 0x67, 0x6f, 0x6f))
9 |
10 | two_ends <- function(f, read_size = 2^16, max_buf = NA) {
11 | fl <- file("", open="w+b", blocking=FALSE)
12 | on.exit(close(fl), add=TRUE)
13 | f(fl, msgConnection(fl))
14 | }
15 |
16 | test_that("consume N messages and return remaining data", {
17 | expect_equal(unpackMsgs(as.raw(1:10)),
18 | list(msgs = as.list(1:10),
19 | remaining = raw(0),
20 | status = "end of input",
21 | bytes_read = 10))
22 |
23 | expect_equal(unpackMsgs(msgs),
24 | list(msgs = list("hello", "and"),
25 | remaining = as.raw(c(0xa7, 0x67, 0x6f, 0x6f)),
26 | status = "buffer underflow",
27 | bytes_read = 10))
28 |
29 | expect_equal(unpackMsgs(as.raw(c(0xa7, 0x6f, 0x6f))),
30 | list(msgs = list(),
31 | remaining = as.raw(c(0xa7, 0x6f, 0x6f)),
32 | status = "buffer underflow",
33 | bytes_read = 0))
34 |
35 | expect_equal(unpackMsgs(as.raw(c(1:10)), 3),
36 | list(msgs = as.list(1:3),
37 | remaining = as.raw(4:10),
38 | status = "ok",
39 | bytes_read = 3))
40 | })
41 |
42 | test_that("packMsgs round trip", {
43 | unpackMsgs(packMsgs(c(list(1:10), 1:10)))$msgs %is%
44 | c(list(1:10), as.list(1:10))
45 | })
46 |
47 | test_that("Errors raised in parsing are caught", {
48 | unpackMsgs(as.raw(c(0xc1)))$status %is% "malformed input"
49 | })
50 |
51 | test_that("read from a connection (blocking)", {
52 |
53 | conn <- msgConnection(rawConnection(msgs, open="r"))
54 | readMsg(conn) %is% "hello"
55 | readMsg(conn) %is% "and"
56 | expect_error(readMsg(conn))
57 | close(conn)
58 |
59 | conn <- msgConnection(rawConnection(msgs, open="r"))
60 | readMsgs(conn) %is% list("hello", "and")
61 | status(conn) %is% "buffer underflow"
62 | close(conn)
63 | })
64 |
65 | test_that("write to a connection", {
66 | conn <- rawConnection(raw(), open="w")
67 | writeMsg(1:10, conn)
68 | writeMsgs(1:10, conn)
69 | expect_equal(
70 | unpackMsgs(rawConnectionValue(conn))$msgs,
71 | c(list(1:10), as.list(1:10)))
72 | close(conn)
73 | })
74 |
75 | test_that("write to and read from connections", {
76 | con <- msgConnection(rawBuffer(raw(0)))
77 | writeMsgs(1:10, con)
78 | readMsgs(con) %is% as.list(1:10)
79 | close(con)
80 | two_ends(function(A, B) {
81 | writeMsgs(1:10, A)
82 | flush(A)
83 | readMsgs(B)
84 | }) %is% as.list(1:10)
85 | })
86 |
87 | test_that("read non-blocking with complete message", {
88 | test <- packMsgs(list("hello", "and"))
89 | conn <- msgConnection(
90 | rawConnection(packMsgs(list("hello", "and", "world")), open="r"),
91 | read_size = length(test))
92 | readMsgs(conn) %is% list("hello", "and", "world")
93 | close(conn)
94 | })
95 |
96 | test_that("read non-blocking with incomplete message", {
97 | two_ends(function(endA, endB) {
98 | writeMsg(1, endA)
99 | partial <- packMsgs(list("here is a partial message", 2))
100 | writeBin(partial[1:10], endA)
101 | flush(endA)
102 | readMsgs(endB) %is% list(1)
103 | status(endB) %is% "buffer underflow"
104 | partial(endB) %is% partial[1:10]
105 | writeBin(partial[11:length(partial)], endA)
106 | flush(endA)
107 | Sys.sleep(0.1)
108 | readMsgs(endB) %is% list("here is a partial message", 2)
109 | })
110 | })
111 |
112 | test_that("underflow at incomplete message (1-process)", {
113 | buf <- rawBuffer()
114 | partial <- packMsgs(list(1, "here is a partial message", 2))
115 | writeRaw(partial[1:10], buf)
116 | con <- msgConnection(buf)
117 | readMsgs(con) %is% list(1)
118 | writeRaw(partial[11:length(partial)], buf)
119 | readMsgs(con) %is% list("here is a partial message", 2)
120 | readMsgs(con) %is% list()
121 | close(con)
122 | })
123 |
124 | test_that("read non-blocking with array breaking over chunks", {
125 | ##this should at least trigger the underflow handler, no?
126 | partial <- packMsgs(list("hello", 1:2))
127 | full <- packMsgs(list("hello", 1:10))
128 | conn <- rawConnection(full, open="r")
129 | conn <- msgConnection(conn, read_size = length(partial))
130 | readMsgs(conn) %is% list("hello", 1:10)
131 | close(conn)
132 | })
133 |
134 | test_that("rawBuffer", {
135 | x <- rawBuffer(as.raw(1:5))
136 | tryCatch({
137 | readRaw(x, 3) %is% as.raw(1:3)
138 | writeRaw(as.raw(6:10), x)
139 | readRaw(x, 5) %is% as.raw(4:8)
140 | readRaw(x, 10) %is% as.raw(9:10)
141 | readRaw(x, 10) %is% raw(0)
142 | writeRaw(as.raw(11:25), x)
143 | readRaw(x, 100) %is% as.raw(11:25)
144 | writeRaw(as.raw(26:27), x)
145 | readRaw(x, 2) %is% as.raw(26:27)
146 | readRaw(x, 0) %is% raw(0)
147 | readRaw(x, 10) %is% raw(0)
148 | }, finally = {
149 | close(x)
150 | })
151 | })
152 |
153 | test_that("read non-blocking and underflow handling when variously interrupted",
154 | {
155 | orig <- list("hello", c("hello", "world"), list("hello", "world", c(1, 2, 3)))
156 | packed <- packMsgs(orig)
157 |
158 | cut <- 28
159 | for (cut in 1:(length(packed) - 1)) {
160 | firstChunk <- packed[1:cut]
161 | secondChunk <- packed[ (cut+1) : (length(packed)) ]
162 | con <- msgConnection(rawBuffer(firstChunk))
163 | expect_error(read1 <- readMsgs(con), NA, info = paste0("at cut ", cut))
164 | writeRaw(secondChunk, con)
165 | expect_error(read2 <- readMsgs(con), NA, info = paste0("at cut ", cut))
166 | expect_equal(c(read1, read2), orig, info = paste0("at cut ", cut))
167 | close(con)
168 | }
169 | })
170 |
171 | test_that("assembling an array > read_size", {
172 | mess <- (1:25) + rep(0, 1000)
173 | c <- msgConnection(rawBuffer(raw(0)), read_size=32)
174 | writeMsg(mess, c)
175 | readMsgs(c) %is% list(mess)
176 | close(c)
177 | })
178 |
179 | test_that("resume from interrupt when message >> read_size",
180 | {
181 | mess <- (1:25) + rep(0, 1000)
182 | con <- msgConnection(rawBuffer(raw(0)), read_size=32)
183 | bin <- packMsg(mess)
184 | writeRaw(bin[1:100], con)
185 | readMsgs(con) %is% list()
186 | writeRaw(bin[101:200], con)
187 | readMsgs(con) %is% list()
188 | writeRaw(bin[201:length(bin)], con)
189 | readMsgs(con) %is% list(mess)
190 | close(con)
191 | })
192 |
193 | test_that("Assembling a string >> read size", {
194 | ## strings are individual messages that stretch over potentially many reads.
195 | mess <- paste0(letters[sample(26, 1000, replace=TRUE)], collapse="")
196 | con <- msgConnection(rawBuffer(raw(0)), read_size=32)
197 | writeMsgs(list(mess), con)
198 | readMsgs(con) %is% list(mess)
199 | expect_equal(seek(con, rw="r"), 1003)
200 | close(con)
201 | })
202 |
203 | test_that("seek method", {
204 | con = msgConnection(rawConnection(msgs, open="r"))
205 | readMsgs(con)
206 | seek(con) %is% 10
207 | close(con)
208 |
209 | con = msgConnection(rawConnection(msgs, open="w"))
210 | writeMsg("hello", con)
211 | seek(con) %is% 6
212 | close(con)
213 |
214 | con = msgConnection(rawBuffer())
215 | writeMsg("hello", con)
216 | expect_error(seek(con))
217 | seek(con, rw = "w") %is% 6
218 | seek(con, rw = "r") %is% 0
219 | readMsg(con)
220 | seek(con, rw = "r") %is% 6
221 | close(con)
222 | })
223 |
224 | `%@%` <- function(x, name) {
225 | attr(x, as.character(substitute(name)))
226 | }
227 |
228 | test_that("large blob to force GC?", {
229 | data <- sample(as.raw(0:255), 0x2000000, TRUE)
230 | con <- msgConnection(rawConnection(raw(0), open="wb"))
231 | packet <- packMsg(data)
232 | writeMsg(data, con)
233 | bytes <- rawConnectionValue(con)
234 | close(con)
235 | con2 <- msgConnection(rawConnection(bytes, open="rb"))
236 | as.read <<- readMsg(con2)
237 | close(con2)
238 | expect_identical(as.read, data)
239 | })
240 |
241 | test_that("smallish blob under gctorture", {
242 | data <- sample(as.raw(0:255), 0x30000, TRUE)
243 | con <- msgConnection(rawConnection(raw(0), open="wb"))
244 | packet <- packMsg(data)
245 | writeMsg(data, con)
246 | bytes <- rawConnectionValue(con)
247 | close(con)
248 | con2 <- msgConnection(rawConnection(bytes, open="rb"))
249 | as.read <- NULL
250 | local({
251 | gctorture(TRUE)
252 | on.exit(gctorture(FALSE))
253 | # getting Error: RAW() can only be applied to a 'raw', not a 'NULL'
254 | # I think perhaps because the buffer gets collected.
255 | # I should hold an external pointer to the buffer, then?
256 | # gctorture triggers an early error "2" so yeah.
257 | # any way to debug GC, though?
258 | as.read <<- readMsg(con2)
259 | })
260 | close(con2)
261 | expect_identical(as.read, data)
262 | })
263 |
264 |
265 | # I'd like to have some tests with reading/writing to a separate
266 | # process. I tried with parallel/mcfork, makeForkCluster, but forking
267 | # seems to cause weirdness with nonblocking. (however this was before
268 | # I sorted out partial reads.) Subprocess package seems to not have
269 | # that prob. but the subprocess that is created doesn't have my
270 | # functions loaded.
271 |
272 | print.raw <- function(x, max.print = getOption("max.print"), ...) {
273 | # Display raw vectors as hex dumps. Debugging use, not exported.
274 | con <- pipe("hexdump -C", "wb")
275 | if (length(x) > max.print) {
276 | writeBin(x[1:max.print], con)
277 | close(con)
278 | cat(paste0("[ reached getOption(\"max.print\") -- omitted ",
279 | length(x) - max.print,
280 | " entries ] \n"))
281 | } else {
282 | writeBin(x, con[1:max.print])
283 | close(con)
284 | }
285 | }
286 |
--------------------------------------------------------------------------------
/vignettes/comparison.html.asis:
--------------------------------------------------------------------------------
1 | %\VignetteIndexEntry{Which Data Format is Fastest Data Format?}
2 | %\VignetteEngine{R.rsp::asis}
3 | %\VignetteKeyword{HTML}
4 | %\VignetteKeyword{vignette}
5 | %\VignetteKeyword{package}
6 |
--------------------------------------------------------------------------------