├── .Rbuildignore
├── DESCRIPTION
├── NAMESPACE
├── R
    └── sas7bdat.R
├── README
├── data
    └── sas7bdat.sources.RData
├── man
    ├── read.sas7bdat.Rd
    └── sas7bdat.sources.Rd
└── vignettes
    ├── reverse-engineering.Rnw
    ├── rst2Rnw.sh
    ├── sas7bdat.Rnw
    └── sas7bdat.rst


/.Rbuildignore:
--------------------------------------------------------------------------------
1 | doc/CCNotes
2 | doc/reverse-engineering.Rnw
3 | doc/rst2Rnw.sh
4 | 


--------------------------------------------------------------------------------
/DESCRIPTION:
--------------------------------------------------------------------------------
 1 | Package: sas7bdat
 2 | Type: Package
 3 | Title: sas7bdat Reverse Engineering Documentation
 4 | Version: 0.8
 5 | Date: 2024-08-28
 6 | Authors@R: c(person("Matt", "Shotwell", role=c("aut", "cre"),
 7 |                    email="matt.shotwell@vanderbilt.edu"),
 8 |              person("Clint", "Cummins", role="ctb",
 9 |                    email="clint@stanford.edu"))
10 | Maintainer: Matt Shotwell <matt.shotwell@vanderbilt.edu>
11 | Description: Documentation and prototypes for the earliest (circa 2010) open-source effort to reverse engineer the sas7bdat file format. The package includes a prototype reader for sas7bdat files. However, newer packages may contain more robust readers for sas7bdat files.
12 | Depends: R (>= 2.10)
13 | License: GPL (>= 2)
14 | LazyLoad: yes
15 | 


--------------------------------------------------------------------------------
/NAMESPACE:
--------------------------------------------------------------------------------
1 | export(read.sas7bdat)
2 | importFrom("utils", "download.file")
3 | 


--------------------------------------------------------------------------------
/R/sas7bdat.R:
--------------------------------------------------------------------------------
  1 | #    Copyright (C) 2015 Matt Shotwell, VUMC
  2 | #
  3 | #    This program is free software; you can redistribute it and/or modify
  4 | #    it under the terms of the GNU General Public License as published by
  5 | #    the Free Software Foundation; either version 2 of the License, or
  6 | #    (at your option) any later version.
  7 | #
  8 | #    This program is distributed in the hope that it will be useful,
  9 | #    but WITHOUT ANY WARRANTY; without even the implied warranty of
 10 | #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 11 | #    GNU General Public License for more details.
 12 | #
 13 | #    You should have received a copy of the GNU General Public License along
 14 | #    with this program; if not, write to the Free Software Foundation, Inc.,
 15 | #    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
 16 | 
 17 | # Download the files listed in sas7bdat.sources (zip files are skipped)
 18 | # ss - data frame with 'url', 'filename' and 'uncompressed' columns
 19 | # path - where to save files; max.size - size limit (bytes)
 20 | download.sas7bdat.sources <-
 21 |   function(ss, path=normalizePath("."), max.size=2^20) {
 22 |     ss <- subset(ss, !grepl("\\.zip$", ss$url) & ss$uncompressed < max.size)
 23 |     if(!file.exists(path)) dir.create(path, recursive=TRUE)
 24 |     # binary mode keeps the files intact; seq_len copes with zero rows
 25 |     fetch <- function(i) download.file(as.character(ss$url[i]),
 26 |         file.path(path, as.character(ss$filename[i])), mode="wb")
 27 |     vapply(seq_len(nrow(ss)), fetch, numeric(1))
 28 | }
 29 | 
 30 | # Compress a file on disk, writing '<desc><ext>' alongside the original
 31 | # desc - file path
 32 | # type - compression type ("gzip", "bzip2", "xz")
 33 | # Returns the path of the compressed file; stops on an unknown 'type'.
 34 | file.compress <- function(desc, type = "gzip") {
 35 |     # map each supported type to its extension and connection opener
 36 |     ext <- switch(type, gzip = ".gz", bzip2 = ".bz2", xz = ".xz",
 37 |         stop("compression 'type' unrecognized"))
 38 |     cfile <- switch(type, gzip = gzfile, bzip2 = bzfile, xz = xzfile)
 39 |     out <- paste0(desc, ext)
 40 |     inp <- file(desc, open="rb")
 41 |     # on.exit closes both connections even when reading/writing fails
 42 |     on.exit(close(inp), add=TRUE)
 43 |     oup <- cfile(out, open="wb")
 44 |     on.exit(close(oup), add=TRUE)
 45 |     # copy in 8192-byte chunks
 46 |     while(length(dat <- readBin(inp, "raw", 2^13)) > 0)
 47 |         writeBin(dat, oup)
 48 |     # the connections are flushed and closed by on.exit as we return
 49 |     out
 50 | }
 51 | 
 52 | # Generate an entry (one-row data frame) for sas7bdat.sources
 53 | # fn - a local file name; the download is saved here
 54 | # url - url of the file
 55 | # Returns FALSE when the download fails. Otherwise the file is compressed
 56 | # with gzip, bzip2 and xz (the compressed copies stay next to 'fn') and
 57 | # parsed with read.sas7bdat; sizes and header fields go into the entry.
 58 | generate.sas7bdat.source <- function(fn, url) {
 59 |     # mode="wb": sas7bdat files are binary, and download.file's default
 60 |     # text mode can corrupt them on Windows
 61 |     dl <- try(download.file(url, fn, mode="wb"))
 62 |     if(inherits(dl, "try-error") || dl != 0)
 63 |         return(FALSE)
 64 |     # sizes (bytes) of the download and of each compressed copy
 65 |     sz <- file.info(fn)$size
 66 |     cat("gzip compress...")
 67 |     fn.gz <- file.compress(fn, "gzip")
 68 |     sz.gz <- file.info(fn.gz)$size
 69 |     cat("done\nbzip2 compress...")
 70 |     fn.bz2 <- file.compress(fn, "bzip2")
 71 |     sz.bz2 <- file.info(fn.bz2)$size
 72 |     cat("done\nxz compress...")
 73 |     fn.xz <- file.compress(fn, "xz")
 74 |     sz.xz <- file.info(fn.xz)$size
 75 |     cat("done\nparsing file...")
 76 |     dat <- try(read.sas7bdat(fn))
 77 |     cat("done\n")
 78 |     if(!inherits(dat, "try-error")) {
 79 |         # copy the header fields before 'dat' is replaced by a status
 80 |         SAS_release <- attr(dat, 'SAS.release')
 81 |         SAS_host    <- attr(dat, 'SAS.host')
 82 |         OS_version  <- attr(dat, 'OS.version')
 83 |         OS_maker    <- attr(dat, 'OS.maker')
 84 |         OS_name     <- attr(dat, 'OS.name')
 85 |         endian      <- attr(dat, 'endian')
 86 |         winunix     <- attr(dat, 'winunix')
 87 |         dat <- "OK"
 88 |     } else {
 89 |         # parsing failed: blank header fields, keep the error message
 90 |         SAS_release <- SAS_host <- OS_version <- OS_maker <- ""
 91 |         OS_name <- endian <- winunix <- ""
 92 |         dat <- dat[1]
 93 |     }
 94 |     # 'message' is "OK" or the parse error; PKGversion is the reader's
 95 |     # VERSION constant
 96 |     data.frame(
 97 |         filename = fn, accessed = Sys.time(), uncompressed = sz,
 98 |         gzip = sz.gz, bzip2 = sz.bz2, xz = sz.xz, url = url,
 99 |         PKGversion = VERSION, message = dat, SASrelease = SAS_release,
100 |         SAShost = SAS_host, OSversion = OS_version, OSmaker = OS_maker,
101 |         OSname = OS_name, endian = endian, winunix = winunix,
102 |         stringsAsFactors=FALSE)
103 | }
104 | 
105 | # Refresh one sas7bdat.sources row; a failed download keeps the old row
106 | update.sas7bdat.source <- function(df) {
107 |     entry <- generate.sas7bdat.source(df$filename, df$url)
108 |     # generate.sas7bdat.source signals failure with a logical (FALSE)
109 |     if(inherits(entry, "logical")) df else entry
110 | }
111 | 
112 | # Update every row of sas7bdat.sources; returns the refreshed data frame
113 | update.sas7bdat.sources <- function(ss) {
114 |     for(i in seq_len(nrow(ss)))  # seq_len: a zero-row 'ss' is a no-op
115 |         ss[i,] <- update.sas7bdat.source(ss[i,])
116 |     ss
117 | }
118 |     
119 | VERSION   <- "0.5"   # NOTE(review): DESCRIPTION declares Version 0.8 - confirm
120 | BUGREPORT <- "please report bugs to maintainer"   # appended to stop()/warning() text
121 | CAUTION   <- "please verify data correctness"     # only used in commented-out code
122 | 
123 | # Subheader 'signatures' (first 4 bytes of a subheader, matched by get_subhs)
124 | SUBH_ROWSIZE <- as.raw(c(0xF7,0xF7,0xF7,0xF7))   # row size subheader
125 | SUBH_COLSIZE <- as.raw(c(0xF6,0xF6,0xF6,0xF6))   # column size subheader
126 | SUBH_COLTEXT <- as.raw(c(0xFD,0xFF,0xFF,0xFF))   # column text subheader
127 | SUBH_COLATTR <- as.raw(c(0xFC,0xFF,0xFF,0xFF))   # column attribute subheader
128 | SUBH_COLNAME <- as.raw(c(0xFF,0xFF,0xFF,0xFF))   # column name subheader
129 | SUBH_COLLABS <- as.raw(c(0xFE,0xFB,0xFF,0xFF))   # column format/label subheader
130 | SUBH_COLLIST <- as.raw(c(0xFE,0xFF,0xFF,0xFF))   # not referenced in this file
131 | SUBH_SUBHCNT <- as.raw(c(0x00,0xFC,0xFF,0xFF))   # not referenced in this file
132 | 
133 | # Page types (read from each page as a 2-byte signed integer)
134 | PAGE_META <- 0
135 | PAGE_DATA <- 256        #1<<8
136 | PAGE_MIX  <- c(512,640) #1<<9,1<<9|1<<7
137 | PAGE_AMD  <- 1024       #1<<10
138 | PAGE_METC <- 16384      #1<<14 (compressed data)
139 | PAGE_COMP <- -28672     #-(1<<14|1<<13|1<<12), i.e. 0x9000 as signed 16-bit
140 | PAGE_MIX_DATA <- c(PAGE_MIX, PAGE_DATA)   # page types scanned for data rows
141 | PAGE_META_MIX_AMD <- c(PAGE_META, PAGE_MIX, PAGE_AMD)   # page types scanned for subheaders
142 | PAGE_ANY  <- c(PAGE_META_MIX_AMD, PAGE_DATA, PAGE_METC, PAGE_COMP)   # every known page type
143 | 
144 | # Translate a page type code into a label. The groups are tested in
145 | # order; codes outside all of them are labelled 'unknown'.
146 | page_type_strng <- function(type) {
147 |     # label -> page type codes carrying that label
148 |     groups <- list(meta = PAGE_META, data = PAGE_DATA,
149 |                    mix = PAGE_MIX, amd = PAGE_AMD)
150 |     for(label in names(groups))
151 |         if(type %in% groups[[label]])
152 |             return(label)
153 |     'unknown'
154 | }
155 | 
156 | # Read the subheaders listed in a page's subheader pointer table
157 | # page - list with 'page', 'type', 'data' (raw) and 'subh_count'
158 | # Returns one list per pointer with 'page', 'offset', 'length' and, when
159 | # length > 0, 'raw' plus its 4-byte 'signature'; list() for other pages.
160 | read_subheaders <- function(page, u64) {
161 |     if(!(page$type %in% PAGE_META_MIX_AMD))
162 |         return(list())
163 |     # page offset of subheader pointers
164 |     oshp <- if(u64) 40 else 24
165 |     # length of subheader pointers
166 |     lshp <- if(u64) 24 else 12
167 |     # length of first two subheader fields
168 |     lshf <- if(u64) 8  else 4
169 |     # seq_len, not 1:n: with zero subheaders 1:0 read two bogus pointers
170 |     lapply(seq_len(max(0, page$subh_count)), function(i) {
171 |         base <- oshp + (i - 1) * lshp
172 |         subh <- list(page = page$page,
173 |                      offset = read_int(page$data, base, lshf),
174 |                      length = read_int(page$data, base + lshf, lshf))
175 |         if(subh$length > 0) {
176 |             subh$raw <- read_raw(page$data, subh$offset, subh$length)
177 |             subh$signature <- read_raw(subh$raw, 0, 4)
178 |         }
179 |         subh
180 |     })
181 | }
182 | 
183 | # Read column names; each 8-byte entry in a column name subheader holds
184 | # (column text subheader index, offset, length) as 2-byte integers.
185 | # Returns a list of list(name=...), one per column, in file order.
186 | read_column_names <- function(col_name, col_text, u64) {
187 |     names <- list()
188 |     offp <- if(u64) 8 else 4
189 |     for(subh in col_name) {
190 |         cmax <- (subh$length - if(u64) 28 else 20)/8
191 |         for(i in seq_len(max(0, cmax))) {  # 1:cmax misreads when cmax < 1
192 |             base <- (if(u64) 16 else 12) + (i-1) * 8
193 |             hdr  <- read_int(subh$raw, base, 2)
194 |             off  <- read_int(subh$raw, base + 2, 2)
195 |             len  <- read_int(subh$raw, base + 4, 2)
196 |             names[[length(names) + 1]] <- list(name =
197 |                 read_str(col_text[[hdr+1]]$raw, off + offp, len))
198 |         }
199 |     }
200 |     names
201 | }
202 | 
203 | # Read column formats and labels from the format/label subheaders.
204 | # Each is located by a (column text subheader index, offset, length)
205 | # triple of 2-byte integers. Returns NULL when 'col_labs' is empty;
206 | # otherwise one list per subheader with the triples ('fhdr', 'foff',
207 | # 'flen', 'lhdr', 'loff', 'llen') and, when the length is positive,
208 | # the decoded 'format' / 'label' strings.
209 | read_column_labels_formats <- function(col_labs, col_text, u64) {
210 |     if(length(col_labs) < 1)
211 |         return(NULL)
212 |     offp  <- if(u64) 8  else 4
213 |     fbase <- if(u64) 46 else 34    # format triple
214 |     lbase <- if(u64) 52 else 40    # label triple
215 |     lapply(seq_along(col_labs), function(i) {
216 |         raw <- col_labs[[i]]$raw
217 |         ent <- list()
218 |         fhdr <- read_int(raw, fbase, 2)
219 |         foff <- read_int(raw, fbase + 2, 2)
220 |         flen <- read_int(raw, fbase + 4, 2)
221 |         if(flen > 0)
222 |             ent$format <- read_str(col_text[[fhdr+1]]$raw, foff + offp, flen)
223 |         ent$fhdr <- fhdr; ent$foff <- foff; ent$flen <- flen
224 |         lhdr <- read_int(raw, lbase, 2)
225 |         loff <- read_int(raw, lbase + 2, 2)
226 |         llen <- read_int(raw, lbase + 4, 2)
227 |         if(llen > 0)
228 |             ent$label <- read_str(col_text[[lhdr+1]]$raw, loff + offp, llen)
229 |         ent$lhdr <- lhdr; ent$loff <- loff; ent$llen <- llen
230 |         ent
231 |     })
232 | }
233 |  
234 | # Read per-column offset, length and type from the column attribute
235 | # subheaders
236 | # col_attr - list of column attribute subheaders ('raw' and 'length')
237 | # Returns one list(offset, length, type) per column, in file order;
238 | # type is "numeric" for type code 1 and "character" otherwise.
239 | read_column_attributes <- function(col_attr, u64) {
240 |     info <- list()
241 |     lcav <- if(u64) 16 else 12    # bytes per column attribute entry
242 |     wid  <- if(u64) 8 else 4      # width of the offset field
243 |     for(subh in col_attr) {
244 |         cmax <- (subh$length - if(u64) 28 else 20)/lcav
245 |         # seq_len (not 1:cmax): a subheader without entries adds nothing
246 |         for(i in seq_len(max(0, cmax))) {
247 |             base <- lcav + (i-1) * lcav
248 |             # type code: 1 byte, 6 bytes after the start of the length field
249 |             code <- read_int(subh$raw, base + wid + 6, 1)
250 |             info[[length(info) + 1]] <- list(
251 |                 offset = read_int(subh$raw, base, wid),
252 |                 length = read_int(subh$raw, base + wid, 4),
253 |                 type   = ifelse(code == 1, "numeric", "character"))
254 |         }
255 |     }
256 |     info
257 | }
258 | 
259 | # Magic number: the 32 bytes read.sas7bdat expects at the start of a file
260 | MAGIC     <- as.raw(c(0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
261 |                       0x00,0x00,0x00,0x00,0xc2,0xea,0x81,0x60,
262 |                       0xb3,0x14,0x11,0xcf,0xbd,0x92,0x08,0x00,
263 |                       0x09,0xc7,0x31,0x8c,0x18,0x1f,0x10,0x11))
264 | 
265 | # TRUE when 'data' (raw) begins with MAGIC
266 | check_magic_number <- function(data) identical(data[seq_along(MAGIC)], MAGIC)
267 | 
268 | # Readers addressed by a zero-based byte offset 'off' and a byte count
269 | # 'len'; extra arguments are forwarded to readBin
270 | read_bin <- function(buf, off, len, type, ...) {
271 |     readBin(buf[off + (1:len)], what=type, n=1, size=len, ...)
272 | }
273 | read_raw <- function(buf, off, len, ...) {
274 |     readBin(buf[off + (1:len)], what="raw", n=len, size=1, ...)
275 | }
276 | read_int <- function(buf, off, len, ...) read_bin(buf, off, len, "integer", ...)
277 | read_str <- function(buf, off, len, ...) read_bin(buf, off, len, "character", ...)
278 | read_flo <- function(buf, off, len, ...) read_bin(buf, off, len, "double", ...)
279 | 
280 | # Keep the subheaders whose signature equals 'signature'; vapply (not
281 | # sapply) so that an empty 'subhs' yields list() instead of an error
282 | get_subhs <- function(subhs, signature) {
283 |     is_sig <- function(subh) identical(subh$signature, signature)
284 |     subhs[vapply(subhs, is_sig, logical(1))]
285 | }
286 | 
287 | # Sometimes there is more than one column attribute subheader.
288 | # In these cases, the column attribute data are spliced together
289 | # so that they appear to have been in the same subheader.
290 | # NOTE(review): the 12/20/8-byte skips match the 32-bit layout only.
291 | splice_col_attr_subheaders <- function(col_attr) {
292 |     raw <- read_raw(col_attr[[1]]$raw, 0, col_attr[[1]]$length - 8)
293 |     for(subh in col_attr[-1])  # 2:length() would misfire on one subheader
294 |         raw <- c(raw, read_raw(subh$raw, 12, subh$length - 20))
295 |     list(raw=raw)
296 | }
297 | 
298 | # Read a sas7bdat file into a data frame (prototype reader)
299 | # file - path or URL (character), or a connection open for reading
300 | # encoding - passed to Encoding() for every character value
301 | # debug - if TRUE, attach this call's environment as attribute 'debug'
302 | # Returns a data frame carrying 'column.info' and file header attributes;
303 | # stops on big endian files and on files containing compressed pages.
304 | read.sas7bdat <- function(file, encoding="", debug=FALSE) {
305 |     if(inherits(file, "connection") && isOpen(file, "read")) {
306 |         con <- file
307 |     } else if (is.character(file)) {
308 |         con <- file(file, "rb")
309 |         # close the connection we opened on every exit, errors included
310 |         on.exit(close(con), add=TRUE)
311 |     } else {
312 |         stop("invalid 'file' argument")
313 |     }
314 | 
315 |     # Check magic number
316 |     header <- readBin(con, "raw", 288, 1)
317 |     if(length(header) < 288)
318 |         stop("header too short (not a sas7bdat file?)")
319 |     if(!check_magic_number(header))
320 |         stop(paste("magic number mismatch", BUGREPORT))
321 | 
322 |     # Check for 32 or 64 bit alignment
323 |     align1 <- read_raw(header, 32, 1)
324 |     if(identical(align1, as.raw(0x33))) {
325 |         align1 <- 4
326 |     } else {
327 |         align1 <- 0
328 |     }
329 | 
330 |     # If align1 == 4, file is u64 type
331 |     u64 <- align1 == 4
332 | 
333 |     align2 <- read_raw(header, 35, 1)
334 |     if(identical(align2, as.raw(0x33))) {
335 |         align2 <- 4
336 |     } else {
337 |         align2 <- 0
338 |     }
339 | 
340 |     endian <- read_raw(header, 37, 1)
341 |     if(identical(endian, as.raw(0x01))) {
342 |         endian <- "little"
343 |     } else {
344 |         stop("big endian files are not supported")
345 |     }
346 | 
347 |     winunix <- read_str(header, 39, 1)
348 |     if(identical(winunix, "1")) {
349 |         winunix <- "unix"
350 |     } else if(identical(winunix, "2")) {
351 |         winunix <- "windows"
352 |     } else {
353 |         winunix <- "unknown"
354 |     }
355 | 
356 |     # Timestamp is epoch 01/01/1960
357 |     datecreated <- read_flo(header, 164+align1, 8)
358 |     datecreated <- datecreated + as.POSIXct("1960/01/01", format="%Y/%m/%d")
359 |     datemodified <- read_flo(header, 172+align1, 8)
360 |     datemodified <- datemodified + as.POSIXct("1960/01/01", format="%Y/%m/%d")
361 | 
362 |     # Read the remaining header
363 |     header_length <- read_int(header, 196 + align2, 4)
364 |     header <- c(header, readBin(con, "raw", header_length-288, 1))
365 |     if(length(header) < header_length)
366 |         stop("header too short (not a sas7bdat file?)")
367 | 
368 |     page_size   <- read_int(header, 200 + align2, 4)
369 |     if(page_size < 0)
370 |         stop(paste("page size is negative", BUGREPORT))
371 | 
372 |     page_count  <- read_int(header, 204 + align2, 4)
373 |     if(page_count < 1)
374 |         stop(paste("page count is not positive", BUGREPORT))
375 | 
376 | 
377 |     SAS_release <- read_str(header, 216 + align1 + align2, 8)
378 | 
379 |     # SAS_host is a 16 byte field, but only the first eight are used
380 |     # FIXME: It would be preferable to eliminate this check
381 |     SAS_host    <- read_str(header, 224 + align1 + align2, 8)
382 | 
383 |     OS_version  <- read_str(header, 240 + align1 + align2, 16)
384 |     OS_maker    <- read_str(header, 256 + align1 + align2, 16)
385 |     OS_name     <- read_str(header, 272 + align1 + align2, 16)
386 | 
387 |     # Read pages
388 |     pages <- list()
389 |     for(page_num in 1:page_count) {
390 |         pages[[page_num]] <- list()
391 |         pages[[page_num]]$page <- page_num
392 |         pages[[page_num]]$data <- readBin(con, "raw", page_size, 1)
393 |         pages[[page_num]]$type <- read_int(pages[[page_num]]$data, if(u64) 32 else 16, 2)
394 |         pages[[page_num]]$type_strng <- page_type_strng(pages[[page_num]]$type)
395 |         pages[[page_num]]$blck_count <- read_int(pages[[page_num]]$data, if(u64) 34 else 18, 2)
396 |         pages[[page_num]]$subh_count <- read_int(pages[[page_num]]$data, if(u64) 36 else 20, 2)
397 |     }
398 | 
399 |     # Read all subheaders
400 |     subhs <- list()
401 |     for(page in pages)
402 |         subhs <- c(subhs, read_subheaders(page, u64))
403 | 
404 |     # Parse row size subheader
405 |     row_size <- get_subhs(subhs, SUBH_ROWSIZE)
406 |     if(length(row_size) != 1)
407 |         stop(paste("found", length(row_size),
408 |             "row size subheaders where 1 expected", BUGREPORT))
409 |     row_size <- row_size[[1]]
410 |     row_length   <- read_int(row_size$raw,
411 |                              if(u64) 40 else 20,
412 |                              if(u64) 8  else 4)
413 |     row_count    <- read_int(row_size$raw,
414 |                              if(u64) 48 else 24,
415 |                              if(u64) 8  else 4)
416 |     col_count_p1 <- read_int(row_size$raw,
417 |                              if(u64) 72 else 36,
418 |                              if(u64) 8  else 4)
419 |     col_count_p2 <- read_int(row_size$raw,
420 |                              if(u64) 80 else 40,
421 |                              if(u64) 8  else 4)
422 |     row_count_fp <- read_int(row_size$raw,
423 |                              if(u64) 120 else 60,
424 |                              if(u64) 8   else 4)
425 | 
426 |     # Parse col size subheader
427 |     col_size <- get_subhs(subhs, SUBH_COLSIZE)
428 |     if(length(col_size) != 1)
429 |         stop(paste("found", length(col_size),
430 |             "column size subheaders where 1 expected", BUGREPORT))
431 |     col_size <- col_size[[1]]
432 |     col_count_6  <- read_int(col_size$raw,
433 |                              if(u64) 8 else 4,
434 |                              if(u64) 8 else 4)
435 |     col_count    <- col_count_6
436 | 
437 |     #if((col_count_p1 + col_count_p2) != col_count_6)
438 |     #    warning(paste("column count mismatch" , CAUTION))
439 | 
440 |     # Read column information
441 |     col_text <- get_subhs(subhs, SUBH_COLTEXT)
442 |     if(length(col_text) < 1)
443 |         stop(paste("no column text subheaders found", BUGREPORT))
444 | 
445 |     # Test for COMPRESS=CHAR compression
446 |     # This test is done earlier at the page level
447 |     #if("SASYZCRL" == read_str(col_text[[1]]$raw, 16, 8))
448 |     #    stop(paste("file uses unsupported CHAR compression"))
449 | 
450 |     col_attr <- get_subhs(subhs, SUBH_COLATTR)
451 |     if(length(col_attr) < 1)
452 |         stop(paste("no column attribute subheaders found", BUGREPORT))
453 | 
454 |     col_attr <- read_column_attributes(col_attr, u64)
455 |     if(length(col_attr) != col_count)
456 |         stop(paste("found", length(col_attr),
457 |             "column attributes where", col_count,
458 |             "expected", BUGREPORT))
459 | 
460 |     col_name <- get_subhs(subhs, SUBH_COLNAME)
461 |     if(length(col_name) < 1)
462 |         stop(paste("no column name subheaders found", BUGREPORT))
463 | 
464 |     col_name <- read_column_names(col_name, col_text, u64)
465 |     if(length(col_name) != col_count)
466 |         stop(paste("found", length(col_name),
467 |             "column names where", col_count, "expected", BUGREPORT))
468 | 
469 |     # Make column names unique, if not already
470 |     col_name_uni <- make.unique(sapply(col_name, function(x)x$name))
471 |     for(i in seq_along(col_name_uni))
472 |         col_name[[i]]$name <- col_name_uni[i]
473 | 
474 |     col_labs <- get_subhs(subhs, SUBH_COLLABS)
475 |     col_labs <- read_column_labels_formats(col_labs, col_text, u64)
476 |     if(is.null(col_labs))  # no format/label subheaders: empty entries
477 |         col_labs <- vector("list", col_count)
478 |     if(length(col_labs) != col_count)
479 |         stop(paste("found", length(col_labs),
480 |             "column formats and labels", col_count, "expected", BUGREPORT))
481 | 
482 |     # Collate column information
483 |     col_info <- list()
484 |     for(i in seq_len(col_count))
485 |         col_info[[i]] <- c(col_name[[i]], col_attr[[i]], col_labs[[i]])
486 | 
487 |     # Check pages for known type
488 |     for(page_num in 1:page_count) {
489 |         if(!(pages[[page_num]]$type %in% PAGE_ANY))
490 |             stop(paste("page", page_num, "has unknown type:",
491 |                 pages[[page_num]]$type, BUGREPORT))
492 |         if(pages[[page_num]]$type %in% c(PAGE_METC, PAGE_COMP))
493 |             stop("file contains compressed data")
494 |     }
495 | 
496 |     # Parse data
497 |     data  <- list()
498 |     for(col in col_info)
499 |         if(col$length > 0)
500 |             data[[col$name]] <- vector(col$type, length=row_count)
501 | 
502 |     row   <- 0
503 |     for(page in pages) {
504 |         #FIXME are there data on pages of type 4?
505 |         if(!(page$type %in% PAGE_MIX_DATA))
506 |             next
507 |         base <- (if(u64) 32 else 16) + 8
508 |         if(page$type %in% PAGE_MIX) {
509 |             row_count_p <- row_count_fp
510 |             # skip subheader pointers
511 |             base <- base + page$subh_count * if(u64) 24 else 12
512 |             base <- base + base %% 8
513 |         } else {
514 |             row_count_p <- read_int(page$data, if(u64) 34 else 18, 2)
515 |         }
516 |         # round up to 8-byte boundary
517 |         base <- ((base+7) %/% 8) * 8 + base %% 8
518 |         if(row_count_p > row_count)
519 |             row_count_p <- row_count
520 |         # seq_len: a page holding zero rows is skipped, where the old
521 |         # (row+1):(row+row_count_p) sequence ran twice, counting backwards
522 |         for(k in seq_len(max(0, row_count_p))) {
523 |             row <- row + 1
524 |             for(col in col_info) {
525 |                 off <- base + col$offset
526 |                 if(col$length > 0) {
527 |                     raw <- read_raw(page$data, off, col$length)
528 |                     if(col$type == "numeric" && col$length < 8) {
529 |                         raw <- c(as.raw(rep(0x00, 8 - col$length)),raw)
530 |                         col$length <- 8
531 |                     }
532 |                     data[[col$name]][row] <- readBin(raw, col$type, 1, col$length)
533 |                     if(col$type == "character") {
534 |                         # Apply encoding
535 |                         Encoding(data[[col$name]][row]) <- encoding
536 |                         # Strip beginning and trailing spaces
537 |                         data[[col$name]][row] <- gsub('^ +| +$', '', data[[col$name]][row])
538 |                     }
539 |                 }
540 |             }
541 |             base <- base + row_length
542 |         }
543 |     }
544 | 
545 |     if(row != row_count)
546 |         warning(paste("found", row, "records where", row_count,
547 |             "expected", BUGREPORT))
548 | 
549 |     data <- as.data.frame(data)
550 |     attr(data, 'pkg.version')   <- VERSION
551 |     attr(data, 'column.info')   <- col_info
552 |     attr(data, 'date.created')  <- datecreated
553 |     attr(data, 'date.modified') <- datemodified
554 |     attr(data, 'SAS.release')   <- SAS_release
555 |     attr(data, 'SAS.host')      <- SAS_host
556 |     attr(data, 'OS.version')    <- OS_version
557 |     attr(data, 'OS.maker')      <- OS_maker
558 |     attr(data, 'OS.name')       <- OS_name
559 |     attr(data, 'endian')        <- endian
560 |     attr(data, 'winunix')       <- winunix
561 |     if(debug)  # environment(), since sys.frame(1) is the outermost caller
562 |         attr(data, 'debug')     <- environment()
563 |     return(data)
564 | }
565 | 


--------------------------------------------------------------------------------
/README:
--------------------------------------------------------------------------------
1 | This package contains documents and software related to a compatibility study of the SAS7BDAT database file format. The 'data/sas7bdat.sources.RData' file references a collection of SAS7BDAT database files freely available from internet resources. The data are not redistributed due to licensing concerns. Files in the collection are used for testing and investigating the SAS7BDAT file format. 
2 |  
3 | The included vignette documents various aspects of the compatibility study, including a detailed description of the binary structure of SAS7BDAT formatted databases. 
4 |   
5 | The R/ directory contains R (www.r-project.org) code implementing a prototype SAS7BDAT file reader.
6 | 


--------------------------------------------------------------------------------
/data/sas7bdat.sources.RData:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/BioStatMatt/sas7bdat/46418b2892d8f30b55ef497f1eb81e9db4822076/data/sas7bdat.sources.RData


--------------------------------------------------------------------------------
/man/read.sas7bdat.Rd:
--------------------------------------------------------------------------------
 1 | \name{read.sas7bdat}
 2 | \alias{read.sas7bdat}
 3 | \title{
 4 | SAS Database Reader (experimental)
 5 | }
 6 | \description{
 7 | Read SAS files in the sas7bdat data format.
 8 | }
 9 | \usage{
10 | read.sas7bdat(file, encoding="", debug=FALSE)
11 | }
12 | \arguments{
13 |   \item{file}{character: Path to a file or a URL. A connection that is already open for reading is also accepted.}
14 |   \item{encoding}{character: Character encoding for strings}
15 |   \item{debug}{logical: Save function environment as attribute of returned object.}
16 | }
17 | \section{Warning}{
18 | The functionality in this package is EXPERIMENTAL. Use at your own risk. For the latest details, see the \sQuote{sas7bdat} vignette (\emph{i.e.}, \code{vignette('sas7bdat')}). 
19 | }
20 | \value{
21 |    A data frame corresponding to the SAS database. The returned data frame has a \code{column.info} attribute that contains additional information about each field in the data frame, and other attributes that describe the file itself (for example \code{date.created} and \code{SAS.release}). The \code{column.info} attribute is a list of lists, each containing the following:
22 |    \item{name}{The field name}
23 |    \item{offset}{The field offset in packed binary row data (bytes)}
24 |    \item{length}{The field length (bytes)}
25 |    \item{type}{The field type, either 'character' or 'numeric'}
26 | When the database specifies a field format and/or label, the following may also be present:
27 |    \item{format}{The field display format}
28 |    \item{label}{The field label (usually a longer description)}
29 | }
30 | \references{
31 | http://biostatmatt.com/archives/tag/sas7bdat
32 | }
33 | \author{
34 | Matt Shotwell
35 | }
36 | 
37 | \examples{
38 | ## see \code{data(sas7bdat.sources)}
39 | }
40 | 


--------------------------------------------------------------------------------
/man/sas7bdat.sources.Rd:
--------------------------------------------------------------------------------
 1 | \name{sas7bdat.sources}
 2 | \alias{sas7bdat.sources}
 3 | \docType{data}
 4 | \title{
 5 | Internet SAS Database Resources
 6 | }
 7 | \description{
 8 | These data are a collection of internet resources for SAS database files in the sas7bdat format.
 9 | }
10 | \usage{data(sas7bdat.sources)}
11 | \format{
12 |   A data frame with records on the following fields:
13 |   \describe{
14 |     \item{\code{filename}}{character, the SAS database filename}
15 |     \item{\code{accessed}}{POSIXct, the date last retrieved}
16 |     \item{\code{uncompressed}}{numeric, file size (bytes)}
17 |     \item{\code{gzip}}{numeric, gzip compressed file size (bytes)}
18 |     \item{\code{bzip2}}{numeric, bzip2 compressed file size (bytes)}
19 |     \item{\code{xz}}{numeric, xz compressed file size (bytes)}
20 |     \item{\code{url}}{character, the Universal Resource Locator}
21 |     \item{\code{PKGversion}}{character, the \pkg{sas7bdat} package version}
22 |     \item{\code{message}}{character, message returned by \code{read.sas7bdat} (if any)}
23 |     \item{\code{SASrelease}}{character, SAS release}
24 |     \item{\code{SAShost}}{character, SAS host platform}
25 |     \item{\code{OSversion}}{character, OS version}
26 |     \item{\code{OSmaker}}{character, OS maker}
27 |     \item{\code{OSname}}{character, OS name}
28 |     \item{\code{endianness}}{character, endianness of header fields}
29 |     \item{\code{winunix}}{character, platform type}
30 |   }
31 | }
32 | \examples{
33 | data(sas7bdat.sources)
34 | }
35 | \keyword{datasets}
36 | 


--------------------------------------------------------------------------------
/vignettes/reverse-engineering.Rnw:
--------------------------------------------------------------------------------
  1 | \documentclass{article}
  2 | \usepackage{cmap} % fix search and cut-and-paste in Acrobat
  3 | \usepackage{ifthen}
  4 | \usepackage[T1]{fontenc}
  5 | \usepackage[utf8]{inputenc}
  6 | \usepackage{alltt}
  7 | \setcounter{secnumdepth}{0}
  8 | \usepackage{longtable,ltcaption,array}
  9 | \setlength{\extrarowheight}{2pt}
 10 | \newlength{\DUtablewidth} % internal use in tables
 11 | \usepackage{natbib}
 12 | %%% Fallback definitions for Docutils-specific commands
 13 | 
 14 | % Provide a length variable and set default, if it is new
 15 | \providecommand*{\DUprovidelength}[2]{
 16 |   \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{}
 17 | }
 18 | 
 19 | % line block environment
 20 | \DUprovidelength{\DUlineblockindent}{2.5em}
 21 | \ifthenelse{\isundefined{\DUlineblock}}{
 22 |   \newenvironment{DUlineblock}[1]{%
 23 |     \list{}{\setlength{\partopsep}{\parskip}
 24 |             \addtolength{\partopsep}{\baselineskip}
 25 |             \setlength{\topsep}{0pt}
 26 |             \setlength{\itemsep}{0.15\baselineskip}
 27 |             \setlength{\parsep}{0pt}
 28 |             \setlength{\leftmargin}{#1}}
 29 |     \raggedright
 30 |   }
 31 |   {\endlist}
 32 | }{}
 33 | 
 34 | \ifthenelse{\isundefined{\hypersetup}}{
 35 |   \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue]{hyperref}
 36 |   \usepackage{bookmark}
 37 |   \urlstyle{same} % normal text font (alternatives: tt, rm, sf)
 38 | }{}
 39 | \hypersetup{
 40 |   pdftitle={Tools and Strategies for Reverse Engineering the Format of Statistical Data Files (Draft)},
 41 | }
 42 | \begin{document}
 43 | 
 44 | \title{Tools and Strategies for Reverse Engineering the Format of Statistical Data Files (Draft)}
 45 | \author{}
 46 | \date{}
 47 | \maketitle
 48 | 
 49 | by:
 50 | 
 51 | \begin{quote}
 52 | \begin{DUlineblock}{0em}
 53 | \item[] Matthew S. Shotwell, PhD
 54 | \item[] Associate Professor
 55 | \item[] Department of Biostatistics
 56 | \item[] Vanderbilt University
 57 | \item[] \href{mailto:matt.shotwell@vanderbilt.edu}{matt.shotwell@vanderbilt.edu}
 58 | \end{DUlineblock}
 59 | \end{quote}
 60 | \section{Introduction}
 61 | Reverse engineering is much like forensics. At some point in the past, a process was conceived and implemented. In the present, the input and output are observable, but the process itself is a {\it black box}, that is, the details of its design and implementation have been lost or concealed. Forensics and reverse engineering both attempt to recover those details.
 62 | 
 63 | %Rewards and risks
 64 | The risks of a reverse engineering venture may be considerable. Indeed, the principal risk is failure to recover the details of a process under study. This is compounded by the consequences that may arise regardless of success or failure. For instance, merely the attempt to reverse engineer certain computer software may be in violation of the associated end-user license agreement. In an extreme example, an attempt to ``crack'' encrypted communications may be illegal.
 65 | 
 66 | %Baggerly and Coombes "forensic bioinformatics"
 67 | 
 68 | %Statistical data files whose format descriptions are distributed under commercial license, or otherwise unpublished, impose a barrier to reproducible research. Because such formats generally require the user to purchase and learn to operate an associated software package, the barriers are financial and practical in nature. In the worst case, the necessary software may cease to be supported, or become unavailable, rendering formatted data inaccessable.
 69 | 
 70 | %This article aims to facilitate the process of reverse engineering to identify format of statistical data files. This content is appropriate for readers with programming skill that is typical of most statisticians. Indeed, the statisitcal programming enviroment R provides a familiar and useful framework for developing a reverse engineering toolkit.
 71 | 
 72 | \section{Prerequisites}
 73 | 
 74 | This discussion assumes familiarity with some modern computer concepts, such as file input and output, and how computers store data in bits and bytes. References are given throughout for further reading on key concepts. 
 75 | 
 76 | Statistical data files are regular computer files that contain structured data, such as a table of records and fields. A data file may also contain metadata, such as a record count or field labels. A file {\it format} is a specification that determines how structured data and metadata are organized into a computer file. Logically, a computer file is simply a sequence of eight-bit binary values (bytes). Hence, a file format describes how structured data ({\it e.g.} integers, text) are (1) represented and (2) serialized in a sequence of bytes.
 77 | 
 78 | Formatted data may be human readable, that is, consisting of bytes that are interpreted as character strings ({\it e.g.}, comma separated values). Encoded data that are not human readable are generically said to have 'binary' formatting ({\it e.g.,} XBase and dBase formats, \cite{XBase2010}).
 79 | 
 80 | %@misc{XBase2010,
 81 | %    author = {Erik Bachmann},
 82 | %    title = {XBase (and dBase) File Format Description},
 83 | %    publisher = {Clickety Click Software},
 84 | %    year = {2010},
 85 | %    note = {This is an electronic document. Date retrieved: August 12, 2011.},
 86 | %    url = {http://www.clicketyclick.dk/databases/xbase/format/index.html}
 87 | %}
 88 | 
 89 | 
 90 | % Perhaps reference Knuth, The Art of Computer Programming (no, don't think this reference is useful)
 91 | 
 92 | %Although \citep{Goldberg}'s popular discussion of computer representations of floating-point numbers is primarily concerned with rounding errors, the article introduces the topic nicely.  
 93 | 
 94 | \section{Basics}
 95 | %counting: decimal, hex, octal, binary
 96 | %macro structure: header-data paradigm, offset-length
 97 | 
 98 | 
 99 | %@article{Goldberg1991,
100 | %    author = {Goldberg, David},
101 | %    title = {What every computer scientist should know about floating-point arithmetic},
102 | %    journal = {ACM Computing Surveys},
103 | %    number = {1},
104 | %    pages = {5--48},
105 | %    volume = {23},
106 | %    year = {1991}
107 | %}
108 | 
109 | %visual and automated detection of floating point data
110 | %%floating point representations
111 | %%statistical properties
112 | 
113 | %visual and automated detection of encoded character data
114 | %%character representations
115 | 
116 | %alignment issues
117 | %%C structures can have different sizes, depending on the compiler and platform
118 | 
119 | \section{The Knockout Strategy}
120 | %This is the strategy of modifying (breaking) fields that have unknown meaning, and then attempting to open the file using software design to open the file. The software may reveal the field's purpose.
121 | 
122 | \section{Deducing Field Width Using Endianness}
123 | When a binary field encodes a multi-byte quantity, it may not be clear how many bytes contribute to the value. For instance, suppose that a multi-byte, unsigned integer field is suspected to be four bytes in length, but is only observed for values less than or equal to $2^{16}-1$. In this case, it is possible that the field is only two bytes in length, and the remaining bytes constitute a separate two-byte field. 
124 | 
125 | If the suspect field is subsequently observed in the opposite endianness, the field length becomes clear. To illustrate, consider the four bytes (in hexadecimal representation) {\tt 01 00 00 00}. Then suppose we observe the same field in opposite endianness. There are several possibilities: (1) {\tt 01 00 00 00} -- the field is single-byte, (2) {\tt 00 01 00 00} -- the field is two-byte, (3) {\tt 00 00 01 00} -- the field is three-byte, and (4) {\tt 00 00 00 01} -- the field is four-byte.
126 | 
127 | If there is concern that bytes at a particular offset may form incomplete parts of adjacent fields, then this test for field width may be misleading. If the byte values observed in opposite endianness cannot be obtained by reordering the original byte values, then these bytes must span two adjacent fields. %if you discover that some files are big endian, this can be used to confirm the length of multi-byte fields that were discovered in little-endian format.
128 | \section{Legality}
129 | Some material from:
130 | https://www.eff.org/issues/coders/reverse-engineering-faq
131 | 
132 | Among intellectual property laws, including copyright and fair use (17 U.S.C. 107), the DMCA (17 U.S.C. section 1201), trade secrets (Uniform Trade Secrets Act with amendments 1985; UTSA), contract law (EULA, TOS, TOU, NDA, developer or API agreement), and the Electronic Communications Privacy Act (18 U.S.C. 2510 et seq.), trade secret law most directly impacts reverse engineering of statistical data files (since copyright (of code) and cracking are not involved; but be careful about TOS and EULAs). However, the UTSA makes explicit that reverse engineering is a {\it proper} means.
133 | 
134 | \end{document}
135 | 


--------------------------------------------------------------------------------
/vignettes/rst2Rnw.sh:
--------------------------------------------------------------------------------
1 | rst2latex sas7bdat.rst > sas7bdat.tex
2 | sed -re '/^%%% User specified packages and stylesheets/a \
3 |     \\usepackage{fullpage}\
4 |     \\usepackage{Sweave}\
5 |     %\\VignetteIndexEntry{sas7bdat}' sas7bdat.tex > sas7bdat.Rnw
6 | rm sas7bdat.tex
7 | 


--------------------------------------------------------------------------------
/vignettes/sas7bdat.Rnw:
--------------------------------------------------------------------------------
   1 | \documentclass[a4paper]{article}
   2 | % generated by Docutils <http://docutils.sourceforge.net/>
   3 | \usepackage{cmap} % fix search and cut-and-paste in Acrobat
   4 | \usepackage{ifthen}
   5 | \usepackage[T1]{fontenc}
   6 | \usepackage[utf8]{inputenc}
   7 | \usepackage{alltt}
   8 | \setcounter{secnumdepth}{0}
   9 | \usepackage{longtable,ltcaption,array}
  10 | \setlength{\extrarowheight}{2pt}
  11 | \newlength{\DUtablewidth} % internal use in tables
  12 | 
  13 | %%% Custom LaTeX preamble
  14 | % PDF Standard Fonts
  15 | \usepackage{mathptmx} % Times
  16 | \usepackage[scaled=.90]{helvet}
  17 | \usepackage{courier}
  18 | 
  19 | %%% User specified packages and stylesheets
  20 |     \usepackage{fullpage}
  21 |     \usepackage{Sweave}
  22 |     %\VignetteIndexEntry{sas7bdat}
  23 | 
  24 | %%% Fallback definitions for Docutils-specific commands
  25 | 
  26 | % Provide a length variable and set default, if it is new
  27 | \providecommand*{\DUprovidelength}[2]{
  28 |   \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{}
  29 | }
  30 | 
  31 | % line block environment
  32 | \DUprovidelength{\DUlineblockindent}{2.5em}
  33 | \ifthenelse{\isundefined{\DUlineblock}}{
  34 |   \newenvironment{DUlineblock}[1]{%
  35 |     \list{}{\setlength{\partopsep}{\parskip}
  36 |             \addtolength{\partopsep}{\baselineskip}
  37 |             \setlength{\topsep}{0pt}
  38 |             \setlength{\itemsep}{0.15\baselineskip}
  39 |             \setlength{\parsep}{0pt}
  40 |             \setlength{\leftmargin}{#1}}
  41 |     \raggedright
  42 |   }
  43 |   {\endlist}
  44 | }{}
  45 | 
  46 | % hyperlinks:
  47 | \ifthenelse{\isundefined{\hypersetup}}{
  48 |   \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue]{hyperref}
  49 |   \usepackage{bookmark}
  50 |   \urlstyle{same} % normal text font (alternatives: tt, rm, sf)
  51 | }{}
  52 | \hypersetup{
  53 |   pdftitle={SAS7BDAT Database Binary Format},
  54 | }
  55 | 
  56 | %%% Body
  57 | \begin{document}
  58 | \title{SAS7BDAT Database Binary Format%
  59 |   \label{sas7bdat-database-binary-format}}
  60 | \author{}
  61 | \date{}
  62 | \maketitle
  63 | 
  64 | by:
  65 | 
  66 | \begin{quote}
  67 | \begin{DUlineblock}{0em}
  68 | \item[] Matthew S. Shotwell, PhD
  69 | \item[] Assistant Professor
  70 | \item[] Department of Biostatistics
  71 | \item[] Vanderbilt University
  72 | \item[] \href{mailto:matt.shotwell@vanderbilt.edu}{matt.shotwell@vanderbilt.edu}
  73 | \end{DUlineblock}
  74 | \end{quote}
  75 | 
  76 | 1/9/2013 update (\textbf{u64} format extensions, Row Size fields, and RLE compression) by:
  77 | 
  78 | \begin{quote}
  79 | \begin{DUlineblock}{0em}
  80 | \item[] Clint Cummins, PhD
  81 | \item[] \href{mailto:clint@stanford.edu}{clint@stanford.edu}
  82 | \end{DUlineblock}
  83 | \end{quote}
  84 | 
  85 | Copyright (C) 2013 is retained by the authors listed above. This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit \url{http://creativecommons.org/licenses/by-sa/3.0/}.
  86 | 
  87 | 
  88 | \section{Contents%
  89 |   \label{contents}%
  90 | }
  91 | 
  92 | \begin{itemize}
  93 | \item \hyperref[introduction]{Introduction}
  94 | 
  95 | \item \hyperref[sas7bdat-header]{SAS7BDAT Header}
  96 | 
  97 | \item \hyperref[sas7bdat-pages]{SAS7BDAT Pages}
  98 | 
  99 | \item \hyperref[sas7bdat-subheaders]{SAS7BDAT Subheaders}
 100 | 
 101 | \item \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data}
 102 | 
 103 | \item \hyperref[platform-differences]{Platform Differences}
 104 | 
 105 | \item \hyperref[compression-data]{Compression Data}
 106 | 
 107 | \item \hyperref[software-prototype]{Software Prototype}
 108 | 
 109 | \item \hyperref[todo]{ToDo}
 110 | \end{itemize}
 111 | 
 112 | 
 113 | \section{Introduction%
 114 |   \label{introduction}%
 115 | }
 116 | 
 117 | The SAS7BDAT file is a binary database storage file. At the time of this writing, no description of the SAS7BDAT file format was publicly available. Hence, users who wish to read and manipulate these files were required to obtain a license for the SAS software, or third party software with support for SAS7BDAT files. The purpose of this document is to promote interoperability between SAS and other popular statistical software packages, especially R (\url{http://www.r-project.org/}).
 118 | 
 119 | The information below was deduced by examining the contents of many SAS7BDAT databases downloaded freely from internet resources (see \texttt{data/sas7bdat.sources.RData}). No guarantee is made regarding its accuracy. No SAS software, nor any other software requiring the purchase of a license was used.
 120 | 
 121 | SAS7BDAT files consist of binary encoded data. Data files encoded in this format often have the extension '.sas7bdat'. The name 'SAS7BDAT' is not official, but is used throughout this document to refer to SAS database files formatted according to the descriptions below.
 122 | 
 123 | There are significant differences in the SAS7BDAT format depending on the operating systems and computer hardware platforms (32bit vs. 64bit). See the section on \hyperref[platform-differences]{platform differences} for more details. The format described below is sufficient to read the entire collection of test files referenced in \texttt{data/sas7bdat.sources.RData} (i.e. files associated with 32bit and some 64bit builds of SAS for Microsoft Windows, and \textbf{u64} SAS versions).  This includes files created with COMPRESS=CHAR. The format described here is probably not sufficient to \textbf{write} SAS7BDAT format files, due to lingering uncertainties.
 124 | 
 125 | The figure below illustrates the overall structure of the SAS7BDAT database. Each file consists of a header (length := HL bytes), followed by PC pages, each of length PL bytes (PC and PL are shorthand for 'page count' and 'page size' respectively, and are used to denote these quantities throughout this document):
 126 | 
 127 | \begin{quote}
 128 | \begin{alltt}
 129 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 130 | |   HL   |  header
 131 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 132 | |   PL   |  page 1
 133 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 134 | |   PL   |  page 2
 135 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 136 |    ...
 137 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 138 | |   PL   |  page PC
 139 | -{}-{}-{}-{}-{}-{}-{}-{}-{}-
 140 | \end{alltt}
 141 | \end{quote}
 142 | 
 143 | Throughout this document, hexadecimal digits are denoted with a preceding 'x', binary digits with a preceding 'b', and decimal digits with no preceding character. For example, see the below \hyperref[table-of-hexadecimal-decimal-and-binary-values]{table of hexadecimal, decimal, and binary values}.
 144 | 
 145 | 
 146 | \section{SAS7BDAT Header%
 147 |   \label{sas7bdat-header}%
 148 | }
 149 | 
 150 | The SAS7BDAT file header contains a binary file identifier (\emph{i.e.}, a magic number), the dataset name, timestamp, the number of pages (PC), their size (PL) and a variety of other values that pertain to the database as a whole. The purpose of many header fields remains unknown, but these fields are likely to include specifications for data compression and encryption, password protection, and dates/times of creation and/or modification. Most files encountered encode multi-byte values little-endian (least significant byte first). However, some files have big-endian values. Hence, it appears that multi-byte values are encoded using the endianness of the platform where the file was written.  See \hyperref[platform-differences]{Platform Differences} for a table of key test files which differ in several ways.
 151 | 
 152 | The \emph{offset table} below describes the SAS7BDAT file header as a sequence of bytes. Information stored in the table is indexed by its byte offset (first column) in the header and its length (second column) in bytes. For example, the field at offset 0 has length 32 bytes. Hence, bytes 0,1,...,31 comprise the data for this field. Byte lengths having the form '\%n' should read: 'the number of bytes remaining up to, but not including byte n'. The fourth column gives a shorthand description of the data contained at the corresponding offset. For example, 'int, page size := PL' indicates that the data stored at the corresponding location is a signed integer representing the page size, which we denote PL. The description \emph{????????????} indicates that the meaning of data stored at the corresponding offset is unknown. The third column represents the author's confidence (low, medium, high) in the corresponding offset, length, and description. Each offset table in this document is formatted in a similar fashion. Variables defined in an offset table are sometimes used in subsequent tables.
 153 | 
 154 | 
 155 | \subsection{Header Offset Table%
 156 |   \label{header-offset-table}%
 157 | }
 158 | 
 159 | \setlength{\DUtablewidth}{\linewidth}%
 160 | \begin{longtable*}{|p{0.114\DUtablewidth}|p{0.056\DUtablewidth}|p{0.056\DUtablewidth}|p{0.724\DUtablewidth}|}
 161 | \hline
 162 | \textbf{%
 163 | offset
 164 | } & \textbf{%
 165 | length
 166 | } & \textbf{%
 167 | conf.
 168 | } & \textbf{%
 169 | description
 170 | } \\
 171 | \hline
 172 | \endfirsthead
 173 | \hline
 174 | \textbf{%
 175 | offset
 176 | } & \textbf{%
 177 | length
 178 | } & \textbf{%
 179 | conf.
 180 | } & \textbf{%
 181 | description
 182 | } \\
 183 | \hline
 184 | \endhead
 185 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
 186 | \endfoot
 187 | \endlastfoot
 188 | 
 189 | 0
 190 |  & 
 191 | 32
 192 |  & 
 193 | high
 194 |  & 
 195 | binary, \hyperref[magic-number]{magic number}
 196 |  \\
 197 | \hline
 198 | 
 199 | 32
 200 |  & 
 201 | 1
 202 |  & 
 203 | high
 204 |  & 
 205 | binary, \hyperref[alignment]{Alignment}: if (byte==x33) a2=4 else a2=0 .  \textbf{u64} is true if a2=4 (unix 64 bit format).
 206 |  \\
 207 | \hline
 208 | 
 209 | 33
 210 |  & 
 211 | 2
 212 |  & 
 213 | low
 214 |  & 
 215 | \emph{????????????}
 216 |  \\
 217 | \hline
 218 | 
 219 | 35
 220 |  & 
 221 | 1
 222 |  & 
 223 | high
 224 |  & 
 225 | binary, \hyperref[alignment]{Alignment}  if (byte==x33) a1=4 else a1=0
 226 |  \\
 227 | \hline
 228 | 
 229 | 36
 230 |  & 
 231 | 1
 232 |  & 
 233 | low
 234 |  & 
 235 | \emph{????????????}
 236 |  \\
 237 | \hline
 238 | 
 239 | 37
 240 |  & 
 241 | 1
 242 |  & 
 243 | high
 244 |  & 
 245 | int, endianness (x01-little {[}Intel{]} x00-big)
 246 |  \\
 247 | \hline
 248 | 
 249 | 38
 250 |  & 
 251 | 1
 252 |  & 
 253 | low
 254 |  & 
 255 | \emph{????????????}
 256 |  \\
 257 | \hline
 258 | 
 259 | 39
 260 |  & 
 261 | 1
 262 |  & 
 263 | medium
 264 |  & 
 265 | ascii, OS type (1-UNIX or 2-WIN).  Does not affect format except for the OS strings.
 266 |  \\
 267 | \hline
 268 | 
 269 | 40
 270 |  & 
 271 | 8
 272 |  & 
 273 | low
 274 |  & 
 275 | \emph{????????????}
 276 |  \\
 277 | \hline
 278 | 
 279 | 48
 280 |  & 
 281 | 8
 282 |  & 
 283 | low
 284 |  & 
 285 | \emph{????????????}
 286 |  \\
 287 | \hline
 288 | 
 289 | 56
 290 |  & 
 291 | 8
 292 |  & 
 293 | low
 294 |  & 
 295 | repeat of 32:32+8
 296 |  \\
 297 | \hline
 298 | 
 299 | 64
 300 |  & 
 301 | 6
 302 |  & 
 303 | low
 304 |  & 
 305 | \emph{????????????}
 306 |  \\
 307 | \hline
 308 | 
 309 | 70
 310 |  & 
 311 | 2
 312 |  & 
 313 | low
 314 |  & 
 315 | int, \hyperref[character-encoding]{Character Encoding}
 316 |  \\
 317 | \hline
 318 | 
 319 | 72
 320 |  & 
 321 | 12
 322 |  & 
 323 | low
 324 |  & 
 325 | \emph{????????????}
 326 |  \\
 327 | \hline
 328 | 
 329 | 84
 330 |  & 
 331 | 8
 332 |  & 
 333 | high
 334 |  & 
 335 | ascii 'SAS FILE'
 336 |  \\
 337 | \hline
 338 | 
 339 | 92
 340 |  & 
 341 | 64
 342 |  & 
 343 | high
 344 |  & 
 345 | ascii, dataset name
 346 |  \\
 347 | \hline
 348 | 
 349 | 156
 350 |  & 
 351 | 8
 352 |  & 
 353 | medium
 354 |  & 
 355 | ascii, file type, e.g. \texttt{'DATA ~ ~'}
 356 |  \\
 357 | \hline
 358 | 
 359 | 164
 360 |  & 
 361 | a1
 362 |  & 
 363 | medium
 364 |  & 
 365 | zero padding when a1=4 .  Aligns the double timestamps below on double word boundaries.
 366 |  \\
 367 | \hline
 368 | 
 369 | 164+a1
 370 |  & 
 371 | 8
 372 |  & 
 373 | high
 374 |  & 
 375 | double, timestamp, date created, secs since 1/1/60 (for SAS version 8.x and higher)
 376 |  \\
 377 | \hline
 378 | 
 379 | 172+a1
 380 |  & 
 381 | 8
 382 |  & 
 383 | high
 384 |  & 
 385 | double, timestamp, date modified, secs since 1/1/60 (for SAS version 8.x and higher)
 386 |  \\
 387 | \hline
 388 | 
 389 | 180+a1
 390 |  & 
 391 | 16
 392 |  & 
 393 | low
 394 |  & 
 395 | \emph{????????????}
 396 |  \\
 397 | \hline
 398 | 
 399 | 196+a1
 400 |  & 
 401 | 4
 402 |  & 
 403 | high
 404 |  & 
 405 | int, length of SAS7BDAT header := HL
 406 |  \\
 407 | \hline
 408 | 
 409 | 200+a1
 410 |  & 
 411 | 4
 412 |  & 
 413 | high
 414 |  & 
 415 | int, page size := %
 416 | \phantomsection\label{pl}PL
 417 |  \\
 418 | \hline
 419 | 
 420 | 204+a1
 421 |  & 
 422 | 4+a2
 423 |  & 
 424 | high
 425 |  & 
 426 | int, page count := PC .  Length 4 or 8 (\textbf{u64}), henceforth denoted \textbf{4|8}
 427 |  \\
 428 | \hline
 429 | 
 430 | 208+a1+a2
 431 |  & 
 432 | 8
 433 |  & 
 434 | low
 435 |  & 
 436 | \emph{????????????}
 437 |  \\
 438 | \hline
 439 | 
 440 | 216+a1+a2
 441 |  & 
 442 | 8
 443 |  & 
 444 | high
 445 |  & 
 446 | ascii, SAS release  (e.g. 9.0101M3 )
 447 |  \\
 448 | \hline
 449 | 
 450 | 224+a1+a2
 451 |  & 
 452 | 16
 453 |  & 
 454 | high
 455 |  & 
 456 | ascii, host  (SAS server type, longest observed string has 9 bytes)
 457 |  \\
 458 | \hline
 459 | 
 460 | 240+a1+a2
 461 |  & 
 462 | 16
 463 |  & 
 464 | high
 465 |  & 
 466 | ascii, OS version number (for UNIX, else null)
 467 |  \\
 468 | \hline
 469 | 
 470 | 256+a1+a2
 471 |  & 
 472 | 16
 473 |  & 
 474 | high
 475 |  & 
 476 | ascii, OS maker or version (SUN, IBM, sometimes WIN)
 477 |  \\
 478 | \hline
 479 | 
 480 | 272+a1+a2
 481 |  & 
 482 | 16
 483 |  & 
 484 | high
 485 |  & 
 486 | ascii, OS name (for UNIX, else null)
 487 |  \\
 488 | \hline
 489 | 
 490 | 288+a1+a2
 491 |  & 
 492 | 32
 493 |  & 
 494 | low
 495 |  & 
 496 | \emph{????????????}
 497 |  \\
 498 | \hline
 499 | 
 500 | 320+a1+a2
 501 |  & 
 502 | 4
 503 |  & 
 504 | low
 505 |  & 
 506 | int, page sequence signature? (value is close to the value at start of each Page Offset Table)
 507 |  \\
 508 | \hline
 509 | 
 510 | 324+a1+a2
 511 |  & 
 512 | 4
 513 |  & 
 514 | low
 515 |  & 
 516 | \emph{????????????}
 517 |  \\
 518 | \hline
 519 | 
 520 | 328+a1+a2
 521 |  & 
 522 | 8
 523 |  & 
 524 | medium
 525 |  & 
 526 | double, 3rd timestamp, sometimes zero
 527 |  \\
 528 | \hline
 529 | 
 530 | 336+a1+a2
 531 |  & 
 532 | \%HL
 533 |  & 
 534 | medium
 535 |  & 
 536 | zeros
 537 |  \\
 538 | \hline
 539 | 
 540 | 1024|8192
 541 |  &  & 
 542 | medium
 543 |  & 
 544 | Total length of header (8192 for \textbf{u64}), HL
 545 |  \\
 546 | \hline
 547 | \end{longtable*}
 548 | 
 549 | The 8 bytes beginning at offset 32 hold information which affects the offset of the 'release' and 'host' information. In particular:
 550 | 
 551 | \begin{enumerate}
 552 | \item The byte at offset 32 defines the \textbf{u64} (unix 64 bit) file format, which affects many field and header lengths (usually via 4 vs. 8 byte integers).
 553 | 
 554 | \item The byte at offset 35 controls an offset before the timestamps.
 555 | 
 556 | \item The byte at offset 37 defines byte ordering of ints and doubles (most test files were created on Windows and use Intel byte ordering; little endian).
 557 | 
 558 | \item The byte at offset 39 appears to distinguish the OS type, where '1' indicates that the file was generated on a UNIX-like system, such as Linux or SunOS, and '2' indicates the file was generated on a Microsoft Windows platform. However, this does not affect any important fields in the file format.
 559 | \end{enumerate}
 560 | 
 561 | The following table describes some of the possible polymorphisms for the 8 bytes at offset 32. The first field lists the name of the file where the sequence was found (see \texttt{data/sas7bdat.sources.RData}), the second lists the eight byte values (hexadecimal), the third field shows bytes 216-239 in ASCII ('.' represents a non-ASCII character or '0'), and the fourth field lists the SAS7BDAT sub-format.
 562 | 
 563 | \setlength{\DUtablewidth}{\linewidth}%
 564 | \begin{longtable*}{|p{0.229\DUtablewidth}|p{0.294\DUtablewidth}|p{0.237\DUtablewidth}|p{0.189\DUtablewidth}|}
 565 | \hline
 566 | \textbf{%
 567 | filename
 568 | } & \textbf{%
 569 | bytes 32-39
 570 | } & \textbf{%
 571 | bytes 216-239
 572 | } & \textbf{%
 573 | format
 574 | } \\
 575 | \hline
 576 | \endfirsthead
 577 | \hline
 578 | \textbf{%
 579 | filename
 580 | } & \textbf{%
 581 | bytes 32-39
 582 | } & \textbf{%
 583 | bytes 216-239
 584 | } & \textbf{%
 585 | format
 586 | } \\
 587 | \hline
 588 | \endhead
 589 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
 590 | \endfoot
 591 | \endlastfoot
 592 | 
 593 | \texttt{compress\_no.sas7bdat}
 594 |  & 
 595 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32}
 596 |  & 
 597 | \texttt{9.0101M3NET\_ASRV........}
 598 |  & 
 599 | Windows Intel
 600 |  \\
 601 | \hline
 602 | 
 603 | \texttt{compress\_yes.sas7bdat}
 604 |  & 
 605 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32}
 606 |  & 
 607 | \texttt{9.0101M3NET\_ASRV........}
 608 |  & 
 609 | Windows Intel
 610 |  \\
 611 | \hline
 612 | 
 613 | \texttt{lowbwt\_i386.sas7bdat}
 614 |  & 
 615 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32}
 616 |  & 
 617 | \texttt{9.0202M0W32\_VSPRO.......}
 618 |  & 
 619 | Windows Intel
 620 |  \\
 621 | \hline
 622 | 
 623 | \texttt{missing\_values.sas7bdat}
 624 |  & 
 625 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32}
 626 |  & 
 627 | \texttt{9.0202M0W32\_VSPRO.......}
 628 |  & 
 629 | Windows Intel
 630 |  \\
 631 | \hline
 632 | 
 633 | \texttt{obs\_all\_perf\_1.sas7bdat}
 634 |  & 
 635 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32}
 636 |  & 
 637 | \texttt{9.0101M3XP\_PRO..........}
 638 |  & 
 639 | Windows Intel
 640 |  \\
 641 | \hline
 642 | 
 643 | \texttt{adsl.sas7bdat}
 644 |  & 
 645 | \texttt{x22 x22 x00 x33 x33 x01 x02 x32}
 646 |  & 
 647 | \texttt{....9.0202M3X64\_ESRV....}
 648 |  & 
 649 | Windows x64 Intel
 650 |  \\
 651 | \hline
 652 | 
 653 | \texttt{eyecarex.sas7bdat}
 654 |  & 
 655 | \texttt{x22 x22 x00 x33 x22 x00 x02 x31}
 656 |  & 
 657 | \texttt{....9.0000M0WIN.........}
 658 |  & 
 659 | Unix non-Intel
 660 |  \\
 661 | \hline
 662 | 
 663 | \texttt{lowbwt\_x64.sas7bdat}
 664 |  & 
 665 | \texttt{x22 x22 x00 x33 x33 x01 x02 x32}
 666 |  & 
 667 | \texttt{....9.0202M2X64\_VSPRO...}
 668 |  & 
 669 | Windows x64 Intel
 670 |  \\
 671 | \hline
 672 | 
 673 | \texttt{natlterr1994.sas7bdat}
 674 |  & 
 675 | \texttt{x33 x22 x00 x33 x33 x00 x02 x31}
 676 |  & 
 677 | \texttt{........9.0101M3SunOS...}
 678 |  & 
 679 | u64 Unix non-Intel
 680 |  \\
 681 | \hline
 682 | 
 683 | \texttt{natlterr2006.sas7bdat}
 684 |  & 
 685 | \texttt{x33 x22 x00 x33 x33 x00 x02 x31}
 686 |  & 
 687 | \texttt{........9.0101M3SunOS...}
 688 |  & 
 689 | u64 Unix non-Intel
 690 |  \\
 691 | \hline
 692 | 
 693 | \texttt{txzips.sas7bdat}
 694 |  & 
 695 | \texttt{x33 x22 x00 x33 x33 x01 x02 x31}
 696 |  & 
 697 | \texttt{........9.0201M0Linux...}
 698 |  & 
 699 | u64 Unix Intel
 700 |  \\
 701 | \hline
 702 | \end{longtable*}
 703 | 
 704 | \phantomsection\label{table-of-hexadecimal-decimal-and-binary-values}
 705 | The binary representation for the hexadecimal values present in the table above are given below.
 706 | 
 707 | \setlength{\DUtablewidth}{\linewidth}%
 708 | \begin{longtable*}{|p{0.145\DUtablewidth}|p{0.098\DUtablewidth}|p{0.168\DUtablewidth}|}
 709 | \hline
 710 | \textbf{%
 711 | hexadecimal
 712 | } & \textbf{%
 713 | decimal
 714 | } & \textbf{%
 715 | binary
 716 | } \\
 717 | \hline
 718 | \endfirsthead
 719 | \hline
 720 | \textbf{%
 721 | hexadecimal
 722 | } & \textbf{%
 723 | decimal
 724 | } & \textbf{%
 725 | binary
 726 | } \\
 727 | \hline
 728 | \endhead
 729 | \multicolumn{3}{c}{\hfill ... continued on next page} \\
 730 | \endfoot
 731 | \endlastfoot
 732 | 
 733 | \texttt{x01}
 734 |  & 
 735 | \texttt{001}
 736 |  & 
 737 | \texttt{b00000001}
 738 |  \\
 739 | \hline
 740 | 
 741 | \texttt{x02}
 742 |  & 
 743 | \texttt{002}
 744 |  & 
 745 | \texttt{b00000010}
 746 |  \\
 747 | \hline
 748 | 
 749 | \texttt{x22}
 750 |  & 
 751 | \texttt{034}
 752 |  & 
 753 | \texttt{b00100010}
 754 |  \\
 755 | \hline
 756 | 
 757 | \texttt{x31}
 758 |  & 
 759 | \texttt{049}
 760 |  & 
 761 | \texttt{b00110001}
 762 |  \\
 763 | \hline
 764 | 
 765 | \texttt{x32}
 766 |  & 
 767 | \texttt{050}
 768 |  & 
 769 | \texttt{b00110010}
 770 |  \\
 771 | \hline
 772 | 
 773 | \texttt{x33}
 774 |  & 
 775 | \texttt{051}
 776 |  & 
 777 | \texttt{b00110011}
 778 |  \\
 779 | \hline
 780 | \end{longtable*}
 781 | 
 782 | 
 783 | \subsubsection{Alignment%
 784 |   \label{alignment}%
 785 | }
 786 | 
 787 | In files generated by 64 bit builds of SAS, 'alignment' means that all data field offsets containing doubles or 8 byte ints should be a multiple of 8 bytes. For files generated by 32 bit builds of SAS, the alignment is 4 bytes. Because \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data} may contain double precision values, it appears that all data rows are 64 bit aligned, regardless of whether the file was written with a 32 bit or 64 bit build of SAS. Alignment of data structures according to the platform word length (4 bytes for 32 bit, and 8 bytes for 64 bit architectures) facilitates efficient operations on data stored in memory. It also suggests that parts of the SAS7BDAT data file format are platform dependent. One theory is that the SAS implementation utilizes a common C or C++ structure or class to reference data stored in memory. When compiled, these structures are aligned according to the word length of the target platform. Of course, when SAS was originally written, platform differences may not have been foreseeable. Hence, these inconsistencies may not have been intentional.
 788 | 
 789 | 
 790 | \subsubsection{Magic Number%
 791 |   \label{magic-number}%
 792 | }
 793 | 
 794 | The SAS7BDAT magic number is the following 32 byte (hex) sequence:
 795 | 
 796 | \begin{quote}
 797 | \begin{alltt}
 798 | x00 x00 x00 x00   x00 x00 x00 x00
 799 | x00 x00 x00 x00   xc2 xea x81 x60
 800 | xb3 x14 x11 xcf   xbd x92 x08 x00
 801 | x09 xc7 x31 x8c   x18 x1f x10 x11
 802 | \end{alltt}
 803 | \end{quote}
 804 | 
 805 | In all test files except one (not listed in \texttt{data/sas7bdat.sources.RData}), the magic number above holds. The one anomalous file has the following magic number:
 806 | 
 807 | \begin{quote}
 808 | \begin{alltt}
 809 | x00 x00 x00 x00   x00 x00 x00 x00
 810 | x00 x00 x00 x00   x00 x00 x00 x00
 811 | x00 x00 x00 x00   x00 x00 x00 x00
 812 | x00 x00 x00 x00   x18 x1f x10 x11
 813 | \end{alltt}
 814 | \end{quote}
 815 | 
 816 | In addition, the anomalous file is associated with the SAS release \textquotedbl{}3.2TK\textquotedbl{}. Indeed, this file may not have been written by SAS. Otherwise, the anomalous file appears to be formatted similarly to other test files.
 817 | 
 818 | 
 819 | \subsubsection{Character Encoding%
 820 |   \label{character-encoding}%
 821 | }
 822 | 
 823 | The integer (one or two bytes) at header offset 70 (bytes) indicates the character encoding of string data. The table below lists the values that are known to occur and the associated character encoding.
 824 | 
 825 | \setlength{\DUtablewidth}{\linewidth}%
 826 | \begin{longtable*}{|p{0.179\DUtablewidth}|p{0.179\DUtablewidth}|p{0.168\DUtablewidth}|}
 827 | \hline
 828 | \textbf{%
 829 | bytes 70-72
 830 | } & \textbf{%
 831 | SAS name
 832 | } & \textbf{%
 833 | iconv name
 834 | } \\
 835 | \hline
 836 | \endfirsthead
 837 | \hline
 838 | \textbf{%
 839 | bytes 70-72
 840 | } & \textbf{%
 841 | SAS name
 842 | } & \textbf{%
 843 | iconv name
 844 | } \\
 845 | \hline
 846 | \endhead
 847 | \multicolumn{3}{c}{\hfill ... continued on next page} \\
 848 | \endfoot
 849 | \endlastfoot
 850 | 
 851 | 0
 852 |  & 
 853 | (Unspecified)
 854 |  & 
 855 | (Unspecified)
 856 |  \\
 857 | \hline
 858 | 
 859 | 20
 860 |  & 
 861 | utf-8
 862 |  & 
 863 | UTF-8
 864 |  \\
 865 | \hline
 866 | 
 867 | 28
 868 |  & 
 869 | us-ascii
 870 |  & 
 871 | US-ASCII
 872 |  \\
 873 | \hline
 874 | 
 875 | 29
 876 |  & 
 877 | latin1
 878 |  & 
 879 | ISO-8859-1
 880 |  \\
 881 | \hline
 882 | 
 883 | 30
 884 |  & 
 885 | latin2
 886 |  & 
 887 | ISO-8859-2
 888 |  \\
 889 | \hline
 890 | 
 891 | 31
 892 |  & 
 893 | latin3
 894 |  & 
 895 | ISO-8859-3
 896 |  \\
 897 | \hline
 898 | 
 899 | 34
 900 |  & 
 901 | arabic
 902 |  & 
 903 | ISO-8859-6
 904 |  \\
 905 | \hline
 906 | 
 907 | 36
 908 |  & 
 909 | hebrew
 910 |  & 
 911 | ISO-8859-8
 912 |  \\
 913 | \hline
 914 | 
 915 | 39
 916 |  & 
 917 | thai
 918 |  & 
 919 | ISO-8859-11
 920 |  \\
 921 | \hline
 922 | 
 923 | 40
 924 |  & 
 925 | latin5
 926 |  & 
 927 | ISO-8859-9
 928 |  \\
 929 | \hline
 930 | 
 931 | 60
 932 |  & 
 933 | wlatin2
 934 |  & 
 935 | WINDOWS-1250
 936 |  \\
 937 | \hline
 938 | 
 939 | 61
 940 |  & 
 941 | wcyrillic
 942 |  & 
 943 | WINDOWS-1251
 944 |  \\
 945 | \hline
 946 | 
 947 | 62
 948 |  & 
 949 | wlatin1
 950 |  & 
 951 | WINDOWS-1252
 952 |  \\
 953 | \hline
 954 | 
 955 | 63
 956 |  & 
 957 | wgreek
 958 |  & 
 959 | WINDOWS-1253
 960 |  \\
 961 | \hline
 962 | 
 963 | 64
 964 |  & 
 965 | wturkish
 966 |  & 
 967 | WINDOWS-1254
 968 |  \\
 969 | \hline
 970 | 
 971 | 65
 972 |  & 
 973 | whebrew
 974 |  & 
 975 | WINDOWS-1255
 976 |  \\
 977 | \hline
 978 | 
 979 | 66
 980 |  & 
 981 | warabic
 982 |  & 
 983 | WINDOWS-1256
 984 |  \\
 985 | \hline
 986 | 
 987 | 119
 988 |  & 
 989 | euc-tw
 990 |  & 
 991 | EUC-TW
 992 |  \\
 993 | \hline
 994 | 
 995 | 123
 996 |  & 
 997 | big5
 998 |  & 
 999 | BIG-5
1000 |  \\
1001 | \hline
1002 | 
1003 | 125
1004 |  & 
1005 | euc-cn
1006 |  & 
1007 | EUC-CN
1008 |  \\
1009 | \hline
1010 | 
1011 | 134
1012 |  & 
1013 | euc-jp
1014 |  & 
1015 | EUC-JP
1016 |  \\
1017 | \hline
1018 | 
1019 | 138
1020 |  & 
1021 | shift-jis
1022 |  & 
1023 | SHIFT-JIS
1024 |  \\
1025 | \hline
1026 | 
1027 | 140
1028 |  & 
1029 | euc-kr
1030 |  & 
1031 | EUC-KR
1032 |  \\
1033 | \hline
1034 | \end{longtable*}
1035 | 
1036 | When the encoding is unspecified, the file uses the encoding of the SAS session that produced it (usually Windows-1252).
1037 | 
1038 | 
1039 | \section{SAS7BDAT Pages%
1040 |   \label{sas7bdat-pages}%
1041 | }
1042 | 
1043 | Following the SAS7BDAT header are pages of data. Each page can be one of (at least) four types. The first three are those that contain meta-information (e.g. field/column attributes), packed binary data, or a combination of both. These types are denoted 'meta', 'data', and 'mix' respectively. Meta-information is required to correctly interpret the packed binary information. Hence, this information must be parsed first. In test files, 'meta' and 'mix' pages always precede 'data' pages. In some test data files, there is a fourth page type, denoted 'amd' which appears to encode additional meta information. This page usually occurs last, and appears to contain amended meta information.
1044 | 
1045 | The \hyperref[page-offset-table]{page offset table} below describes each page type. Byte offsets appended with one of '(meta/mix)', '(mix)', or '(data)' indicate that the corresponding length and description apply only to pages of the listed type. Provisionally, the internal structure of the 'amd' page type is considered identical to the 'meta' page type.
1046 | 
1047 | 
1048 | \subsection{Page Offset Table%
1049 |   \label{page-offset-table}%
1050 | }
1051 | 
1052 | \setlength{\DUtablewidth}{\linewidth}%
1053 | \begin{longtable*}{|p{0.146\DUtablewidth}|p{0.146\DUtablewidth}|p{0.071\DUtablewidth}|p{0.587\DUtablewidth}|}
1054 | \hline
1055 | \textbf{%
1056 | offset
1057 | } & \textbf{%
1058 | length
1059 | } & \textbf{%
1060 | conf.
1061 | } & \textbf{%
1062 | description
1063 | } \\
1064 | \hline
1065 | \endfirsthead
1066 | \hline
1067 | \textbf{%
1068 | offset
1069 | } & \textbf{%
1070 | length
1071 | } & \textbf{%
1072 | conf.
1073 | } & \textbf{%
1074 | description
1075 | } \\
1076 | \hline
1077 | \endhead
1078 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
1079 | \endfoot
1080 | \endlastfoot
1081 | 
1082 | 0
1083 |  & 
1084 | 4
1085 |  & 
1086 | low
1087 |  & 
1088 | int, page sequence signature?
1089 |  \\
1090 | \hline
1091 | 
1092 | 4
1093 |  & 
1094 | 12|28
1095 |  & 
1096 | low
1097 |  & 
1098 | \emph{????????????} length 12 or 28 (\textbf{u64})
1099 |  \\
1100 | \hline
1101 | 
1102 | B
1103 |  & 
1104 | 2
1105 |  & 
1106 | medium
1107 |  & 
1108 | int, bit field \hyperref[page-type]{page type} := \_PGTYPE; B = 16|32
1109 |  \\
1110 | \hline
1111 | 
1112 | B+2
1113 |  & 
1114 | 2
1115 |  & 
1116 | medium
1117 |  & 
1118 | int, data block count := %
1119 | \phantomsection\label{bc}BC
1120 |  \\
1121 | \hline
1122 | 
1123 | B+4
1124 |  & 
1125 | 2
1126 |  & 
1127 | medium
1128 |  & 
1129 | int, \hyperref[subheader-pointers]{subheader pointers} count := %
1130 | \phantomsection\label{sc}SC <= \hyperref[bc]{BC}
1131 |  \\
1132 | \hline
1133 | 
1134 | B+6
1135 |  & 
1136 | 2
1137 |  & 
1138 | low
1139 |  & 
1140 | \emph{????????????}
1141 |  \\
1142 | \hline
1143 | 
1144 | B+8
1145 |  & 
1146 | SC*SL
1147 |  & 
1148 | medium
1149 |  & 
1150 | SC \hyperref[subheader-pointers]{subheader pointers}, SL = 12|24
1151 |  \\
1152 | \hline
1153 | 
1154 | B+8+SC*SL
1155 |  & 
1156 | DL
1157 |  & 
1158 | medium
1159 |  & 
1160 | if NRD>0, padding to the next 8 byte boundary; DL = (8 - (B+8+SC*SL) \% 8) \% 8
1161 |  \\
1162 | \hline
1163 | 
1164 | B+8+SC*SL+DL
1165 |  & 
1166 | RC*\hyperref[rl]{RL}
1167 |  & 
1168 | medium
1169 |  & 
1170 | \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT packed binary data} data row count := RC = (BC-SC)
1171 |  \\
1172 | \hline
1173 | 
1174 | C
1175 |  & 
1176 | \%PL
1177 |  & 
1178 | medium
1179 |  & 
1180 | subheader data and/or filler; C = (B+8+SC*SL+DL+RC*RL)
1181 |  \\
1182 | \hline
1183 | \end{longtable*}
1184 | 
1185 | 
1186 | \subsubsection{Page Type%
1187 |   \label{page-type}%
1188 | }
1189 | 
1190 | \setlength{\DUtablewidth}{\linewidth}%
1191 | \begin{longtable*}{|p{0.070\DUtablewidth}|p{0.051\DUtablewidth}|p{0.107\DUtablewidth}|p{0.386\DUtablewidth}|p{0.340\DUtablewidth}|}
1192 | \hline
1193 | \textbf{%
1194 | PGTYPE
1195 | } & \textbf{%
1196 | name
1197 | } & \textbf{%
1198 | subheaders
1199 | } & \textbf{%
1200 | uncompressed row data (after subheaders)
1201 | } & \textbf{%
1202 | compressed row data (in subheaders)
1203 | } \\
1204 | \hline
1205 | \endfirsthead
1206 | \hline
1207 | \textbf{%
1208 | PGTYPE
1209 | } & \textbf{%
1210 | name
1211 | } & \textbf{%
1212 | subheaders
1213 | } & \textbf{%
1214 | uncompressed row data (after subheaders)
1215 | } & \textbf{%
1216 | compressed row data (in subheaders)
1217 | } \\
1218 | \hline
1219 | \endhead
1220 | \multicolumn{5}{c}{\hfill ... continued on next page} \\
1221 | \endfoot
1222 | \endlastfoot
1223 | 
1224 | 0
1225 |  & 
1226 | meta
1227 |  & 
1228 | yes (SC>0)
1229 |  & 
1230 | no  (BC=SC)
1231 |  & 
1232 | yes
1233 |  \\
1234 | \hline
1235 | 
1236 | 256
1237 |  & 
1238 | data
1239 |  & 
1240 | no  (SC=0)
1241 |  & 
1242 | yes (RC=BC)
1243 |  & 
1244 | no
1245 |  \\
1246 | \hline
1247 | 
1248 | 512
1249 |  & 
1250 | mix
1251 |  & 
1252 | yes (SC>0)
1253 |  & 
1254 | yes (RC=BC-SC)
1255 |  & 
1256 | no
1257 |  \\
1258 | \hline
1259 | 
1260 | 1024
1261 |  & 
1262 | amd
1263 |  & 
1264 | yes?
1265 |  & 
1266 | yes?
1267 |  & 
1268 | no?
1269 |  \\
1270 | \hline
1271 | 
1272 | 16384
1273 |  & 
1274 | meta
1275 |  & 
1276 | yes (SC>0)
1277 |  & 
1278 | no (BC=SC)
1279 |  & 
1280 | yes
1281 |  \\
1282 | \hline
1283 | 
1284 | -28672
1285 |  & 
1286 | comp
1287 |  & 
1288 | no
1289 |  & 
1290 | no
1291 |  & 
1292 | no
1293 |  \\
1294 | \hline
1295 | \end{longtable*}
1296 | 
1297 | There are at least four page types: 'meta', 'data', 'mix', and 'amd'. These types are encoded in the most significant byte of a two byte bit field at page offset 16|32. If no bit is set, the following page is of type 'meta'. If the first, second, or third bits are set, then the page is of type 'data', 'mix', or 'amd', respectively. Hence, if the two bytes are interpreted as an unsigned integer, then the 'meta', 'data', 'mix', and 'amd' types correspond to 0, 256, 512, and 1024, respectively. In compressed files, other bits (and sometimes multiple bits) have been set (e.g., \texttt{1 <{}< 15 | 1 <{}< 12}, which is \texttt{-28672} signed, or \texttt{36864} unsigned). However, the pattern is unclear.
1298 | 
1299 | If a page is of type 'meta', 'mix', or 'amd', data beginning at offset byte 24|40 are a sequence of SC SL-byte \hyperref[subheader-pointers]{subheader pointers}, which point to an offset farther down the page. \hyperref[sas7bdat-subheaders]{SAS7BDAT Subheaders} stored at these offsets hold meta information about the database, including the column names, labels, and types.
1300 | If a page is of type 'mix', then \textbf{packed binary data begin at the next 8 byte boundary following the last subheader pointer}. In this case, the data begin at offset B+8+SC*SL+DL, where DL = (8 - (B+8+SC*SL) \% 8) \% 8 is the number of padding bytes, and '\%' is the modulo operator.
1301 | 
1302 | If a page is of type 'data', then packed binary data begin at offset 24|40.
1303 | 
1304 | The 'comp' page was observed as page 2 of the compress\_yes.sas7bdat test file (not distributed with the \texttt{sas7bdat} package). It has BC and SC fields, but no subheader pointers. It contains some initial data and 2 tables. The first table has many rows of length 24; its purpose is unknown. The second table has one entry per data page with the page number and the number of data rows on the page for SC pages. It could be used to access a particular row without reading all preceding data pages.
1305 | 
1306 | 
1307 | \subsubsection{Subheader Pointers%
1308 |   \label{subheader-pointers}%
1309 | }
1310 | 
1311 | The subheader pointers encode information about the offset and length of subheaders relative to the beginning of the page where the subheader pointer is located. The purpose of the last four bytes of the subheader pointer is uncertain, but they may indicate that additional subheader pointers are to be found on the next page, or that the corresponding subheader is not crucial.
1312 | 
1313 | \setlength{\DUtablewidth}{\linewidth}%
1314 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.633\DUtablewidth}|}
1315 | \hline
1316 | \textbf{%
1317 | offset
1318 | } & \textbf{%
1319 | length
1320 | } & \textbf{%
1321 | conf.
1322 | } & \textbf{%
1323 | description
1324 | } \\
1325 | \hline
1326 | \endfirsthead
1327 | \hline
1328 | \textbf{%
1329 | offset
1330 | } & \textbf{%
1331 | length
1332 | } & \textbf{%
1333 | conf.
1334 | } & \textbf{%
1335 | description
1336 | } \\
1337 | \hline
1338 | \endhead
1339 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
1340 | \endfoot
1341 | \endlastfoot
1342 | 
1343 | 0
1344 |  & 
1345 | 4|8
1346 |  & 
1347 | high
1348 |  & 
1349 | int, offset from page start to subheader
1350 |  \\
1351 | \hline
1352 | 
1353 | 4|8
1354 |  & 
1355 | 4|8
1356 |  & 
1357 | high
1358 |  & 
1359 | int, length of subheader := %
1360 | \phantomsection\label{ql}QL
1361 |  \\
1362 | \hline
1363 | 
1364 | 8|16
1365 |  & 
1366 | 1
1367 |  & 
1368 | medium
1369 |  & 
1370 | int, compression := %
1371 | \phantomsection\label{comp}COMP
1372 |  \\
1373 | \hline
1374 | 
1375 | 9|17
1376 |  & 
1377 | 1
1378 |  & 
1379 | low
1380 |  & 
1381 | int, subheader type := ST
1382 |  \\
1383 | \hline
1384 | 
1385 | 10|18
1386 |  & 
1387 | 2|6
1388 |  & 
1389 | low
1390 |  & 
1391 | zeroes
1392 |  \\
1393 | \hline
1394 | 
1395 | 12|24
1396 |  &  & 
1397 | high
1398 |  & 
1399 | Total length of subheader pointer 12|24 (\textbf{u64}), SL
1400 |  \\
1401 | \hline
1402 | \end{longtable*}
1403 | 
1404 | QL is sometimes zero, which indicates that no data is referenced by the corresponding subheader pointer. When this occurs, the subheader pointer may be ignored.
1405 | 
1406 | \setlength{\DUtablewidth}{\linewidth}%
1407 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.493\DUtablewidth}|}
1408 | \hline
1409 | \textbf{%
1410 | \hyperref[comp]{COMP}
1411 | } & \textbf{%
1412 | description
1413 | } \\
1414 | \hline
1415 | \endfirsthead
1416 | \hline
1417 | \textbf{%
1418 | \hyperref[comp]{COMP}
1419 | } & \textbf{%
1420 | description
1421 | } \\
1422 | \hline
1423 | \endhead
1424 | \multicolumn{2}{c}{\hfill ... continued on next page} \\
1425 | \endfoot
1426 | \endlastfoot
1427 | 
1428 | 0
1429 |  & 
1430 | uncompressed
1431 |  \\
1432 | \hline
1433 | 
1434 | 1
1435 |  & 
1436 | truncated (ignore data)
1437 |  \\
1438 | \hline
1439 | 
1440 | 4
1441 |  & 
1442 | RLE compressed row data with control byte
1443 |  \\
1444 | \hline
1445 | \end{longtable*}
1446 | 
1447 | \setlength{\DUtablewidth}{\linewidth}%
1448 | \begin{longtable*}{|p{0.056\DUtablewidth}|p{0.884\DUtablewidth}|}
1449 | \hline
1450 | \textbf{%
1451 | ST
1452 | } & \textbf{%
1453 | subheaders
1454 | } \\
1455 | \hline
1456 | \endfirsthead
1457 | \hline
1458 | \textbf{%
1459 | ST
1460 | } & \textbf{%
1461 | subheaders
1462 | } \\
1463 | \hline
1464 | \endhead
1465 | \multicolumn{2}{c}{\hfill ... continued on next page} \\
1466 | \endfoot
1467 | \endlastfoot
1468 | 
1469 | 0
1470 |  & 
1471 | Row Size, Column Size, Subheader Counts, Column Format and Label, in Uncompressed file
1472 |  \\
1473 | \hline
1474 | 
1475 | 1
1476 |  & 
1477 | Column Text, Column Names, Column Attributes, Column List
1478 |  \\
1479 | \hline
1480 | 
1481 | 1
1482 |  & 
1483 | all subheaders (including row data), in Compressed file.
1484 |  \\
1485 | \hline
1486 | \end{longtable*}
1487 | 
1488 | 
1489 | \section{SAS7BDAT Subheaders%
1490 |   \label{sas7bdat-subheaders}%
1491 | }
1492 | 
1493 | Subheaders contain meta information regarding the SAS7BDAT database, including row and column counts, column names, labels, and types. Each subheader is associated with a four- or eight-byte 'signature' (\textbf{u64}) that identifies the subheader type, and hence, how it should be parsed.
1494 | 
1495 | 
1496 | \subsection{Row Size Subheader%
1497 |   \label{row-size-subheader}%
1498 | }
1499 | 
1500 | The row size subheader holds the row length (in bytes), the total row count, and the maximum row count on a page of type 'mix'.  Fields at offset 28|56 and higher are not needed to read the file, but are documented here for completeness.  The four test files used for example data in the higher fields are \texttt{eyecarex.sas7bdat}, \texttt{acadindx.sas7bdat}, \texttt{natlterr1994.sas7bdat}, \texttt{txzips.sas7bdat} (non-Intel/Intel x regular/u64).
1501 | 
1502 | \setlength{\DUtablewidth}{\linewidth}%
1503 | \begin{longtable*}{|p{0.076\DUtablewidth}|p{0.076\DUtablewidth}|p{0.055\DUtablewidth}|p{0.743\DUtablewidth}|}
1504 | \hline
1505 | \textbf{%
1506 | offset
1507 | } & \textbf{%
1508 | length
1509 | } & \textbf{%
1510 | conf.
1511 | } & \textbf{%
1512 | description
1513 | } \\
1514 | \hline
1515 | \endfirsthead
1516 | \hline
1517 | \textbf{%
1518 | offset
1519 | } & \textbf{%
1520 | length
1521 | } & \textbf{%
1522 | conf.
1523 | } & \textbf{%
1524 | description
1525 | } \\
1526 | \hline
1527 | \endhead
1528 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
1529 | \endfoot
1530 | \endlastfoot
1531 | 
1532 | 0
1533 |  & 
1534 | 4|8
1535 |  & 
1536 | high
1537 |  & 
1538 | binary, signature xF7F7F7F7|xF7F7F7F700000000
1539 |  \\
1540 | \hline
1541 | 
1542 | 4|8
1543 |  & 
1544 | 16|32
1545 |  & 
1546 | low
1547 |  & 
1548 | \emph{????????????}
1549 |  \\
1550 | \hline
1551 | 
1552 | 20|40
1553 |  & 
1554 | 4|8
1555 |  & 
1556 | high
1557 |  & 
1558 | int, row length (in bytes) := %
1559 | \phantomsection\label{rl}RL
1560 |  \\
1561 | \hline
1562 | 
1563 | 24|48
1564 |  & 
1565 | 4|8
1566 |  & 
1567 | high
1568 |  & 
1569 | int, total row count := TRC
1570 |  \\
1571 | \hline
1572 | 
1573 | 28|56
1574 |  & 
1575 | 8|16
1576 |  & 
1577 | low
1578 |  & 
1579 | \emph{????????????}
1580 |  \\
1581 | \hline
1582 | 
1583 | 36|72
1584 |  & 
1585 | 4|8
1586 |  & 
1587 | medium
1588 |  & 
1589 | int, number of \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader} on first page where they appear := %
1590 | \phantomsection\label{ncfl1}NCFL1
1591 |  \\
1592 | \hline
1593 | 
1594 | 40|80
1595 |  & 
1596 | 4|8
1597 |  & 
1598 | medium
1599 |  & 
1600 | int, number of \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader} on second page where they appear (or 0) := %
1601 | \phantomsection\label{ncfl2}NCFL2
1602 |  \\
1603 | \hline
1604 | 
1605 | 44|88
1606 |  & 
1607 | 8|16
1608 |  & 
1609 | low
1610 |  & 
1611 | \emph{????????????}
1612 |  \\
1613 | \hline
1614 | 
1615 | 52|104
1616 |  & 
1617 | 4|8
1618 |  & 
1619 | medium
1620 |  & 
1621 | int, page size, equals PL
1622 |  \\
1623 | \hline
1624 | 
1625 | 56|112
1626 |  & 
1627 | 4|8
1628 |  & 
1629 | low
1630 |  & 
1631 | \emph{????????????}
1632 |  \\
1633 | \hline
1634 | 
1635 | 60|120
1636 |  & 
1637 | 4|8
1638 |  & 
1639 | medium
1640 |  & 
1641 | int, max row count on \textquotedbl{}mix\textquotedbl{} page := %
1642 | \phantomsection\label{mrc}MRC
1643 |  \\
1644 | \hline
1645 | 
1646 | 64|128
1647 |  & 
1648 | 8|16
1649 |  & 
1650 | medium
1651 |  & 
1652 | sequence of 8|16 FF, end of initial header
1653 |  \\
1654 | \hline
1655 | 
1656 | 72|144
1657 |  & 
1658 | 148|296
1659 |  & 
1660 | medium
1661 |  & 
1662 | zeroes
1663 |  \\
1664 | \hline
1665 | 
1666 | 220|440
1667 |  & 
1668 | 4
1669 |  & 
1670 | low
1671 |  & 
1672 | int, page sequence signature (equals current page sequence signature)
1673 |  \\
1674 | \hline
1675 | 
1676 | 224|444
1677 |  & 
1678 | 40|68
1679 |  & 
1680 | low
1681 |  & 
1682 | zeroes
1683 |  \\
1684 | \hline
1685 | 
1686 | 264|512
1687 |  & 
1688 | 4|8
1689 |  & 
1690 | low
1691 |  & 
1692 | int, value 1 observed in 4 test files
1693 |  \\
1694 | \hline
1695 | 
1696 | 268|520
1697 |  & 
1698 | 2
1699 |  & 
1700 | low
1701 |  & 
1702 | int, value 2 observed
1703 |  \\
1704 | \hline
1705 | 
1706 | 270|522
1707 |  & 
1708 | 2|6
1709 |  & 
1710 | low
1711 |  & 
1712 | zeroes (pads length of 3 fields to 8|16)
1713 |  \\
1714 | \hline
1715 | 
1716 | 272|528
1717 |  & 
1718 | 4|8
1719 |  & 
1720 | medium
1721 |  & 
1722 | int, number of pages with subheader data := NPSHD
1723 |  \\
1724 | \hline
1725 | 
1726 | 276|536
1727 |  & 
1728 | 2
1729 |  & 
1730 | medium
1731 |  & 
1732 | int, number of subheaders with positive length on last page with subheader data := NSHPL
1733 |  \\
1734 | \hline
1735 | 
1736 | 278|538
1737 |  & 
1738 | 2|6
1739 |  & 
1740 | low
1741 |  & 
1742 | zeroes
1743 |  \\
1744 | \hline
1745 | 
1746 | 280|544
1747 |  & 
1748 | 4|8
1749 |  & 
1750 | low
1751 |  & 
1752 | int, values equal to NPSHD observed
1753 |  \\
1754 | \hline
1755 | 
1756 | 284|552
1757 |  & 
1758 | 2
1759 |  & 
1760 | low
1761 |  & 
1762 | int, values equal to NSHPL+2 observed
1763 |  \\
1764 | \hline
1765 | 
1766 | 286|554
1767 |  & 
1768 | 2|6
1769 |  & 
1770 | low
1771 |  & 
1772 | zeroes
1773 |  \\
1774 | \hline
1775 | 
1776 | 288|560
1777 |  & 
1778 | 4|8
1779 |  & 
1780 | medium
1781 |  & 
1782 | int, number of pages in file, equals PC
1783 |  \\
1784 | \hline
1785 | 
1786 | 292|568
1787 |  & 
1788 | 2
1789 |  & 
1790 | low
1791 |  & 
1792 | int, values 22,26,9,56 observed
1793 |  \\
1794 | \hline
1795 | 
1796 | 294|570
1797 |  & 
1798 | 2|6
1799 |  & 
1800 | low
1801 |  & 
1802 | zeroes
1803 |  \\
1804 | \hline
1805 | 
1806 | 296|576
1807 |  & 
1808 | 4|8
1809 |  & 
1810 | low
1811 |  & 
1812 | int, value 1 observed
1813 |  \\
1814 | \hline
1815 | 
1816 | 300|584
1817 |  & 
1818 | 2
1819 |  & 
1820 | low
1821 |  & 
1822 | int, values 7|8 observed
1823 |  \\
1824 | \hline
1825 | 
1826 | 302|586
1827 |  & 
1828 | 2|6
1829 |  & 
1830 | low
1831 |  & 
1832 | zeroes
1833 |  \\
1834 | \hline
1835 | 
1836 | 304|592
1837 |  & 
1838 | 40|80
1839 |  & 
1840 | low
1841 |  & 
1842 | zeroes
1843 |  \\
1844 | \hline
1845 | 
1846 | 344|672
1847 |  & 
1848 | 2
1849 |  & 
1850 | low
1851 |  & 
1852 | int, value 0
1853 |  \\
1854 | \hline
1855 | 
1856 | 346|674
1857 |  & 
1858 | 2
1859 |  & 
1860 | low
1861 |  & 
1862 | int, values 0|8
1863 |  \\
1864 | \hline
1865 | 
1866 | 348|676
1867 |  & 
1868 | 2
1869 |  & 
1870 | low
1871 |  & 
1872 | int, value 4
1873 |  \\
1874 | \hline
1875 | 
1876 | 350|678
1877 |  & 
1878 | 2
1879 |  & 
1880 | low
1881 |  & 
1882 | int, value 0
1883 |  \\
1884 | \hline
1885 | 
1886 | 352|680
1887 |  & 
1888 | 2
1889 |  & 
1890 | low
1891 |  & 
1892 | int, values 12,32|0
1893 |  \\
1894 | \hline
1895 | 
1896 | 354|682
1897 |  & 
1898 | 2
1899 |  & 
1900 | low
1901 |  & 
1902 | int, length of Creator Software string := LCS
1903 |  \\
1904 | \hline
1905 | 
1906 | 356|684
1907 |  & 
1908 | 2
1909 |  & 
1910 | low
1911 |  & 
1912 | int, value 0
1913 |  \\
1914 | \hline
1915 | 
1916 | 358|686
1917 |  & 
1918 | 2
1919 |  & 
1920 | low
1921 |  & 
1922 | int, value 20
1923 |  \\
1924 | \hline
1925 | 
1926 | 360|688
1927 |  & 
1928 | 2
1929 |  & 
1930 | low
1931 |  & 
1932 | int, value of 8 indicates MXNAM and MXLAB valid := IMAXN
1933 |  \\
1934 | \hline
1935 | 
1936 | 362|690
1937 |  & 
1938 | 8
1939 |  & 
1940 | low
1941 |  & 
1942 | zeroes
1943 |  \\
1944 | \hline
1945 | 
1946 | 370|698
1947 |  & 
1948 | 2
1949 |  & 
1950 | low
1951 |  & 
1952 | int, value 12
1953 |  \\
1954 | \hline
1955 | 
1956 | 372|700
1957 |  & 
1958 | 2
1959 |  & 
1960 | low
1961 |  & 
1962 | int, value 8
1963 |  \\
1964 | \hline
1965 | 
1966 | 374|702
1967 |  & 
1968 | 2
1969 |  & 
1970 | low
1971 |  & 
1972 | int, value 0
1973 |  \\
1974 | \hline
1975 | 
1976 | 376|704
1977 |  & 
1978 | 2
1979 |  & 
1980 | low
1981 |  & 
1982 | int, value 28
1983 |  \\
1984 | \hline
1985 | 
1986 | 378|706
1987 |  & 
1988 | 2
1989 |  & 
1990 | low
1991 |  & 
1992 | int, length of Creator PROC step name := LCP
1993 |  \\
1994 | \hline
1995 | 
1996 | 380|708
1997 |  & 
1998 | 36
1999 |  & 
2000 | low
2001 |  & 
2002 | zeroes
2003 |  \\
2004 | \hline
2005 | 
2006 | 416|744
2007 |  & 
2008 | 2
2009 |  & 
2010 | low
2011 |  & 
2012 | int, value 4
2013 |  \\
2014 | \hline
2015 | 
2016 | 418|746
2017 |  & 
2018 | 2
2019 |  & 
2020 | low
2021 |  & 
2022 | int, value 1
2023 |  \\
2024 | \hline
2025 | 
2026 | 420|748
2027 |  & 
2028 | 2
2029 |  & 
2030 | low
2031 |  & 
2032 | int, number of Column Text subheaders in file := %
2033 | \phantomsection\label{nct}NCT
2034 |  \\
2035 | \hline
2036 | 
2037 | 422|750
2038 |  & 
2039 | 2
2040 |  & 
2041 | low
2042 |  & 
2043 | int, max length of column names := MXNAM (see IMAXN)
2044 |  \\
2045 | \hline
2046 | 
2047 | 424|752
2048 |  & 
2049 | 2
2050 |  & 
2051 | low
2052 |  & 
2053 | int, max length of column labels := MXLAB (see IMAXN)
2054 |  \\
2055 | \hline
2056 | 
2057 | 426|754
2058 |  & 
2059 | 12
2060 |  & 
2061 | low
2062 |  & 
2063 | zeroes
2064 |  \\
2065 | \hline
2066 | 
2067 | 438|766
2068 |  & 
2069 | 2
2070 |  & 
2071 | medium
2072 |  & 
2073 | int, number of data rows on a full page INT{[}(PL - 24|40)/\hyperref[rl]{RL}{]}; 0 for compressed file
2074 |  \\
2075 | \hline
2076 | 
2077 | 440|768
2078 |  & 
2079 | 27
2080 |  & 
2081 | low
2082 |  & 
2083 | zeroes
2084 |  \\
2085 | \hline
2086 | 
2087 | 467|795
2088 |  & 
2089 | 1
2090 |  & 
2091 | low
2092 |  & 
2093 | int, bit field, values 1,5
2094 |  \\
2095 | \hline
2096 | 
2097 | 468|796
2098 |  & 
2099 | 12
2100 |  & 
2101 | low
2102 |  & 
2103 | zeroes
2104 |  \\
2105 | \hline
2106 | 
2107 | 480|808
2108 |  &  & 
2109 | medium
2110 |  & 
2111 | Total length of subheader, QL
2112 |  \\
2113 | \hline
2114 | \end{longtable*}
2115 | 
2116 | 
2117 | \subsection{Column Size Subheader%
2118 |   \label{column-size-subheader}%
2119 | }
2120 | 
2121 | The \hyperref[column-size-subheader]{column size subheader} holds the number of columns (variables).
2122 | 
2123 | \setlength{\DUtablewidth}{\linewidth}%
2124 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.540\DUtablewidth}|}
2125 | \hline
2126 | \textbf{%
2127 | offset
2128 | } & \textbf{%
2129 | length
2130 | } & \textbf{%
2131 | conf.
2132 | } & \textbf{%
2133 | description
2134 | } \\
2135 | \hline
2136 | \endfirsthead
2137 | \hline
2138 | \textbf{%
2139 | offset
2140 | } & \textbf{%
2141 | length
2142 | } & \textbf{%
2143 | conf.
2144 | } & \textbf{%
2145 | description
2146 | } \\
2147 | \hline
2148 | \endhead
2149 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2150 | \endfoot
2151 | \endlastfoot
2152 | 
2153 | 0
2154 |  & 
2155 | 4|8
2156 |  & 
2157 | high
2158 |  & 
2159 | binary, signature xF6F6F6F6|xF6F6F6F600000000
2160 |  \\
2161 | \hline
2162 | 
2163 | 4|8
2164 |  & 
2165 | 4|8
2166 |  & 
2167 | high
2168 |  & 
2169 | int, number of columns := NCOL
2170 |  \\
2171 | \hline
2172 | 
2173 | 8|16
2174 |  & 
2175 | 4|8
2176 |  & 
2177 | low
2178 |  & 
2179 | \emph{????????????}  usually zeroes
2180 |  \\
2181 | \hline
2182 | 
2183 | 12|24
2184 |  &  & 
2185 | medium
2186 |  & 
2187 | Total length of subheader, QL
2188 |  \\
2189 | \hline
2190 | \end{longtable*}
2191 | 
2192 | 
2193 | \subsection{Subheader Counts Subheader%
2194 |   \label{subheader-counts-subheader}%
2195 | }
2196 | 
2197 | This subheader contains information on the first and last appearances of at least 7 common subheader types. Any of these subheaders may appear once or more. Multiple instances of a subheader provide information for an exclusive subset of columns. The order in which data is read from multiple subheaders corresponds to the reading order (left to right) of columns. The structure of this subheader was deduced and reported by Clint Cummins.
2198 | 
2199 | \setlength{\DUtablewidth}{\linewidth}%
2200 | \begin{longtable*}{|p{0.108\DUtablewidth}|p{0.088\DUtablewidth}|p{0.077\DUtablewidth}|p{0.677\DUtablewidth}|}
2201 | \hline
2202 | \textbf{%
2203 | offset
2204 | } & \textbf{%
2205 | length
2206 | } & \textbf{%
2207 | conf.
2208 | } & \textbf{%
2209 | description
2210 | } \\
2211 | \hline
2212 | \endfirsthead
2213 | \hline
2214 | \textbf{%
2215 | offset
2216 | } & \textbf{%
2217 | length
2218 | } & \textbf{%
2219 | conf.
2220 | } & \textbf{%
2221 | description
2222 | } \\
2223 | \hline
2224 | \endhead
2225 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2226 | \endfoot
2227 | \endlastfoot
2228 | 
2229 | 0
2230 |  & 
2231 | 4|8
2232 |  & 
2233 | high
2234 |  & 
2235 | int, signature -1024 (x00FCFFFF|x00FCFFFFFFFFFFFF)
2236 |  \\
2237 | \hline
2238 | 
2239 | 4|8
2240 |  & 
2241 | 4|8
2242 |  & 
2243 | low
2244 |  & 
2245 | int, length or offset, usually >= 48
2246 |  \\
2247 | \hline
2248 | 
2249 | 8|16
2250 |  & 
2251 | 4|8
2252 |  & 
2253 | low
2254 |  & 
2255 | int, usually 4
2256 |  \\
2257 | \hline
2258 | 
2259 | 12|24
2260 |  & 
2261 | 2
2262 |  & 
2263 | low
2264 |  & 
2265 | int, usually 7 (number of nonzero SCVs?)
2266 |  \\
2267 | \hline
2268 | 
2269 | 14|26
2270 |  & 
2271 | 50|94
2272 |  & 
2273 | low
2274 |  & 
2275 | \emph{????????????}
2276 |  \\
2277 | \hline
2278 | 
2279 | 64|120
2280 |  & 
2281 | 12*LSCV
2282 |  & 
2283 | medium
2284 |  & 
2285 | 12 \hyperref[subheader-count-vectors]{subheader count vectors}, length := LSCV = 20|40 bytes each
2286 |  \\
2287 | \hline
2288 | 
2289 | 304|600
2290 |  &  & 
2291 | medium
2292 |  & 
2293 | Total length of subheader, QL
2294 |  \\
2295 | \hline
2296 | \end{longtable*}
2297 | 
2298 | 
2299 | \subsubsection{Subheader Count Vectors%
2300 |   \label{subheader-count-vectors}%
2301 | }
2302 | 
2303 | The subheader count vectors encode information for each of 4 common subheader types, and potentially 12 total subheader types.
2304 | 
2305 | \setlength{\DUtablewidth}{\linewidth}%
2306 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.633\DUtablewidth}|}
2307 | \hline
2308 | \textbf{%
2309 | offset
2310 | } & \textbf{%
2311 | length
2312 | } & \textbf{%
2313 | conf.
2314 | } & \textbf{%
2315 | description
2316 | } \\
2317 | \hline
2318 | \endfirsthead
2319 | \hline
2320 | \textbf{%
2321 | offset
2322 | } & \textbf{%
2323 | length
2324 | } & \textbf{%
2325 | conf.
2326 | } & \textbf{%
2327 | description
2328 | } \\
2329 | \hline
2330 | \endhead
2331 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2332 | \endfoot
2333 | \endlastfoot
2334 | 
2335 | 0
2336 |  & 
2337 | 4|8
2338 |  & 
2339 | high
2340 |  & 
2341 | int, signature (see list below)
2342 |  \\
2343 | \hline
2344 | 
2345 | 4|8
2346 |  & 
2347 | 4|8
2348 |  & 
2349 | medium
2350 |  & 
2351 | int, page where this subheader first appears := PAGE1
2352 |  \\
2353 | \hline
2354 | 
2355 | 8|16
2356 |  & 
2357 | 2
2358 |  & 
2359 | medium
2360 |  & 
2361 | int, position of subheader pointer in PAGE1 := LOC1
2362 |  \\
2363 | \hline
2364 | 
2365 | 10|18
2366 |  & 
2367 | 2|6
2368 |  & 
2369 | low
2370 |  & 
2371 | \emph{????????????}  zero padding
2372 |  \\
2373 | \hline
2374 | 
2375 | 12|24
2376 |  & 
2377 | 4|8
2378 |  & 
2379 | medium
2380 |  & 
2381 | int, page where this subheader last appears := PAGEL
2382 |  \\
2383 | \hline
2384 | 
2385 | 16|32
2386 |  & 
2387 | 2
2388 |  & 
2389 | medium
2390 |  & 
2391 | int, position of subheader pointer in PAGEL := LOCL
2392 |  \\
2393 | \hline
2394 | 
2395 | 18|34
2396 |  & 
2397 | 2|6
2398 |  & 
2399 | low
2400 |  & 
2401 | \emph{????????????}  zero padding
2402 |  \\
2403 | \hline
2404 | 
2405 | 20|40
2406 |  &  & 
2407 | medium
2408 |  & 
2409 | Total length of subheader count vector, LSCV
2410 |  \\
2411 | \hline
2412 | \end{longtable*}
2413 | 
2414 | The LOC1 and LOCL give the positions of the corresponding subheader pointer in PAGE1 and PAGEL, respectively. That is, if there are SC subheader pointers on page PAGE1, then the corresponding subheader pointer first occurs at the LOC1'th position in this array, enumerating from 1. If PAGE1=0, the subheader is not present. If PAGE1=PAGEL and LOC1=LOCL, the subheader appears exactly once. If PAGE1!=PAGEL or LOC1!=LOCL, the subheader appears 2 or more times. In all test files, PAGE1 <= PAGEL, and the corresponding subheaders appear only once per page.  The variable \hyperref[nct]{NCT} in the \hyperref[row-size-subheader]{Row Size Subheader} should be used to ensure that all Column Text subheaders are located (and to avoid scanning through all pages in the file when all subheaders are already located).
2415 | 
2416 | The first 7 binary signatures in the \hyperref[subheader-count-vectors]{Subheader Count Vectors} array are always:
2417 | 
2418 | \setlength{\DUtablewidth}{\linewidth}%
2419 | \begin{longtable*}{|p{0.121\DUtablewidth}|p{0.249\DUtablewidth}|}
2420 | \hline
2421 | \textbf{%
2422 | signature
2423 | } & \textbf{%
2424 | description
2425 | } \\
2426 | \hline
2427 | \endfirsthead
2428 | \hline
2429 | \textbf{%
2430 | signature
2431 | } & \textbf{%
2432 | description
2433 | } \\
2434 | \hline
2435 | \endhead
2436 | \multicolumn{2}{c}{\hfill ... continued on next page} \\
2437 | \endfoot
2438 | \endlastfoot
2439 | 
2440 | -4
2441 |  & 
2442 | Column Attributes
2443 |  \\
2444 | \hline
2445 | 
2446 | -3
2447 |  & 
2448 | Column Text
2449 |  \\
2450 | \hline
2451 | 
2452 | -1
2453 |  & 
2454 | Column Names
2455 |  \\
2456 | \hline
2457 | 
2458 | -2
2459 |  & 
2460 | Column List
2461 |  \\
2462 | \hline
2463 | 
2464 | -5
2465 |  & 
2466 | unknown signature \#1
2467 |  \\
2468 | \hline
2469 | 
2470 | -6
2471 |  & 
2472 | unknown signature \#2
2473 |  \\
2474 | \hline
2475 | 
2476 | -7
2477 |  & 
2478 | unknown signature \#3
2479 |  \\
2480 | \hline
2481 | \end{longtable*}
2482 | 
2483 | The remaining 5 out of 12 signatures are zeros in the observed source files. Presumably, these are for subheaders not yet defined, or not present in the collection of test files.
2484 | 
2485 | \hyperref[column-format-and-label-subheader]{Column Format and Label Subheaders} may appear on multiple pages, but are not indexed in the Subheader Counts. The variables NCFL1 and NCFL2 in the \hyperref[row-size-subheader]{Row Size subheader} may be helpful if you want to know in advance whether these subheaders appear across multiple pages.
2486 | 
2487 | 
2488 | \subsection{Column Text Subheader%
2489 |   \label{column-text-subheader}%
2490 | }
2491 | 
2492 | The column text subheader contains a block of text associated with columns, including the column names, labels, and formats. However, this subheader is not sufficient to parse this information. Other subheaders (e.g. the \hyperref[column-name-subheader]{column name subheader}), which point to specific elements within this subheader, are also needed.
2493 | 
2494 | \setlength{\DUtablewidth}{\linewidth}%
2495 | \begin{longtable*}{|p{0.092\DUtablewidth}|p{0.081\DUtablewidth}|p{0.081\DUtablewidth}|p{0.697\DUtablewidth}|}
2496 | \hline
2497 | \textbf{%
2498 | offset
2499 | } & \textbf{%
2500 | length
2501 | } & \textbf{%
2502 | conf.
2503 | } & \textbf{%
2504 | description
2505 | } \\
2506 | \hline
2507 | \endfirsthead
2508 | \hline
2509 | \textbf{%
2510 | offset
2511 | } & \textbf{%
2512 | length
2513 | } & \textbf{%
2514 | conf.
2515 | } & \textbf{%
2516 | description
2517 | } \\
2518 | \hline
2519 | \endhead
2520 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2521 | \endfoot
2522 | \endlastfoot
2523 | 
2524 | 0
2525 |  & 
2526 | 4|8
2527 |  & 
2528 | high
2529 |  & 
2530 | int, signature -3 (xFDFFFFFF|xFDFFFFFFFFFFFFFF)
2531 |  \\
2532 | \hline
2533 | 
2534 | 4|8
2535 |  & 
2536 | 2
2537 |  & 
2538 | medium
2539 |  & 
2540 | int, size of text block (QL - 16|20)
2541 |  \\
2542 | \hline
2543 | 
2544 | 6|10
2545 |  & 
2546 | 2
2547 |  & 
2548 | low
2549 |  & 
2550 | \emph{????????????}
2551 |  \\
2552 | \hline
2553 | 
2554 | 8|12
2555 |  & 
2556 | 2
2557 |  & 
2558 | low
2559 |  & 
2560 | \emph{????????????}
2561 |  \\
2562 | \hline
2563 | 
2564 | 10|14
2565 |  & 
2566 | 2
2567 |  & 
2568 | low
2569 |  & 
2570 | \emph{????????????}
2571 |  \\
2572 | \hline
2573 | 
2574 | 12|16
2575 |  & 
2576 | 2
2577 |  & 
2578 | low
2579 |  & 
2580 | \emph{????????????}
2581 |  \\
2582 | \hline
2583 | 
2584 | 14|18
2585 |  & 
2586 | 2
2587 |  & 
2588 | low
2589 |  & 
2590 | \emph{????????????}
2591 |  \\
2592 | \hline
2593 | 
2594 | 16|20
2595 |  & 
2596 | varies
2597 |  & 
2598 | medium
2599 |  & 
2600 | ascii, compression \& Creator PROC step name that generated data
2601 |  \\
2602 | \hline
2603 | 
2604 | varies
2605 |  & 
2606 | \%QL
2607 |  & 
2608 | high
2609 |  & 
2610 | ascii, combined column names, labels, formats
2611 |  \\
2612 | \hline
2613 | \end{longtable*}
2614 | 
2615 | This subheader sometimes appears more than once; each is a separate array. If so, the \textquotedbl{}column name index\textquotedbl{} field in \hyperref[column-name-pointers]{column name pointers} selects a particular text array - 0 for the first array, 1 for the second, etc. Similarly, \textquotedbl{}column format index\textquotedbl{} and \textquotedbl{}column label index\textquotedbl{} fields also select a text array. Offsets to strings within the text array are multiples of 4, so the column names and labels section of the array often contains many nulls for padding.
2616 | 
2617 | The variables LCS and LCP from the \hyperref[row-size-subheader]{Row Size subheader} refer to a text field at the start of the text array (at offset 16|20) in the first Column Text subheader (before the column name strings).  This text field also contains compression information.  The following logic decodes this initial field:
2618 | 
2619 | \begin{enumerate}
2620 | \item If the first 8 bytes of the field are blank, the file is not compressed; set LCS=0.  The Creator PROC step name is the LCP bytes starting at offset 16.
2621 | 
2622 | \item If LCS > 0 (still), the file is not compressed, and the first LCS bytes are the Creator Software string (padded with nulls).  Set LCP=0.  Stat/Transfer files use this pattern.
2623 | 
2624 | \item If the first 8 bytes of the field are \texttt{SASYZCRL}, the file is compressed with Run Length Encoding.  The Creator PROC step name is the LCP bytes starting at offset 24.
2625 | 
2626 | \item If the first 8 bytes are nonblank and neither case 2 nor case 3 above applies, this probably indicates COMPRESS=BINARY.  We need test files to confirm this, though.
2627 | \end{enumerate}
2628 | 
2629 | 
2630 | \subsection{Column Name Subheader%
2631 |   \label{column-name-subheader}%
2632 | }
2633 | 
2634 | Column name subheaders contain a sequence of \hyperref[column-name-pointers]{column name pointers} to the offset of each column name \textbf{relative to a} \hyperref[column-text-subheader]{column text subheader}. There may be multiple column name subheaders, indexing into multiple column text subheaders.
2635 | 
2636 | \setlength{\DUtablewidth}{\linewidth}%
2637 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.644\DUtablewidth}|}
2638 | \hline
2639 | \textbf{%
2640 | offset
2641 | } & \textbf{%
2642 | length
2643 | } & \textbf{%
2644 | conf.
2645 | } & \textbf{%
2646 | description
2647 | } \\
2648 | \hline
2649 | \endfirsthead
2650 | \hline
2651 | \textbf{%
2652 | offset
2653 | } & \textbf{%
2654 | length
2655 | } & \textbf{%
2656 | conf.
2657 | } & \textbf{%
2658 | description
2659 | } \\
2660 | \hline
2661 | \endhead
2662 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2663 | \endfoot
2664 | \endlastfoot
2665 | 
2666 | 0
2667 |  & 
2668 | 4|8
2669 |  & 
2670 | high
2671 |  & 
2672 | int, signature -1 (xFFFFFFFF|xFFFFFFFFFFFFFFFF)
2673 |  \\
2674 | \hline
2675 | 
2676 | 4|8
2677 |  & 
2678 | 2
2679 |  & 
2680 | medium
2681 |  & 
2682 | int, length of remaining subheader (QL - 16|20)
2683 |  \\
2684 | \hline
2685 | 
2686 | 6|10
2687 |  & 
2688 | 2
2689 |  & 
2690 | low
2691 |  & 
2692 | \emph{????????????}
2693 |  \\
2694 | \hline
2695 | 
2696 | 8|12
2697 |  & 
2698 | 2
2699 |  & 
2700 | low
2701 |  & 
2702 | \emph{????????????}
2703 |  \\
2704 | \hline
2705 | 
2706 | 10|14
2707 |  & 
2708 | 2
2709 |  & 
2710 | low
2711 |  & 
2712 | \emph{????????????}
2713 |  \\
2714 | \hline
2715 | 
2716 | 12|16
2717 |  & 
2718 | 8*CMAX
2719 |  & 
2720 | medium
2721 |  & 
2722 | \hyperref[column-name-pointers]{column name pointers} (see below), CMAX=(QL-20|28)/8
2723 |  \\
2724 | \hline
2725 | 
2726 | MCN
2727 |  & 
2728 | 8|12
2729 |  & 
2730 | low
2731 |  & 
2732 | zeros, 12|16 + 8*CMAX := MCN
2733 |  \\
2734 | \hline
2735 | \end{longtable*}
2736 | 
2737 | Each column name subheader holds CMAX column name pointers. When there are multiple column name subheaders, CMAX will be less than NCOL.
2738 | 
2739 | 
2740 | \subsubsection{Column Name Pointers%
2741 |   \label{column-name-pointers}%
2742 | }
2743 | 
2744 | \setlength{\DUtablewidth}{\linewidth}%
2745 | \begin{longtable*}{|p{0.061\DUtablewidth}|p{0.061\DUtablewidth}|p{0.061\DUtablewidth}|p{0.767\DUtablewidth}|}
2746 | \hline
2747 | \textbf{%
2748 | offset
2749 | } & \textbf{%
2750 | length
2751 | } & \textbf{%
2752 | conf.
2753 | } & \textbf{%
2754 | description
2755 | } \\
2756 | \hline
2757 | \endfirsthead
2758 | \hline
2759 | \textbf{%
2760 | offset
2761 | } & \textbf{%
2762 | length
2763 | } & \textbf{%
2764 | conf.
2765 | } & \textbf{%
2766 | description
2767 | } \\
2768 | \hline
2769 | \endhead
2770 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2771 | \endfoot
2772 | \endlastfoot
2773 | 
2774 | 0
2775 |  & 
2776 | 2
2777 |  & 
2778 | high
2779 |  & 
2780 | int, column name index to select \hyperref[column-text-subheader]{Column Text Subheader}
2781 |  \\
2782 | \hline
2783 | 
2784 | 2
2785 |  & 
2786 | 2
2787 |  & 
2788 | high
2789 |  & 
2790 | int, column name offset w.r.t. end of selected Column Text signature.  Always a multiple of 4.
2791 |  \\
2792 | \hline
2793 | 
2794 | 4
2795 |  & 
2796 | 2
2797 |  & 
2798 | high
2799 |  & 
2800 | int, column name length
2801 |  \\
2802 | \hline
2803 | 
2804 | 6
2805 |  & 
2806 | 2
2807 |  & 
2808 | low
2809 |  & 
2810 | zeros
2811 |  \\
2812 | \hline
2813 | 
2814 | 8
2815 |  &  & 
2816 | high
2817 |  & 
2818 | Total length of column name pointer
2819 |  \\
2820 | \hline
2821 | \end{longtable*}
2822 | 
2823 | 
2824 | \subsection{Column Attributes Subheader%
2825 |   \label{column-attributes-subheader}%
2826 | }
2827 | 
2828 | The column attribute subheader holds information regarding the column offsets within a data row, the column widths, and the column types (either numeric or character). The column attribute subheader sometimes occurs more than once (in test data). In these cases, column attributes are applied in the order they are parsed.
2829 | 
2830 | \setlength{\DUtablewidth}{\linewidth}%
2831 | \begin{longtable*}{|p{0.080\DUtablewidth}|p{0.099\DUtablewidth}|p{0.071\DUtablewidth}|p{0.700\DUtablewidth}|}
2832 | \hline
2833 | \textbf{%
2834 | offset
2835 | } & \textbf{%
2836 | length
2837 | } & \textbf{%
2838 | conf.
2839 | } & \textbf{%
2840 | description
2841 | } \\
2842 | \hline
2843 | \endfirsthead
2844 | \hline
2845 | \textbf{%
2846 | offset
2847 | } & \textbf{%
2848 | length
2849 | } & \textbf{%
2850 | conf.
2851 | } & \textbf{%
2852 | description
2853 | } \\
2854 | \hline
2855 | \endhead
2856 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2857 | \endfoot
2858 | \endlastfoot
2859 | 
2860 | 0
2861 |  & 
2862 | 4|8
2863 |  & 
2864 | high
2865 |  & 
2866 | int, signature -4 (hex xFCFFFFFF|xFCFFFFFFFFFFFFFF)
2867 |  \\
2868 | \hline
2869 | 
2870 | 4|8
2871 |  & 
2872 | 2
2873 |  & 
2874 | medium
2875 |  & 
2876 | int, length of remaining subheader
2877 |  \\
2878 | \hline
2879 | 
2880 | 6|10
2881 |  & 
2882 | 2
2883 |  & 
2884 | low
2885 |  & 
2886 | \emph{????????????}
2887 |  \\
2888 | \hline
2889 | 
2890 | 8|12
2891 |  & 
2892 | 2
2893 |  & 
2894 | low
2895 |  & 
2896 | \emph{????????????}
2897 |  \\
2898 | \hline
2899 | 
2900 | 10|14
2901 |  & 
2902 | 2
2903 |  & 
2904 | low
2905 |  & 
2906 | \emph{????????????}
2907 |  \\
2908 | \hline
2909 | 
2910 | 12|16
2911 |  & 
2912 | LCAV*CMAX
2913 |  & 
2914 | high
2915 |  & 
2916 | \hyperref[column-attribute-vectors]{column attribute vectors} (see below), CMAX=(QL-20|28)/LCAV, LCAV=12|16
2917 |  \\
2918 | \hline
2919 | 
2920 | MCA
2921 |  & 
2922 | 8|12
2923 |  & 
2924 | low
2925 |  & 
2926 | MCA = 12|16 + LCAV*CMAX
2927 |  \\
2928 | \hline
2929 | \end{longtable*}
2930 | 
2931 | 
2932 | \subsubsection{Column Attribute Vectors%
2933 |   \label{column-attribute-vectors}%
2934 | }
2935 | 
2936 | \setlength{\DUtablewidth}{\linewidth}%
2937 | \begin{longtable*}{|p{0.179\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.563\DUtablewidth}|}
2938 | \hline
2939 | \textbf{%
2940 | offset
2941 | } & \textbf{%
2942 | length
2943 | } & \textbf{%
2944 | conf.
2945 | } & \textbf{%
2946 | description
2947 | } \\
2948 | \hline
2949 | \endfirsthead
2950 | \hline
2951 | \textbf{%
2952 | offset
2953 | } & \textbf{%
2954 | length
2955 | } & \textbf{%
2956 | conf.
2957 | } & \textbf{%
2958 | description
2959 | } \\
2960 | \hline
2961 | \endhead
2962 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
2963 | \endfoot
2964 | \endlastfoot
2965 | 
2966 | 0
2967 |  & 
2968 | 4|8
2969 |  & 
2970 | high
2971 |  & 
2972 | int, column offset in data row (in bytes)
2973 |  \\
2974 | \hline
2975 | 
2976 | 4|8
2977 |  & 
2978 | 4
2979 |  & 
2980 | high
2981 |  & 
2982 | int, column width
2983 |  \\
2984 | \hline
2985 | 
2986 | 8|12
2987 |  & 
2988 | 2
2989 |  & 
2990 | low
2991 |  & 
2992 | name length flag
2993 |  \\
2994 | \hline
2995 | 
2996 | 10|14
2997 |  & 
2998 | 1
2999 |  & 
3000 | high
3001 |  & 
3002 | int, column type (1 = numeric, 2 = character)
3003 |  \\
3004 | \hline
3005 | 
3006 | 11|15
3007 |  & 
3008 | 1
3009 |  & 
3010 | low
3011 |  & 
3012 | \emph{????????????}
3013 |  \\
3014 | \hline
3015 | 
3016 | 12|16
3017 |  &  & 
3018 | high
3019 |  & 
3020 | Total length of column attribute vector, LCAV
3021 |  \\
3022 | \hline
3023 | \end{longtable*}
3024 | 
3025 | Observed values of name length flag in the source files:
3026 | 
3027 | \setlength{\DUtablewidth}{\linewidth}%
3028 | \begin{longtable*}{|p{0.183\DUtablewidth}|p{0.757\DUtablewidth}|}
3029 | \hline
3030 | \textbf{%
3031 | name length flag
3032 | } & \textbf{%
3033 | description
3034 | } \\
3035 | \hline
3036 | \endfirsthead
3037 | \hline
3038 | \textbf{%
3039 | name length flag
3040 | } & \textbf{%
3041 | description
3042 | } \\
3043 | \hline
3044 | \endhead
3045 | \multicolumn{2}{c}{\hfill ... continued on next page} \\
3046 | \endfoot
3047 | \endlastfoot
3048 | 
3049 | 4
3050 |  & 
3051 | name length <= 8
3052 |  \\
3053 | \hline
3054 | 
3055 | 1024
3056 |  & 
3057 | usually means name length <= 8 , but sometimes the length is 9-12
3058 |  \\
3059 | \hline
3060 | 
3061 | 2048
3062 |  & 
3063 | name length > 8
3064 |  \\
3065 | \hline
3066 | 
3067 | 2560
3068 |  & 
3069 | name length > 8
3070 |  \\
3071 | \hline
3072 | \end{longtable*}
3073 | 
3074 | 
3075 | \subsection{Column Format and Label Subheader%
3076 |   \label{column-format-and-label-subheader}%
3077 | }
3078 | 
3079 | The column format and label subheader contains pointers to a column format and label \textbf{relative to a} \hyperref[column-text-subheader]{column text subheader}. Since the column format and label subheader only contains information regarding a single column, there are typically as many of these subheaders as columns. The structure of column format pointers was contributed by Clint Cummins.
3080 | 
3081 | \setlength{\DUtablewidth}{\linewidth}%
3082 | \begin{longtable*}{|p{0.071\DUtablewidth}|p{0.071\DUtablewidth}|p{0.063\DUtablewidth}|p{0.746\DUtablewidth}|}
3083 | \hline
3084 | \textbf{%
3085 | offset
3086 | } & \textbf{%
3087 | length
3088 | } & \textbf{%
3089 | conf.
3090 | } & \textbf{%
3091 | description
3092 | } \\
3093 | \hline
3094 | \endfirsthead
3095 | \hline
3096 | \textbf{%
3097 | offset
3098 | } & \textbf{%
3099 | length
3100 | } & \textbf{%
3101 | conf.
3102 | } & \textbf{%
3103 | description
3104 | } \\
3105 | \hline
3106 | \endhead
3107 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
3108 | \endfoot
3109 | \endlastfoot
3110 | 
3111 | 0
3112 |  & 
3113 | 4|8
3114 |  & 
3115 | high
3116 |  & 
3117 | int, signature -1026 (hex FEFB \& 2 or 6 FFs)
3118 |  \\
3119 | \hline
3120 | 
3121 | 4|8
3122 |  & 
3123 | 30|38
3124 |  & 
3125 | low
3126 |  & 
3127 | \emph{????????????}
3128 |  \\
3129 | \hline
3130 | 
3131 | 34|46
3132 |  & 
3133 | 2
3134 |  & 
3135 | high
3136 |  & 
3137 | int, column format index to select \hyperref[column-text-subheader]{Column Text Subheader}
3138 |  \\
3139 | \hline
3140 | 
3141 | 36|48
3142 |  & 
3143 | 2
3144 |  & 
3145 | high
3146 |  & 
3147 | int, column format offset w.r.t. end of selected Column Text signature.  A multiple of 4.
3148 |  \\
3149 | \hline
3150 | 
3151 | 38|50
3152 |  & 
3153 | 2
3154 |  & 
3155 | high
3156 |  & 
3157 | int, column format length
3158 |  \\
3159 | \hline
3160 | 
3161 | 40|52
3162 |  & 
3163 | 2
3164 |  & 
3165 | high
3166 |  & 
3167 | int, column label index to select \hyperref[column-text-subheader]{Column Text Subheader}
3168 |  \\
3169 | \hline
3170 | 
3171 | 42|54
3172 |  & 
3173 | 2
3174 |  & 
3175 | high
3176 |  & 
3177 | int, column label offset w.r.t. end of selected Column Text signature.  A multiple of 4.
3178 |  \\
3179 | \hline
3180 | 
3181 | 44|56
3182 |  & 
3183 | 2
3184 |  & 
3185 | high
3186 |  & 
3187 | int, column label length
3188 |  \\
3189 | \hline
3190 | 
3191 | 46|58
3192 |  & 
3193 | 6
3194 |  & 
3195 | low
3196 |  & 
3197 | \emph{????????????}
3198 |  \\
3199 | \hline
3200 | 
3201 | 52|64
3202 |  &  & 
3203 | medium
3204 |  & 
3205 | Total length of subheader, QL
3206 |  \\
3207 | \hline
3208 | \end{longtable*}
3209 | 
3210 | 
3211 | \subsection{Column List Subheader%
3212 |   \label{column-list-subheader}%
3213 | }
3214 | 
3215 | The purpose of this subheader is not clear. But the structure is partly identified. Information related to this subheader was contributed by Clint Cummins.  eyecarex (created by Stat/Transfer) does not have this subheader.
3216 | 
3217 | \setlength{\DUtablewidth}{\linewidth}%
3218 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.610\DUtablewidth}|}
3219 | \hline
3220 | \textbf{%
3221 | offset
3222 | } & \textbf{%
3223 | length
3224 | } & \textbf{%
3225 | conf.
3226 | } & \textbf{%
3227 | description
3228 | } \\
3229 | \hline
3230 | \endfirsthead
3231 | \hline
3232 | \textbf{%
3233 | offset
3234 | } & \textbf{%
3235 | length
3236 | } & \textbf{%
3237 | conf.
3238 | } & \textbf{%
3239 | description
3240 | } \\
3241 | \hline
3242 | \endhead
3243 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
3244 | \endfoot
3245 | \endlastfoot
3246 | 
3247 | 0
3248 |  & 
3249 | 4|8
3250 |  & 
3251 | high
3252 |  & 
3253 | int, signature -2 (hex FE \& 3 or 7 FFs)
3254 |  \\
3255 | \hline
3256 | 
3257 | 4|8
3258 |  & 
3259 | 2
3260 |  & 
3261 | low
3262 |  & 
3263 | int, value close to offset in subheader pointer
3264 |  \\
3265 | \hline
3266 | 
3267 | 6|10
3268 |  & 
3269 | 6
3270 |  & 
3271 | low
3272 |  & 
3273 | \emph{????????????}
3274 |  \\
3275 | \hline
3276 | 
3277 | 12|16
3278 |  & 
3279 | 4|8
3280 |  & 
3281 | medium
3282 |  & 
3283 | int, length of remaining subheader
3284 |  \\
3285 | \hline
3286 | 
3287 | 16|24
3288 |  & 
3289 | 2
3290 |  & 
3291 | low
3292 |  & 
3293 | int, usually equals NCOL
3294 |  \\
3295 | \hline
3296 | 
3297 | 18|26
3298 |  & 
3299 | 2
3300 |  & 
3301 | medium
3302 |  & 
3303 | int, length of column list := CL, usually CL > NCOL
3304 |  \\
3305 | \hline
3306 | 
3307 | 20|28
3308 |  & 
3309 | 2
3310 |  & 
3311 | low
3312 |  & 
3313 | int, usually 1
3314 |  \\
3315 | \hline
3316 | 
3317 | 22|30
3318 |  & 
3319 | 2
3320 |  & 
3321 | low
3322 |  & 
3323 | int, usually equals NCOL
3324 |  \\
3325 | \hline
3326 | 
3327 | 24|32
3328 |  & 
3329 | 2
3330 |  & 
3331 | low
3332 |  & 
3333 | int, usually 3 equal values
3334 |  \\
3335 | \hline
3336 | 
3337 | 26|34
3338 |  & 
3339 | 2
3340 |  & 
3341 | low
3342 |  & 
3343 | int, usually 3 equal values
3344 |  \\
3345 | \hline
3346 | 
3347 | 28|36
3348 |  & 
3349 | 2
3350 |  & 
3351 | low
3352 |  & 
3353 | int, usually 3 equal values
3354 |  \\
3355 | \hline
3356 | 
3357 | 30|38
3358 |  & 
3359 | 2*CL
3360 |  & 
3361 | medium
3362 |  & 
3363 | \hyperref[column-list-values]{column list values} (see below)
3364 |  \\
3365 | \hline
3366 | 
3367 | MCL
3368 |  & 
3369 | 8
3370 |  & 
3371 | low
3372 |  & 
3373 | usually zeros, 30|38 + 2*CL := MCL
3374 |  \\
3375 | \hline
3376 | \end{longtable*}
3377 | 
3378 | 
3379 | \subsubsection{Column List Values%
3380 |   \label{column-list-values}%
3381 | }
3382 | 
3383 | These values are 2 byte integers, with (CL-NCOL) zero values. Each nonzero value is unique, between -NCOL and NCOL. The significance of signedness and ordering is unknown. The values do not correspond to a sorting order of columns.
3384 | 
3385 | 
3386 | \subsection{Compressed Binary Data Subheader%
3387 |   \label{compressed-binary-data-subheader}%
3388 | }
3389 | 
3390 | When a SAS7BDAT file is created by SAS with the option COMPRESS=CHAR or COMPRESS=YES, each row of data is compressed independently with a Run Length Encoding (RLE) structure.  This yields a variable length compressed row.  Each such row is stored in a single subheader in sequential order, indexed by the \hyperref[subheader-pointers]{subheader pointers}.  An RLE compressed data row is identified by COMP=4 in the subheader pointer, and does not have a subheader signature.  If a particular row had highly variable data and yielded no compression, it is still stored in a subheader, but uncompressed with COMP=0 instead of COMP=4.  The test file \texttt{compress\_yes.sas7bdat} has such highly variable (random) data and all its rows are in this COMP=0 form of subheaders.  It takes up more space than the uncompressed version \texttt{compress\_no.sas7bdat}, due to the extra length of the subheader pointers.  The final subheader on a page is usually COMP=1, which indicates a truncated row to be ignored; the complete data row appears on the next page.
3391 | 
3392 | The SAS option COMPRESS=BINARY apparently uses an RDC (Ross Data Compression) structure instead of RLE.  We need more test files to investigate this structure, and only document RLE at present.
3393 | 
3394 | 
3395 | \subsubsection{Run Length Encoding%
3396 |   \label{run-length-encoding}%
3397 | }
3398 | 
3399 | In RLE, the compressed row data is a series of control bytes, each optionally followed by data bytes.  The control byte specifies how the data bytes are interpreted, or is self-contained.  The control byte has 2 parts - the upper 4 bits are the Command, and the lower 4 bits are the Length.  Each is a uint in the range 0-15.  For example, control byte 82 (hex) is Command 8 and Length 2, and control byte F4 (hex) is Command 15 (F hex) and Length 4.  We have identified the functions of the 11 different Command values which are observed in the test files.  The RLE structure was contributed by Clint Cummins.
3400 | 
3401 | \setlength{\DUtablewidth}{\linewidth}%
3402 | \begin{longtable*}{|p{0.052\DUtablewidth}|p{0.046\DUtablewidth}|p{0.087\DUtablewidth}|p{0.765\DUtablewidth}|}
3403 | \hline
3404 | \textbf{%
3405 | Command
3406 | } & \textbf{%
3407 | Length
3408 | } & \textbf{%
3409 | Name
3410 | } & \textbf{%
3411 | Function
3412 | } \\
3413 | \hline
3414 | \endfirsthead
3415 | \hline
3416 | \textbf{%
3417 | Command
3418 | } & \textbf{%
3419 | Length
3420 | } & \textbf{%
3421 | Name
3422 | } & \textbf{%
3423 | Function
3424 | } \\
3425 | \hline
3426 | \endhead
3427 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
3428 | \endfoot
3429 | \endlastfoot
3430 | 
3431 | 0
3432 |  & 
3433 | 0
3434 |  & 
3435 | Copy64
3436 |  & 
3437 | using the first byte as a uint length L (0-255), Copy the next N=64+L bytes from the input to the output (copies 64 to 319 bytes)
3438 |  \\
3439 | \hline
3440 | 
3441 | 1
3442 |  & 
3443 | ?
3444 |  & 
3445 | ?
3446 |  & 
3447 | \emph{????????????}  (not observed in test files)
3448 |  \\
3449 | \hline
3450 | 
3451 | 2
3452 |  & 
3453 | ?
3454 |  & 
3455 | ?
3456 |  & 
3457 | \emph{????????????}  (not observed in test files)
3458 |  \\
3459 | \hline
3460 | 
3461 | 3
3462 |  & 
3463 | ?
3464 |  & 
3465 | ?
3466 |  & 
3467 | \emph{????????????}  (not observed in test files)
3468 |  \\
3469 | \hline
3470 | 
3471 | 4
3472 |  & 
3473 | ?
3474 |  & 
3475 | ?
3476 |  & 
3477 | \emph{????????????}  (not observed in test files)
3478 |  \\
3479 | \hline
3480 | 
3481 | 5
3482 |  & 
3483 | ?
3484 |  & 
3485 | ?
3486 |  & 
3487 | \emph{????????????}  (not observed in test files)
3488 |  \\
3489 | \hline
3490 | 
3491 | 6
3492 |  & 
3493 | 0
3494 |  & 
3495 | InsertBlank17
3496 |  & 
3497 | using the first byte as a uint length L, Insert N=17+L blanks (decimal 32, hex 20) in the output (inserts 17 to 272 blanks)
3498 |  \\
3499 | \hline
3500 | 
3501 | 7
3502 |  & 
3503 | 0
3504 |  & 
3505 | InsertZero17
3506 |  & 
3507 | using the first byte as a uint length L, Insert N=17+L zero bytes in the output
3508 |  \\
3509 | \hline
3510 | 
3511 | 8
3512 |  & 
3513 | L
3514 |  & 
3515 | Copy1
3516 |  & 
3517 | using the Length bits as a uint length L (0-15), Copy the next N=1+L bytes from the input to the output (copies 1 to 16 bytes)
3518 |  \\
3519 | \hline
3520 | 
3521 | 9
3522 |  & 
3523 | L
3524 |  & 
3525 | Copy17
3526 |  & 
3527 | Copy the next N=17+L bytes from the input to the output (copies 17 to 32 bytes)
3528 |  \\
3529 | \hline
3530 | 
3531 | 10 (A)
3532 |  & 
3533 | L
3534 |  & 
3535 | Copy33
3536 |  & 
3537 | Copy the next N=33+L bytes from the input to the output (copies 33 to 48 bytes)
3538 |  \\
3539 | \hline
3540 | 
3541 | 11 (B)
3542 |  & 
3543 | L
3544 |  & 
3545 | Copy49
3546 |  & 
3547 | Copy the next N=49+L bytes from the input to the output (copies 49 to 64 bytes)
3548 |  \\
3549 | \hline
3550 | 
3551 | 12 (C)
3552 |  & 
3553 | L
3554 |  & 
3555 | InsertByte3
3556 |  & 
3557 | Insert N=3+L copies of the next byte in the output (inserts 3 to 18 bytes)
3558 |  \\
3559 | \hline
3560 | 
3561 | 13 (D)
3562 |  & 
3563 | L
3564 |  & 
3565 | Insert@2
3566 |  & 
3567 | Insert N=2+L @ (decimal 64, hex 40) bytes in the output (inserts 2 to 17 @ bytes)
3568 |  \\
3569 | \hline
3570 | 
3571 | 14 (E)
3572 |  & 
3573 | L
3574 |  & 
3575 | InsertBlank2
3576 |  & 
3577 | Insert N=2+L blanks in the output
3578 |  \\
3579 | \hline
3580 | 
3581 | 15 (F)
3582 |  & 
3583 | L
3584 |  & 
3585 | InsertZero2
3586 |  & 
3587 | Insert N=2+L zero bytes in the output
3588 |  \\
3589 | \hline
3590 | \end{longtable*}
3591 | 
3592 | The most common Commands in \texttt{obs\_all\_perf\_1.sas7bdat} are F and 8 (alternating).  This file is entirely 8 byte doubles, so the F commands often handle consecutive zero bytes in zero value doubles.
3593 | 
3594 | 
3595 | \subsubsection{RLE Example 1%
3596 |   \label{rle-example-1}%
3597 | }
3598 | 
3599 | Compressed data row:
3600 | 
3601 | \texttt{87 A B C D E F G H F2 8A 1 2 3 4 5 6 7 8 9 A B D0 A1 a b c d e f g ... z}
3602 | 
3603 | \texttt{CB -8-data-bytes-{}- CB CB -{}-11-data-bytes-{}-{}-{}-{}-{}- CB CB -{}-34-data-bytes-{}-}
3604 | 
3605 | \texttt{Copy1 ~ ~ ~ ~ ~ ~ ~InsertZero2 ~ ~ ~ ~ ~ ~ ~ ~ Ins Copy33 next 34 bytes}
3606 | 
3607 | \texttt{Next 8 bytes ~ ~ ~ 4 00h bytes ~ ~ ~ ~ ~ ~ ~ ~ 2 40h}
3608 | 
3609 | There are 5 Control Bytes (CB) in the above sequence.
3610 | 
3611 | \begin{enumerate}
3612 | \item 87:  Copy1 next 8 bytes
3613 | 
3614 | \item F2:  InsertZero2 4 00h bytes
3615 | 
3616 | \item 8A:  Copy1 next 11 bytes
3617 | 
3618 | \item D0:  Insert@2 2 40h bytes
3619 | 
3620 | \item A1:  Copy33 next 34 bytes
3621 | \end{enumerate}
3622 | 
3623 | Output uncompressed row:
3624 | 
3625 | \texttt{A B C D E F G H 00 00 00 00 1 2 3 4 5 6 7 8 9 A B 40 40 a b c ... z}
3626 | 
3627 | 
3628 | \subsubsection{RLE Example 2%
3629 |   \label{rle-example-2}%
3630 | }
3631 | 
3632 | Compressed data row:
3633 | 
3634 | \texttt{87 A B C D E F G H C1 99 A5 a b c ... z}
3635 | 
3636 | \texttt{CB -8-data-bytes-{}- CB ar CB -last-bytes}
3637 | 
3638 | \texttt{Copy1 8 ~ ~ ~ ~ ~ ~InsBy Copy33 38 bytes}
3639 | 
3640 | Control Bytes in Example 2:
3641 | 
3642 | \begin{enumerate}
3643 | \item 87:  Copy1 next 8 bytes
3644 | 
3645 | \item C1,99:  InsertByte3 4 99h bytes
3646 | 
3647 | \item A5:  Copy33 next 38 bytes
3648 | \end{enumerate}
3649 | 
3650 | Output uncompressed row:
3651 | 
3652 | \texttt{A B C D E F G H 99 99 99 99 a b c ... z}
3653 | 
3654 | Once a data row is uncompressed, use the \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data} description below to read the variables.
3655 | 
3656 | 
3657 | \section{SAS7BDAT Packed Binary Data%
3658 |   \label{sas7bdat-packed-binary-data}%
3659 | }
3660 | 
3661 | SAS7BDAT packed binary data are uncompressed, and appear after any subheaders on the page; see the \hyperref[page-offset-table]{Page Offset Table}.  These data are stored by rows, where the size of a row (in bytes) is defined by the \hyperref[row-size-subheader]{row size subheader}. When multiple rows occur on a single page, they are immediately adjacent. When a database contains many rows, it is typical that the collection of rows (i.e. their data) is evenly distributed to a number of 'data' pages. However, in test files, no single row's data is broken across two or more pages. A single data row is parsed by interpreting the binary data according to the collection of column attributes contained in the \hyperref[column-attributes-subheader]{column attributes subheader}. Binary data can be interpreted in two ways, as ASCII characters, or as floating point numbers. The column width attribute specifies the number of bytes associated with a column. For character data, this interpretation is straight-forward. For numeric data, interpretation of the column width is more complex.
3662 | 
3663 | The common binary representation of floating point numbers has three parts: the sign (\texttt{s}), exponent (\texttt{e}), and mantissa (\texttt{m}). The corresponding floating point number is \texttt{s * m * b \textasciicircum{} e}, where \texttt{b} is the base (2 for binary, 10 for decimal). Under the IEEE 754 floating point standard, the sign, exponent, and mantissa are encoded by 1, 11, and 52 bits respectively, totaling 8 bytes. In a SAS7BDAT file, numeric quantities can be 3, 4, 5, 6, 7, or 8 bytes in length. For numeric quantities of less than 8 bytes, the remaining number of bytes are truncated from the least significant part of the mantissa. Hence, the minimum and maximum numeric values are identical for all byte lengths, but shorter numeric values have reduced precision.
3664 | 
3665 | Reduction in precision is characterized by the largest integer such that itself and all smaller integers have an exact representation, denoted \texttt{M}. At best, all integers greater than \texttt{M} are approximated to the nearest multiple of \texttt{b}. The table of \hyperref[numeric-binary-formats]{numeric binary formats} below lists \texttt{M} values and describes how bits are distributed among the six possible column widths in SAS7BDAT files.
3666 | 
3667 | 
3668 | \subsection{Numeric Binary Formats%
3669 |   \label{numeric-binary-formats}%
3670 | }
3671 | 
3672 | \setlength{\DUtablewidth}{\linewidth}%
3673 | \begin{longtable*}{|p{0.075\DUtablewidth}|p{0.075\DUtablewidth}|p{0.063\DUtablewidth}|p{0.110\DUtablewidth}|p{0.110\DUtablewidth}|p{0.203\DUtablewidth}|}
3674 | \hline
3675 | \textbf{%
3676 | size
3677 | } & \textbf{%
3678 | bytes
3679 | } & \textbf{%
3680 | sign
3681 | } & \textbf{%
3682 | exponent
3683 | } & \textbf{%
3684 | mantissa
3685 | } & \textbf{%
3686 | \texttt{M}
3687 | } \\
3688 | \hline
3689 | \endfirsthead
3690 | \hline
3691 | \textbf{%
3692 | size
3693 | } & \textbf{%
3694 | bytes
3695 | } & \textbf{%
3696 | sign
3697 | } & \textbf{%
3698 | exponent
3699 | } & \textbf{%
3700 | mantissa
3701 | } & \textbf{%
3702 | \texttt{M}
3703 | } \\
3704 | \hline
3705 | \endhead
3706 | \multicolumn{6}{c}{\hfill ... continued on next page} \\
3707 | \endfoot
3708 | \endlastfoot
3709 | 
3710 | 24bit
3711 |  & 
3712 | 3
3713 |  & 
3714 | 1
3715 |  & 
3716 | 11
3717 |  & 
3718 | 12
3719 |  & 
3720 | 8192
3721 |  \\
3722 | \hline
3723 | 
3724 | 32bit
3725 |  & 
3726 | 4
3727 |  & 
3728 | 1
3729 |  & 
3730 | 11
3731 |  & 
3732 | 20
3733 |  & 
3734 | 2097152
3735 |  \\
3736 | \hline
3737 | 
3738 | 40bit
3739 |  & 
3740 | 5
3741 |  & 
3742 | 1
3743 |  & 
3744 | 11
3745 |  & 
3746 | 28
3747 |  & 
3748 | 536870912
3749 |  \\
3750 | \hline
3751 | 
3752 | 48bit
3753 |  & 
3754 | 6
3755 |  & 
3756 | 1
3757 |  & 
3758 | 11
3759 |  & 
3760 | 36
3761 |  & 
3762 | 137438953472
3763 |  \\
3764 | \hline
3765 | 
3766 | 56bit
3767 |  & 
3768 | 7
3769 |  & 
3770 | 1
3771 |  & 
3772 | 11
3773 |  & 
3774 | 44
3775 |  & 
3776 | 35184372088832
3777 |  \\
3778 | \hline
3779 | 
3780 | 64bit
3781 |  & 
3782 | 8
3783 |  & 
3784 | 1
3785 |  & 
3786 | 11
3787 |  & 
3788 | 52
3789 |  & 
3790 | 9007199254740992
3791 |  \\
3792 | \hline
3793 | \end{longtable*}
3794 | 
3795 | 
3796 | \subsection{Dates, Currency, and Formatting%
3797 |   \label{dates-currency-and-formatting}%
3798 | }
3799 | 
3800 | Column formatting information is encoded within the \hyperref[column-text-subheader]{Column Text Subheader} and \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader}. Columns with formatting information have special meaning and interpretation. For example, numeric values may represent dates, encoded as the number of seconds since midnight, January 1, 1960. The format string for fields encoded this way is \textquotedbl{}DATETIME\textquotedbl{}. Using R, these values may be converted using the as.POSIXct or as.POSIXlt functions with argument \texttt{origin=\textquotedbl{}1960-01-01\textquotedbl{}}. The most common date format strings correspond to numeric fields, and are interpreted as follows:
3801 | 
3802 | \setlength{\DUtablewidth}{\linewidth}%
3803 | \begin{longtable*}{|p{0.110\DUtablewidth}|p{0.470\DUtablewidth}|p{0.156\DUtablewidth}|}
3804 | \hline
3805 | \textbf{%
3806 | Format
3807 | } & \textbf{%
3808 | Interpretation
3809 | } & \textbf{%
3810 | R Function
3811 | } \\
3812 | \hline
3813 | \endfirsthead
3814 | \hline
3815 | \textbf{%
3816 | Format
3817 | } & \textbf{%
3818 | Interpretation
3819 | } & \textbf{%
3820 | R Function
3821 | } \\
3822 | \hline
3823 | \endhead
3824 | \multicolumn{3}{c}{\hfill ... continued on next page} \\
3825 | \endfoot
3826 | \endlastfoot
3827 | 
3828 | DATE
3829 |  & 
3830 | Number of days since January 1, 1960
3831 |  & 
3832 | chron::chron
3833 |  \\
3834 | \hline
3835 | 
3836 | TIME
3837 |  & 
3838 | Number of seconds since midnight
3839 |  & 
3840 | as.POSIXct
3841 |  \\
3842 | \hline
3843 | 
3844 | DATETIME
3845 |  & 
3846 | Number of seconds since January 1, 1960
3847 |  & 
3848 | as.POSIXct
3849 |  \\
3850 | \hline
3851 | \end{longtable*}
3852 | 
3853 | There are many additional format strings for numeric and character fields.
3854 | 
3855 | 
3856 | \section{Platform Differences%
3857 |   \label{platform-differences}%
3858 | }
3859 | 
3860 | The test files referenced in \texttt{data/sas7bdat.sources.RData} were examined over a period of time. Files with non-Microsoft Windows markings were only observed late into the writing of this document. Consequently (but not intentionally), the SAS7BDAT description above was first deduced for SAS datasets generated on the most commonly observed platform: Microsoft Windows. The extensions to SAS7BDAT files for \textbf{u64} and non-Intel formats were contributed a little later by Clint Cummins.
3861 | 
3862 | In particular, the files \texttt{natlterr1994.sas7bdat}, \texttt{natlterr2006.sas7bdat} appear to be generated on the 'SunOS' platform (\textbf{u64}, non-Intel).  \texttt{txzips.sas7bdat} was created on a Linux 64-bit SAS server (\textbf{u64}, Intel).  \texttt{eyecarex.sas7bdat} is non-Intel, possibly 32-bit PowerPC.
3863 | 
3864 | The files \texttt{cfrance2.sas7bdat}, \texttt{cfrance.sas7bdat}, \texttt{coutline.sas7bdat},  \texttt{gfrance2.sas7bdat}, \texttt{gfrance.sas7bdat}, \texttt{goutline.sas7bdat}, \texttt{xfrance2.sas7bdat}, \texttt{xfrance.sas7bdat}, \texttt{xoutline.sas7bdat} appear to be generated on a 32-bit 'Linux' Intel system.  They have the same format as Windows files, except for the (ignorable) OS strings in the first header.
3865 | 
3866 | Text may appear in non-ASCII compatible, partially ASCII compatible, or multi-byte encodings. In particular, Kasper Sorenson discovered some text that appears to be encoded using the Windows-1252 'code page'.
3867 | 
3868 | \textbf{Key Test Files}
3869 | 
3870 | \setlength{\DUtablewidth}{\linewidth}%
3871 | \begin{longtable*}{|p{0.301\DUtablewidth}|p{0.639\DUtablewidth}|}
3872 | \hline
3873 | \textbf{%
3874 | filename
3875 | } & \textbf{%
3876 | format features
3877 | } \\
3878 | \hline
3879 | \endfirsthead
3880 | \hline
3881 | \textbf{%
3882 | filename
3883 | } & \textbf{%
3884 | format features
3885 | } \\
3886 | \hline
3887 | \endhead
3888 | \multicolumn{2}{c}{\hfill ... continued on next page} \\
3889 | \endfoot
3890 | \endlastfoot
3891 | 
3892 | \texttt{acadindx.sas7bdat}
3893 |  & 
3894 | non-u64, Intel (most files are like this one)
3895 |  \\
3896 | \hline
3897 | 
3898 | \texttt{br.sas7bdat}
3899 |  & 
3900 | truncated doubles (widths 3,4,6; compare with br2 widths all 8)
3901 |  \\
3902 | \hline
3903 | 
3904 | \texttt{eyecarex.sas7bdat}
3905 |  & 
3906 | non-u64, non-Intel, written by Stat/Transfer
3907 |  \\
3908 | \hline
3909 | 
3910 | \texttt{txzips.sas7bdat}
3911 |  & 
3912 | u64, Intel
3913 |  \\
3914 | \hline
3915 | 
3916 | \texttt{natlterr1994.sas7bdat}
3917 |  & 
3918 | u64, non-Intel
3919 |  \\
3920 | \hline
3921 | 
3922 | \texttt{hltheds2006.sas7bdat}
3923 |  & 
3924 | 2 Column Attributes subheaders
3925 |  \\
3926 | \hline
3927 | 
3928 | \texttt{moshim.sas7bdat}
3929 |  & 
3930 | 3 Column Attributes subheaders
3931 |  \\
3932 | \hline
3933 | 
3934 | \texttt{flightdelays.sas7bdat}
3935 |  & 
3936 | 2 Column Text subheaders
3937 |  \\
3938 | \hline
3939 | 
3940 | \texttt{ymcls\_p2\_long\_040506.sas7bdat}
3941 |  & 
3942 | 5 Column Text subheaders, first Column Attributes subheader is on page 6
3943 |  \\
3944 | \hline
3945 | 
3946 | \texttt{flightschedule.sas7bdat}
3947 |  & 
3948 | 2+ Column Text subheaders
3949 |  \\
3950 | \hline
3951 | 
3952 | \texttt{internationalflight.sas7bdat}
3953 |  & 
3954 | 2+ Column Text subheaders
3955 |  \\
3956 | \hline
3957 | 
3958 | \texttt{marchflights.sas7bdat}
3959 |  & 
3960 | 2+ Column Text subheaders
3961 |  \\
3962 | \hline
3963 | 
3964 | \texttt{mechanicslevel1.sas7bdat}
3965 |  & 
3966 | 2+ Column Text subheaders
3967 |  \\
3968 | \hline
3969 | 
3970 | \texttt{compress\_yes.sas7bdat}
3971 |  & 
3972 | COMPRESS=CHAR, one PGTYPE=-28672, no RLE compression (COMP=0)
3973 |  \\
3974 | \hline
3975 | 
3976 | \texttt{obs\_all\_perf\_1.sas7bdat}
3977 |  & 
3978 | COMPRESS=CHAR, many PGTYPE=16384, much RLE compression (COMP=4)
3979 |  \\
3980 | \hline
3981 | \end{longtable*}
3982 | 
3983 | 
3984 | \section{Compression Data%
3985 |   \label{compression-data}%
3986 | }
3987 | 
3988 | The table below presents the results of compression tests on a collection of 142 SAS7BDAT data files (sources in \texttt{data/}). The 'type' field represents the type of compression, 'ctime' is the compression time (in seconds), 'dtime' is the decompression time, and the 'compression ratio' field holds the cumulative disk usage (in megabytes) before and after compression. Although the \texttt{xz} algorithm requires significantly more time to compress these data, the decompression time is on par with gzip.
3989 | 
3990 | \setlength{\DUtablewidth}{\linewidth}%
3991 | \begin{longtable*}{|p{0.168\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.307\DUtablewidth}|}
3992 | \hline
3993 | \textbf{%
3994 | type
3995 | } & \textbf{%
3996 | ctime
3997 | } & \textbf{%
3998 | dtime
3999 | } & \textbf{%
4000 | compression ratio
4001 | } \\
4002 | \hline
4003 | \endfirsthead
4004 | \hline
4005 | \textbf{%
4006 | type
4007 | } & \textbf{%
4008 | ctime
4009 | } & \textbf{%
4010 | dtime
4011 | } & \textbf{%
4012 | compression ratio
4013 | } \\
4014 | \hline
4015 | \endhead
4016 | \multicolumn{4}{c}{\hfill ... continued on next page} \\
4017 | \endfoot
4018 | \endlastfoot
4019 | 
4020 | gzip -9
4021 |  & 
4022 | 76.7s
4023 |  & 
4024 | 2.6s
4025 |  & 
4026 | 541M / 30.3M = 17.9
4027 |  \\
4028 | \hline
4029 | 
4030 | bzip2 -9
4031 |  & 
4032 | 92.7s
4033 |  & 
4034 | 11.2s
4035 |  & 
4036 | 541M / 19.0M = 28.5
4037 |  \\
4038 | \hline
4039 | 
4040 | xz -9
4041 |  & 
4042 | 434.2s
4043 |  & 
4044 | 2.7s
4045 |  & 
4046 | 541M / 12.8M = 42.3
4047 |  \\
4048 | \hline
4049 | \end{longtable*}
4050 | 
4051 | 
4052 | \section{Software Prototype%
4053 |   \label{software-prototype}%
4054 | }
4055 | 
4056 | The prototype program for reading SAS7BDAT formatted files is implemented entirely in R (see file \texttt{R/sas7bdat.R}). Files not recognized as having been generated under a Microsoft Windows platform are rejected (for now). Implementation of the \texttt{read.sas7bdat} function should be considered a 'reference implementation', and not one designed with performance in mind.
4057 | 
4058 | There are certain advantages and disadvantages to developing a prototype of this nature in R.
4059 | 
4060 | Advantages:
4061 | 
4062 | \begin{enumerate}
4063 | \item R is an interpreted language with a built-in debugger. Hence, experimental routines may be implemented and debugged quickly and interactively, without the need for external compiler or debugger tools (e.g. gcc, gdb).
4064 | 
4065 | \item R programs are portable across a variety of computing platforms. This is especially important in the present context, because manipulating files stored on disk is a platform-specific task. Platform-specific operations are abstracted from the R user.
4066 | \end{enumerate}
4067 | 
4068 | Disadvantages:
4069 | 
4070 | \begin{enumerate}
4071 | \item Manipulating binary (raw) data in R is a relatively new capability. The best tools and practices for binary data operations are not as developed as those for other data types.
4072 | 
4073 | \item Interpreted code is often much less efficient than compiled code. This is not a major disadvantage for prototype implementations because human code development is far less efficient than the R interpreter. Gains made in efficient code development using an interpreted language far outweigh the benefits of compiled languages.
4074 | \end{enumerate}
4075 | 
4076 | Another software implementation was made by Clint Cummins, in the TSP econometrics package (mainly as an independent platform for exploring the format).
4077 | 
4078 | 
4079 | \section{ToDo%
4080 |   \label{todo}%
4081 | }
4082 | 
4083 | \begin{itemize}
4084 | \item obtain test files which use COMPRESS=BINARY, and develop identification and uncompression procedures
4085 | 
4086 | \item look for data which will reliably distinguish between structural subheaders (which have one of the known signatures) and uncompressed row data, which may have row data in the signature position that matches one of the known signatures.  Both use COMP=0.  Are NPSHD and NSHPL sufficient to do this?
4087 | 
4088 | \item obtain test files with more than 2.1 billion (and more than 4.2 billion) data rows, i.e. where 8 byte integer TRC in \textbf{u64} is apparently needed.  Do the non-u64 files handle this, with additional fields beyond the 4 byte TRC used for segmentation?  Is TRC a (signed) int or (unsigned) uint?
4089 | 
4090 | \item identify any SAS7BDAT encryption flag (this is not the same as 'cracking', or breaking encryption); we just identify if a file is encrypted and not readable without a key
4091 | 
4092 | \item experiment further with 'amendment page' concept
4093 | 
4094 | \item consider header bytes -by- SAS\_host
4095 | 
4096 | \item check that only one page of type \textquotedbl{}mix\textquotedbl{} is observed. If so insert \textquotedbl{}In all test cases (\texttt{data/sources.csv}), there are exactly zero or one pages of type 'mix'.\textquotedbl{} under the \hyperref[page-offset-table]{Page Offset Table} header.   {[}May not be needed, because the BC and SC fields in each Page Offset Table make the \hyperref[mrc]{MRC} field in the initial header unnecessary.{]}
4097 | 
4098 | \item identify all missing value representations: missing numeric values appear to be represented as '0000000000D1FFFF' (nan) for numeric 'double' quantities.
4099 | 
4100 | \item identify purpose of various unknown header quantities
4101 | 
4102 | \item determine purpose of Column List subheader
4103 | 
4104 | \item determine purpose and pattern of 'page sequence signature' fields.  Are they useful?
4105 | 
4106 | \item identify how non-ASCII encoding is specified
4107 | 
4108 | \item implement R options to read just header (and subheader) information without data, and an option to read just some data fields, and not all fields.  {[}The TSP implementation already does this, and can also read a subset of the data rows.{]}
4109 | \end{itemize}
4110 | 
4111 | \end{document}
4112 | 


--------------------------------------------------------------------------------
/vignettes/sas7bdat.rst:
--------------------------------------------------------------------------------
  1 | ===============================
  2 | SAS7BDAT Database Binary Format
  3 | ===============================
  4 | 
  5 | by:
  6 | 
  7 |     | Matthew S. Shotwell, PhD
  8 |     | Assistant Professor
  9 |     | Department of Biostatistics
 10 |     | Vanderbilt University
 11 |     | matt.shotwell@vanderbilt.edu
 12 | 
 13 | 1/9/2013 update (**u64** format extensions, Row Size fields, and RLE compression) by:
 14 | 
 15 |     | Clint Cummins, PhD
 16 |     | clint@stanford.edu
 17 | 
 18 | 
 19 | Copyright (C) 2013 is retained by the authors listed above. This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/.
 20 | 
 21 | Contents
 22 | ========
 23 | 
 24 | - `Introduction`_
 25 | - `SAS7BDAT Header`_
 26 | - `SAS7BDAT Pages`_
 27 | - `SAS7BDAT Subheaders`_
 28 | - `SAS7BDAT Packed Binary Data`_
 29 | - `Platform Differences`_
 30 | - `Compression Data`_
 31 | - `Software Prototype`_
 32 | - `ToDo`_
 33 | 
 34 | Introduction
 35 | ============
 36 | 
 37 | The SAS7BDAT file is a binary database storage file. At the time of this writing, no description of the SAS7BDAT file format was publicly available. Hence, users who wish to read and manipulate these files were required to obtain a license for the SAS software, or third party software with support for SAS7BDAT files. The purpose of this document is to promote interoperability between SAS and other popular statistical software packages, especially R (http://www.r-project.org/).
 38 | 
 39 | The information below was deduced by examining the contents of many SAS7BDAT databases downloaded freely from internet resources (see ``data/sas7bdat.sources.RData``). No guarantee is made regarding its accuracy. No SAS software, nor any other software requiring the purchase of a license was used.
 40 | 
 41 | SAS7BDAT files consist of binary encoded data. Data files encoded in this format often have the extension '.sas7bdat'. The name 'SAS7BDAT' is not official, but is used throughout this document to refer to SAS database files formatted according to the descriptions below.
 42 | 
 43 | There are significant differences in the SAS7BDAT format depending on the operating systems and computer hardware platforms (32bit vs. 64bit). See the section on `platform differences`_ for more details. The format described below is sufficient to read the entire collection of test files referenced in ``data/sas7bdat.sources.RData`` (i.e. files associated with 32bit and some 64bit builds of SAS for Microsoft Windows, and **u64** SAS versions).  This includes files created with COMPRESS=CHAR. The format described here is probably not sufficient to **write** SAS7BDAT format files, due to lingering uncertainties.
 44 | 
 45 | The figure below illustrates the overall structure of the SAS7BDAT database. Each file consists of a header (length := HL bytes), followed by PC pages, each of length PL bytes (PC and PL are shorthand for 'page count' and 'page size' respectively, and are used to denote these quantities throughout this document).::
 46 | 
 47 |   ----------
 48 |   |   HL   |  header 
 49 |   ----------
 50 |   |   PL   |  page 1
 51 |   ----------
 52 |   |   PL   |  page 2
 53 |   ----------
 54 |      ...
 55 |   ----------
 56 |   |   PL   |  page PC
 57 |   ----------
 58 | 
 59 | Throughout this document, hexadecimal digits are denoted with a preceding 'x', binary digits with a preceding 'b', and decimal digits with no preceding character. For example, see the below `table of hexadecimal, decimal, and binary values`_.
 60 | 
 61 | SAS7BDAT Header
 62 | ===============
 63 | 
 64 | The SAS7BDAT file header contains a binary file identifier (*i.e.*, a magic number), the dataset name, timestamp, the number of pages (PC), their size (PL) and a variety of other values that pertain to the database as a whole. The purpose of many header fields remains unknown, but they are likely to include specifications for data compression and encryption, password protection, and dates/times of creation and/or modification. Most files encountered encode multi-byte values little-endian (least significant byte first). However, some files have big-endian values. Hence, it appears that multi-byte values are encoded using endianness of the platform where the file was written.  See `Platform Differences`_ for a table of key test files which differ in several ways.
 65 | 
 66 | The *offset table* below describes the SAS7BDAT file header as a sequence of bytes. Information stored in the table is indexed by its byte offset (first column) in the header and its length (second column) in bytes. For example, the field at offset 0 has length 32 bytes. Hence, bytes 0,1,...,31 comprise the data for this field. Byte lengths having the form '%n' should read: 'the number of bytes remaining up to, but not including byte n'. The fourth column gives a shorthand description of the data contained at the corresponding offset. For example, 'int, page size := PL' indicates that the data stored at the corresponding location is a signed integer representing the page size, which we denote PL. The description *????????????* indicates that the meaning of data stored at the corresponding offset is unknown. The third column represents the author's confidence (low, medium, high) in the corresponding offset, length, and description. Each offset table in this document is formatted in a similar fashion. Variables defined in an offset table are sometimes used in subsequent tables.
 67 | 
 68 | Header Offset Table
 69 | -------------------
 70 | 
 71 | ==============  ======  ======  ===============================================
 72 | offset		length	conf.	description
 73 | ==============  ======  ======  ===============================================
 74 | 0		32	high	binary, `magic number`_ 
 75 | 32		1	high	binary, Alignment_: if (byte==x33) a2=4 else a2=0 .  **u64** is true if a2=4 (unix 64 bit format).
 76 | 33		2	low	*????????????*
 77 | 35		1	high	binary, Alignment_  if (byte==x33) a1=4 else a1=0
 78 | 36		1	low	*????????????*
 79 | 37		1	high	int, endianness (x01-little [Intel] x00-big)
 80 | 38		1	low	*????????????*
 81 | 39		1	medium	ascii, OS type (1-UNIX or 2-WIN).  Does not affect format except for the OS strings.
 82 | 40		8	low	*????????????*
 83 | 48		8	low	*????????????*
 84 | 56		8	low	repeat of 32:32+8
 85 | 64		6	low	*????????????*
 86 | 70		2	low	int, `Character Encoding`_
 87 | 72		12	low	*????????????*
 88 | 84		8	high	ascii 'SAS FILE'
 89 | 92		64	high	ascii, dataset name
 90 | 156		8	medium	ascii, file type, e.g. ``'DATA    '``
 91 | 164		a1	medium	zero padding when a1=4 .  Aligns the double timestamps below on double word boundaries.
 92 | 164+a1		8	high	double, timestamp, date created, secs since 1/1/60 (for SAS version 8.x and higher)
 93 | 172+a1		8	high	double, timestamp, date modified, secs since 1/1/60 (for SAS version 8.x and higher)
 94 | 180+a1		16	low	*????????????*
 95 | 196+a1		4	high	int, length of SAS7BDAT header := HL 
 96 | 200+a1		4	high	int, page size := _`PL`
 97 | 204+a1		4+a2	high	int, page count := PC .  Length 4 or 8 (**u64**), henceforth denoted **4|8**
 98 | 208+a1+a2	8	low	*????????????*
 99 | 216+a1+a2	8	high	ascii, SAS release  (e.g. 9.0101M3 )
100 | 224+a1+a2	16	high	ascii, host  (SAS server type, longest observed string has 9 bytes)
101 | 240+a1+a2	16	high	ascii, OS version number (for UNIX, else null)
102 | 256+a1+a2	16	high	ascii, OS maker or version (SUN, IBM, sometimes WIN)
103 | 272+a1+a2	16	high	ascii, OS name (for UNIX, else null)
104 | 288+a1+a2	32	low	*????????????*
105 | 320+a1+a2	4	low	int, page sequence signature? (value is close to the value at start of each Page Offset Table)
106 | 324+a1+a2	4	low	*????????????*
107 | 328+a1+a2	8	medium	double, 3rd timestamp, sometimes zero
108 | 336+a1+a2	%HL	medium	zeros
109 | 1024|8192		medium	Total length of header (8192 for **u64**), HL
110 | ==============  ======  ======  ===============================================
111 | 
112 | The 8 bytes beginning at offset 32 hold information which affects the offset of the 'release' and 'host' information. In particular:
113 | 
114 | 1. The byte at offset 32 defines the **u64** (unix 64 bit) file format, which affects many field and header lengths (usually via 4 vs. 8 byte integers).
115 | 2. The byte at offset 35 controls an offset before the timestamps.
116 | 3. The byte at offset 37 defines byte ordering of ints and doubles (most test files were created on Windows and use Intel byte ordering; little endian).
117 | 4. The byte at offset 39 appears to distinguish the OS type, where '1' indicates that the file was generated on a UNIX-like system, such as Linux or SunOS, and '2' indicates the file was generated on a Microsoft Windows platform. However, this does not affect any important fields in the file format.
118 | 
119 | The following table describes some of the possible polymorphisms for the 8 bytes at offset 32. The first field lists the name of the file where the sequence was found (see ``data/sas7bdat.sources.RData``), the second lists the eight byte values (hexadecimal), the third field shows bytes 216-239 in ASCII ('.' represents a non-ASCII character or '\0'), and the fourth field lists the SAS7BDAT sub-format.
120 | 
121 | =========================== =================================== ============================ ======================
122 | filename                    bytes 32-39                         bytes 216-239                format
123 | =========================== =================================== ============================ ======================
124 | ``compress_no.sas7bdat``    ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3NET_ASRV........`` Windows Intel
125 | ``compress_yes.sas7bdat``   ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3NET_ASRV........`` Windows Intel
126 | ``lowbwt_i386.sas7bdat``    ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0202M0W32_VSPRO.......`` Windows Intel
127 | ``missing_values.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0202M0W32_VSPRO.......`` Windows Intel
128 | ``obs_all_perf_1.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3XP_PRO..........`` Windows Intel
129 | ``adsl.sas7bdat``           ``x22 x22 x00 x33 x33 x01 x02 x32`` ``....9.0202M3X64_ESRV....`` Windows x64 Intel
130 | ``eyecarex.sas7bdat``       ``x22 x22 x00 x33 x22 x00 x02 x31`` ``....9.0000M0WIN.........`` Unix non-Intel
131 | ``lowbwt_x64.sas7bdat``     ``x22 x22 x00 x33 x33 x01 x02 x32`` ``....9.0202M2X64_VSPRO...`` Windows x64 Intel
132 | ``natlterr1994.sas7bdat``   ``x33 x22 x00 x33 x33 x00 x02 x31`` ``........9.0101M3SunOS...`` u64 Unix non-Intel
133 | ``natlterr2006.sas7bdat``   ``x33 x22 x00 x33 x33 x00 x02 x31`` ``........9.0101M3SunOS...`` u64 Unix non-Intel
134 | ``txzips.sas7bdat``         ``x33 x22 x00 x33 x33 x01 x02 x31`` ``........9.0201M0Linux...`` u64 Unix Intel
135 | =========================== =================================== ============================ ======================
136 | 
137 | .. _`table of hexadecimal, decimal, and binary values`:
138 | 
139 | The binary representation for the hexadecimal values present in the table above are given below.
140 | 
141 | ===========  =======  =============
142 | hexadecimal  decimal  binary
143 | ===========  =======  =============
144 | ``x01``      ``001``  ``b00000001``
145 | ``x02``      ``002``  ``b00000010``
146 | ``x22``      ``034``  ``b00100010``
147 | ``x31``      ``049``  ``b00110001``
148 | ``x32``      ``050``  ``b00110010``
149 | ``x33``      ``051``  ``b00110011``
150 | ===========  =======  =============
151 | 
152 | Alignment
153 | +++++++++
154 | 
155 | In files generated by 64 bit builds of SAS, 'alignment' means that all data field offsets containing doubles or 8 byte ints should be a multiple of 8 bytes. For files generated by 32 bit builds of SAS, the alignment is 4 bytes. Because `SAS7BDAT Packed Binary Data`_ may contain double precision values, it appears that all data rows are 64 bit aligned, regardless of whether the file was written with a 32 bit or 64 bit build of SAS. Alignment of data structures according to the platform word length (4 bytes for 32 bit, and 8 bytes for 64 bit architectures) facilitates efficient operations on data stored in memory. It also suggests that parts of the SAS7BDAT data file format are platform dependent. One theory is that the SAS implementation utilizes a common C or C++ structure or class to reference data stored in memory. When compiled, these structures are aligned according to the word length of the target platform. Of course, when SAS was originally written, platform differences may not have been foreseeable. Hence, these inconsistencies may not have been intentional.
156 | 
157 | Magic Number
158 | ++++++++++++
159 | 
160 | The SAS7BDAT magic number is the following 32 byte (hex) sequence.::
161 | 
162 |    x00 x00 x00 x00   x00 x00 x00 x00
163 |    x00 x00 x00 x00   xc2 xea x81 x60
164 |    xb3 x14 x11 xcf   xbd x92 x08 x00
165 |    x09 xc7 x31 x8c   x18 x1f x10 x11
166 | 
167 | In all test files except one (not listed in ``data/sas7bdat.sources.RData``), the magic number above holds. The one anomalous file has the following magic number::
168 | 
169 |    x00 x00 x00 x00   x00 x00 x00 x00
170 |    x00 x00 x00 x00   x00 x00 x00 x00 
171 |    x00 x00 x00 x00   x00 x00 x00 x00 
172 |    x00 x00 x00 x00   x18 x1f x10 x11
173 | 
174 | In addition, the anomalous file is associated with the SAS release "3.2TK". Indeed, this file may not have been written by SAS. Otherwise, the anomalous file appears to be formatted similarly to other test files.
175 | 
176 | Character Encoding
177 | ++++++++++++++++++
178 | 
179 | The integer (one or two bytes) at header offset 70 (bytes) indicates the character encoding of string data. The table below lists the values that are known to occur and the associated character encoding. 
180 | 
181 | ==============	==============	=============
182 | bytes 70-72	SAS name	iconv name
183 | ==============	==============	=============
184 | 0		(Unspecified)	(Unspecified)
185 | 20		utf-8		UTF-8
186 | 28		us-ascii	US-ASCII
187 | 29		latin1		ISO-8859-1
188 | 30		latin2		ISO-8859-2
189 | 31		latin3		ISO-8859-3
190 | 34		arabic		ISO-8859-6
191 | 36		hebrew		ISO-8859-8
192 | 39		thai		ISO-8859-11
193 | 40		latin5		ISO-8859-9
194 | 60		wlatin2		WINDOWS-1250
195 | 61		wcyrillic	WINDOWS-1251
196 | 62		wlatin1		WINDOWS-1252
197 | 63		wgreek		WINDOWS-1253
198 | 64		wturkish	WINDOWS-1254
199 | 65		whebrew		WINDOWS-1255
200 | 66		warabic		WINDOWS-1256
201 | 119		euc-tw		EUC-TW
202 | 123		big5		BIG-5
203 | 125		euc-cn		EUC-CN
204 | 134		euc-jp		EUC-JP
205 | 138		shift-jis	SHIFT-JIS
206 | 140		euc-kr		EUC-KR
207 | ==============	==============	============= 
208 | 
209 | When the encoding is unspecified, the file uses the encoding of the SAS session that produced it (usually Windows-1252).
210 | 
211 | SAS7BDAT Pages
212 | ==============
213 | 
214 | Following the SAS7BDAT header are pages of data. Each page can be one of (at least) four types. The first three are those that contain meta-information (e.g. field/column attributes), packed binary data, or a combination of both. These types are denoted 'meta', 'data', and 'mix' respectively. Meta-information is required to correctly interpret the packed binary information. Hence, this information must be parsed first. In test files, 'meta' and 'mix' pages always precede 'data' pages. In some test data files, there is a fourth page type, denoted 'amd' which appears to encode additional meta information. This page usually occurs last, and appears to contain amended meta information.
215 | 
216 | The `page offset table`_ below describes each page type. Byte offsets appended with one of '(meta/mix)', '(mix)', or '(data)' indicate that the corresponding length and description apply only to pages of the listed type. Provisionally, the internal structure of the 'amd' page type is considered identical to the 'meta' page type.   
217 | 
218 | Page Offset Table
219 | -----------------
220 | 
221 | ==============  ==============	======  ===============================================
222 | offset		length		conf.	description
223 | ==============  ==============	======  ===============================================
224 | 0		4		low	int, page sequence signature?
225 | 4		12|28		low	*????????????* length 12 or 28 (**u64**)
226 | B		2		medium	int, bit field `page type`_ := _`PGTYPE`; B = 16|32
227 | B+2		2		medium	int, data block count := _`BC`
228 | B+4		2		medium	int, `subheader pointers`_ count := _`SC` <= `BC`_
229 | B+6		2		low	*????????????*
230 | B+8		SC*SL		medium	SC `subheader pointers`_, SL = 12|24
231 | B+8+SC*SL	DL		medium	if NRD>0, 8 byte alignment; DL = (8 - (B+8+SC*SL) % 8) % 8
232 | B+8+SC*SL+DL	RC*`RL`_	medium	`SAS7BDAT packed binary data`_ data row count := RC = (BC-SC)
233 | C		%`PL`_		medium  subheader data and/or filler; C = (B+8+SC*SL+DL+RC*RL)
234 | ==============  ==============	======  ===============================================
235 | 
236 | Page Type
237 | +++++++++
238 | 
239 | ======	====	==========	========================================	===================
240 | PGTYPE	name	subheaders	uncompressed row data (after subheaders)	compressed row data (in subheaders)
241 | ======	====	==========	========================================	===================
242 | 0	meta	yes (SC>0)	no  (BC=SC)					yes
243 | 256	data	no  (SC=0)	yes (RC=BC)					no
244 | 512	mix	yes (SC>0)	yes (RC=BC-SC)					no
245 | 1024	amd	yes?		yes?						no?
246 | 16384	meta	yes (SC>0)	no (BC=SC)					yes
247 | -28672	comp	no		no						no
248 | ======	====	==========	========================================	===================
249 | 
250 | There are at least four page types 'meta', 'data', 'mix', and 'amd'. These types are encoded in the most significant byte of a two byte bit field at page offset 16|32. If no bit is set, the following page is of type 'meta'. If the first, second, or third bits are set, then the page is of type 'data', 'mix', or 'amd', respectively. Hence, if the two bytes are interpreted as an unsigned integer, then the 'meta', 'data', 'mix', and 'amd' types correspond to 0, 256, 512, and 1024, respectively. In compressed files, other bits (and sometimes multiple bits) have been set (e.g., ``1 << 15 | 1 << 12``, which is ``-28672`` signed, or ``36864`` unsigned). However, the pattern is unclear.
251 | 
252 | If a page is of type 'meta', 'mix', or 'amd', data beginning at offset byte 24|40 are a sequence of SC SL-byte `subheader pointers`_, which point to an offset farther down the page. `SAS7BDAT Subheaders`_ stored at these offsets hold meta information about the database, including the column names, labels, and types.    
253 | If a page is of type 'mix', then **packed binary data begin at the next 8 byte boundary following the last subheader pointer**. In this case, the data begin at offset B+8+SC*SL+DL, where DL = (8 - (B+8+SC*SL) % 8) % 8 is the number of padding bytes needed to reach that boundary, and '%' is the modulo operator. 
254 | 
255 | If a page is of type 'data', then packed binary data begin at offset 24|40.
256 | 
257 | The 'comp' page was observed as page 2 of the compress_yes.sas7bdat test file (not distributed with the ``sas7bdat`` package). It has BC and SC fields, but no subheader pointers. It contains some initial data and 2 tables. The first table has many rows of length 24; its purpose is unknown. The second table has one entry per data page with the page number and the number of data rows on the page for SC pages. It could be used to access a particular row without reading all preceding data pages.
258 | 
259 | Subheader Pointers
260 | ++++++++++++++++++
261 | 
262 | The subheader pointers encode information about the offset and length of subheaders relative to the beginning of the page where the subheader pointer is located. The purpose of the last four bytes of the subheader pointer is uncertain, but they may indicate that additional subheader pointers are to be found on the next page, or that the corresponding subheader is not crucial.
263 | 
264 | =======	======  ======  ===============================================
265 | offset	length	conf.	description
266 | =======	======  ======  ===============================================
267 | 0	4|8	high	int, offset from page start to subheader
268 | 4|8	4|8	high	int, length of subheader := _`QL` 
269 | 8|16	1	medium	int, compression := _`COMP`
270 | 9|17	1	low	int, subheader type := ST
271 | 10|18	2|6	low	zeroes
272 | 12|24		high	Total length of subheader pointer 12|24 (**u64**), SL 
273 | =======	======  ======  ===============================================
274 | 
275 | QL is sometimes zero, which indicates that no data is referenced by the corresponding subheader pointer. When this occurs, the subheader pointer may be ignored.
276 | 
277 | =======	============
278 | `COMP`_	description
279 | =======	============
280 | 0	uncompressed
281 | 1	truncated (ignore data)
282 | 4	RLE compressed row data with control byte
283 | =======	============
284 | 
285 | ====	============
286 | ST	subheaders
287 | ====	============
288 | 0	Row Size, Column Size, Subheader Counts, Column Format and Label, in Uncompressed file
289 | 1	Column Text, Column Names, Column Attributes, Column List
290 | 1	all subheaders (including row data), in Compressed file.
291 | ====	============
292 | 
293 | 
294 | SAS7BDAT Subheaders
295 | ===================
296 | 
297 | Subheaders contain meta information regarding the SAS7BDAT database, including row and column counts, column names, labels, and types. Each subheader is associated with a four- or eight-byte 'signature' (**u64**) that identifies the subheader type, and hence, how it should be parsed.
298 | 
299 | Row Size Subheader
300 | ------------------
301 | 
302 | The row size subheader holds information about the row length (in bytes), the total row count, and the row count on a page of type 'mix'.  Fields at offset 28|56 and higher are not needed to read the file, but are documented here for completeness.  The four test files used for example data in the higher fields are ``eyecarex.sas7bdat``, ``acadindx.sas7bdat``, ``natlterr1994.sas7bdat``, ``txzips.sas7bdat`` (non-Intel/Intel x regular/u64).
303 | 
304 | =========	=========	======  ===============================================
305 | offset		length		conf.	description
306 | =========	=========	======  ===============================================
307 | 0		4|8		high	binary, signature xF7F7F7F7|xF7F7F7F700000000
308 | 4|8		16|32		low	*????????????*
309 | 20|40		4|8		high	int, row length (in bytes) := _`RL`
310 | 24|48		4|8		high	int, total row count := TRC 
311 | 28|56		8|16		low	*????????????*
312 | 36|72		4|8		medium	int, number of `Column Format and Label Subheader`_ on first page where they appear := _`NCFL1`
313 | 40|80		4|8		medium	int, number of `Column Format and Label Subheader`_ on second page where they appear (or 0) := _`NCFL2`
314 | 44|88		8|16		low	*????????????*
315 | 52|104		4|8		medium	int, page size, equals PL
316 | 56|112		4|8		low	*????????????*
317 | 60|120		4|8		medium	int, max row count on "mix" page := _`MRC`
318 | 64|128		8|16		medium	sequence of 8|16 FF, end of initial header
319 | 72|144		148|296		medium	zeroes
320 | 220|440		4		low	int, page sequence signature (equals current page sequence signature)
321 | 224|444		40|68		low	zeroes
322 | 264|512		4|8		low	int, value 1 observed in 4 test files
323 | 268|520		2		low	int, value 2 observed
324 | 270|522		2|6		low	zeroes (pads length of 3 fields to 8|16)
325 | 272|528		4|8		medium	int, number of pages with subheader data := NPSHD
326 | 276|536		2		medium	int, number of subheaders with positive length on last page with subheader data := NSHPL
327 | 278|538		2|6		low	zeroes
328 | 280|544		4|8		low	int, values equal to NPSHD observed
329 | 284|552		2		low	int, values equal to NSHPL+2 observed
330 | 286|554		2|6		low	zeroes
331 | 288|560		4|8		medium	int, number of pages in file, equals PC
332 | 292|568		2		low	int, values 22,26,9,56 observed
333 | 294|570		2|6		low	zeroes
334 | 296|576		4|8		low	int, value 1 observed
335 | 300|584		2		low	int, values 7|8 observed
336 | 302|586		2|6		low	zeroes
337 | 304|592		40|80		low	zeroes
338 | 344|672		2		low	int, value 0
339 | 346|674		2		low	int, values 0|8
340 | 348|676		2		low	int, value 4
341 | 350|678		2		low	int, value 0
342 | 352|680		2		low	int, values 12,32|0
343 | 354|682		2		low	int, length of Creator Software string := LCS
344 | 356|684		2		low	int, value 0
345 | 358|686		2		low	int, value 20
346 | 360|688		2		low	int, value of 8 indicates MXNAM and MXLAB valid := IMAXN
347 | 362|690		8		low	zeroes
348 | 370|698		2		low	int, value 12
349 | 372|700		2		low	int, value 8
350 | 374|702		2		low	int, value 0
351 | 376|704		2		low	int, value 28
352 | 378|706		2		low	int, length of Creator PROC step name := LCP
353 | 380|708		36		low	zeroes
354 | 416|744		2		low	int, value 4
355 | 418|746		2		low	int, value 1
356 | 420|748		2		low	int, number of Column Text subheaders in file := _`NCT`
357 | 422|750		2		low	int, max length of column names := MXNAM (see IMAXN)
358 | 424|752		2		low	int, max length of column labels := MXLAB (see IMAXN)
359 | 426|754		12		low	zeroes
360 | 438|766		2		medium	int, number of data rows on a full page INT[(PL - 24|40)/`RL`_]; 0 for compressed file
361 | 440|768		27		low	zeroes
362 | 467|795		1		low	int, bit field, values 1,5
363 | 468|796		12		low	zeroes
364 | 480|808				medium	Total length of subheader, QL
365 | =========	=========	======  ===============================================
366 | 
367 | 
368 | 
369 | Column Size Subheader 
370 | ---------------------
371 | 
372 | The `column size subheader`_ holds the number of columns (variables).
373 | 
374 | =======	======	======	=================================
375 | offset	length	conf.	description
376 | =======	======  ======  =================================
377 | 0	4|8	high	binary, signature xF6F6F6F6|xF6F6F6F600000000
378 | 4|8	4|8	high	int, number of columns := NCOL 
379 | 8|16	4|8	low	*????????????*  usually zeroes
380 | 12|24		medium	Total length of subheader, QL
381 | =======	======  ======  =================================
382 | 
383 | 
384 | Subheader Counts Subheader
385 | --------------------------
386 | 
387 | This subheader contains information on the first and last appearances of at least 7 common subheader types. Any of these subheaders may appear once or more. Multiple instances of a subheader provide information for an exclusive subset of columns. The order in which data is read from multiple subheaders corresponds to the reading order (left to right) of columns. The structure of this subheader was deduced and reported by Clint Cummins.
388 | 
389 | =========	=======	======  ===============================================
390 | offset		length	conf.	description
391 | =========	=======	======  ===============================================
392 | 0		4|8	high	int, signature -1024 (x00FCFFFF|x00FCFFFFFFFFFFFF)
393 | 4|8		4|8	low	int, length or offset, usually >= 48
394 | 8|16		4|8	low	int, usually 4
395 | 12|24		2	low	int, usually 7 (number of nonzero SCVs?)
396 | 14|26		50|94	low	*????????????*
397 | 64|120		12*LSCV	medium	12 `subheader count vectors`_, length := LSCV = 20|40 bytes each
398 | 304|600			medium	Total length of subheader, QL
399 | =========	=======	======  ===============================================
400 | 
401 | Subheader Count Vectors
402 | +++++++++++++++++++++++
403 | 
404 | The subheader count vectors encode information for each of 4 common subheader types, and potentially 12 total subheader types.
405 | 
406 | =======	======  ======  =====================================================
407 | offset	length	conf.	description
408 | =======	======  ======  =====================================================
409 | 0	4|8	high	int, signature (see list below)
410 | 4|8	4|8	medium	int, page where this subheader first appears := PAGE1
411 | 8|16	2	medium	int, position of subheader pointer in PAGE1 := LOC1
412 | 10|18	2|6	low	*????????????*  zero padding
413 | 12|24	4|8	medium	int, page where this subheader last appears := PAGEL
414 | 16|32	2	medium	int, position of subheader pointer in PAGEL := LOCL
415 | 18|34	2|6	low	*????????????*	zero padding
416 | 20|40		medium	Total length of subheader count vector, LSCV
417 | =======	======  ======  =====================================================
418 | 
419 | The LOC1 and LOCL give the positions of the corresponding subheader pointer in PAGE1 and PAGEL, respectively. That is, if there are SC subheader pointers on page PAGE1, then the corresponding subheader pointer first occurs at the LOC1'th position in this array, enumerating from 1. If PAGE1=0, the subheader is not present. If PAGE1=PAGEL and LOC1=LOCL, the subheader appears exactly once. If PAGE1!=PAGEL or LOC1!=LOCL, the subheader appears 2 or more times. In all test files, PAGE1 <= PAGEL, and the corresponding subheaders appear only once per page.  The variable `NCT`_ in the `Row Size Subheader`_ should be used to ensure that all Column Text subheaders are located (and to avoid scanning through all pages in the file when all subheaders are already located).
420 | 
421 | The first 7 binary signatures in the `Subheader Count Vectors`_ array are always:
422 | 
423 | =========	====================
424 | signature	description
425 | =========	====================
426 | -4		Column Attributes
427 | -3		Column Text
428 | -1		Column Names
429 | -2		Column List
430 | -5		unknown signature #1
431 | -6		unknown signature #2
432 | -7		unknown signature #3
433 | =========	====================
434 | 
435 | The remaining 5 out of 12 signatures are zeros in the observed source files. Presumably, these are for subheaders not yet defined, or not present in the collection of test files. 
436 | 
437 | A `Column Format and Label Subheader`_ may appear on multiple pages, but is not indexed in Subheader Counts. The variables NCFL1 and NCFL2 in the `Row Size subheader`_ may be helpful if you want to know in advance whether these subheaders appear across multiple pages.
438 | 
439 | 
440 | Column Text Subheader
441 | ---------------------
442 | 
443 | The column text subheader contains a block of text associated with columns, including the column names, labels, and formats. However, this subheader is not sufficient to parse this information. Other subheaders (e.g. the `column name subheader`_), which point to specific elements within this subheader, are also needed. 
444 | 
445 | =======	======  ======  ===============================================
446 | offset	length	conf.	description
447 | =======	======  ======  ===============================================
448 | 0	4|8	high	int, signature -3 (xFDFFFFFF|xFDFFFFFFFFFFFFFF)
449 | 4|8	2	medium	int, size of text block (QL - 16|20)
450 | 6|10	2	low	*????????????*
451 | 8|12	2	low	*????????????*
452 | 10|14	2	low	*????????????*
453 | 12|16	2	low	*????????????*
454 | 14|18	2	low	*????????????*
455 | 16|20	varies	medium	ascii, compression & Creator PROC step name that generated data
456 | varies	%QL	high	ascii, combined column names, labels, formats
457 | =======	======  ======  ===============================================
458 | 
459 | This subheader sometimes appears more than once; each is a separate array. If so, the "column name index" field in `column name pointers`_ selects a particular text array - 0 for the first array, 1 for the second, etc. Similarly, "column format index" and "column label index" fields also select a text array. Offsets to strings within the text array are multiples of 4, so the column names and labels section of the array often contains many nulls for padding.
460 | 
461 | The variables LCS and LCP from the `Row Size subheader`_ refer to a text field at the start of the text array (at offset 16|20) in the first Column Text subheader (before the column name strings).  This text field also contains compression information.  The following logic decodes this initial field:
462 | 
463 | 1. If the first 8 bytes of the field are blank, file is not compressed, and set LCS=0.  The Creator PROC step name is the LCP bytes starting at offset 16.
464 | 2. If LCS > 0 (still), the file is not compressed, the first LCS bytes are the Creator Software string (padded with nulls).  Set LCP=0.  Stat/Transfer files use this pattern.
465 | 3. If the first 8 bytes of the field are ``SASYZCRL``, the file is compressed with Run Length Encoding.  The Creator PROC step name is the LCP bytes starting at offset 24.
466 | 4. If the first 8 bytes are nonblank and options 2 or 3 above are not used, this probably indicates COMPRESS=BINARY.  We need test files to confirm this, though.
467 | 
468 | 
469 | Column Name Subheader
470 | ---------------------
471 | 
472 | Column name subheaders contain a sequence of `column name pointers`_ to the offset of each column name **relative to a** `column text subheader`_. There may be multiple column name subheaders, indexing into multiple column text subheaders.
473 | 
474 | =======	======  ======  ====================================================
475 | offset	length	conf.	description
476 | =======	======  ======  ====================================================
477 | 0	4|8	high	int, signature -1 (xFFFFFFFF|xFFFFFFFFFFFFFFFF)
478 | 4|8	2	medium	int, length of remaining subheader (QL - 16|20)
479 | 6|10	2	low	*????????????*
480 | 8|12	2	low	*????????????*
481 | 10|14	2	low	*????????????*
482 | 12|16	8*CMAX	medium	`column name pointers`_ (see below), CMAX=(QL-20|28)/8
483 | MCN	8|12	low	zeros, 12|16 + 8*CMAX := MCN
484 | =======	======  ======  ====================================================
485 | 
486 | Each column name subheader holds CMAX column name pointers. When there are multiple column name subheaders, CMAX will be less than NCOL.
487 | 
488 | Column Name Pointers
489 | ++++++++++++++++++++
490 | 
491 | ======	======  ======  ======================================================
492 | offset	length	conf.	description
493 | ======	======  ======  ======================================================
494 | 0	2	high	int, column name index to select `Column Text Subheader`_
495 | 2	2	high	int, column name offset w.r.t. end of selected Column Text signature.  Always a multiple of 4.
496 | 4	2	high	int, column name length
497 | 6	2	low	zeros
498 | 8		high	Total length of column name pointer
499 | ======	======  ======  ======================================================
500 | 
501 | 
502 | Column Attributes Subheader
503 | ---------------------------
504 | 
505 | The column attribute subheader holds information regarding the column offsets within a data row, the column widths, and the column types (either numeric or character). The column attribute subheader sometimes occurs more than once (in test data). In these cases, column attributes are applied in the order they are parsed.
506 | 
507 | =======	=========	======	===================================================
508 | offset	length		conf.	description
509 | =======	=========	======	===================================================
510 | 0	4|8		high	int, signature -4 (hex xFCFFFFFF|FCFFFFFFFFFFFFFF)
511 | 4|8	2		medium	int, length of remaining subheader
512 | 6|10	2		low	*????????????*
513 | 8|12	2		low	*????????????*
514 | 10|14	2		low	*????????????*
515 | 12|16	LCAV*CMAX	high	`column attribute vectors`_ (see below), CMAX=(QL-20|28)/LCAV, LCAV=12|16 
516 | MCA	8|12		low	MCA = 12|16 + LCAV*CMAX
517 | =======	=========	======	===================================================
518 | 
519 | Column Attribute Vectors 
520 | ++++++++++++++++++++++++
521 | 
522 | ==============  ======  ======  ===============================================
523 | offset		length	conf.	description
524 | ==============  ======  ======  ===============================================
525 | 0		4|8	high	int, column offset in data row (in bytes)
526 | 4|8		4	high	int, column width
527 | 8|12		2	low	name length flag
528 | 10|14		1	high	int, column type (1 = numeric, 2 = character)
529 | 11|15		1	low	*????????????*
530 | 12|16			high	Total length of column attribute vector, LCAV
531 | ==============  ======  ======  ===============================================
532 | 
533 | Observed values of name length flag in the source files:
534 | 
535 | ================  =================================================================
536 | name length flag		description
537 | ================  =================================================================
538 | 4			name length <= 8
539 | 1024			usually means name length <= 8 , but sometimes the length is 9-12
540 | 2048			name length > 8
541 | 2560			name length > 8
542 | ================  =================================================================
543 | 
544 | 
545 | Column Format and Label Subheader
546 | ---------------------------------
547 | 
548 | The column format and label subheader contains pointers to a column format and label **relative to a** `column text subheader`_. Since the column label subheader only contains information regarding a single column, there are typically as many of these subheaders as columns. The structure of column format pointers was contributed by Clint Cummins. 
549 | 
550 | =======	=======	======	===============================================
551 | offset	length	conf.	description
552 | =======	=======	======	===============================================
553 | 0	4|8	high	int, signature -1026 (hex FEFB & 2 or 6 FFs)
554 | 4|8	30|38	low	*????????????*
555 | 34|46	2	high	int, column format index to select `Column Text Subheader`_
556 | 36|48	2	high	int, column format offset w.r.t. end of selected Column Text signature.  A multiple of 4.
557 | 38|50	2	high	int, column format length
558 | 40|52	2	high	int, column label index to select `Column Text Subheader`_
559 | 42|54	2	high	int, column label offset w.r.t. end of selected Column Text signature.  A multiple of 4.
560 | 44|56	2	high	int, column label length
561 | 46|58	6	low	*????????????*
562 | 52|64		medium	Total length of subheader, QL
563 | =======	=======	======	===============================================
564 | 
565 | Column List Subheader
566 | ---------------------
567 | 
568 | The purpose of this subheader is not clear. But the structure is partly identified. Information related to this subheader was contributed by Clint Cummins.  eyecarex (created by Stat/Transfer) does not have this subheader.
569 | 
570 | =======	======	======	===============================================
571 | offset	length	conf.	description
572 | =======	======	======	===============================================
573 | 0	4|8	high	int, signature -2 (hex FE & 3 or 7 FFs)
574 | 4|8	2	low	int, value close to offset in subheader pointer
575 | 6|10	6	low	*????????????* 
576 | 12|16	4|8	medium	int, length of remaining subheader
577 | 16|24	2	low	int, usually equals NCOL
578 | 18|26	2	medium	int, length of column list := CL, usually CL > NCOL
579 | 20|28	2	low	int, usually 1
580 | 22|30	2	low	int, usually equals NCOL
581 | 24|32	2	low	int, usually 3 equal values
582 | 26|34	2	low	int, usually 3 equal values
583 | 28|36	2	low	int, usually 3 equal values
584 | 30|38	2*CL	medium	`column list values`_ (see below)
585 | MCL	8	low	usually zeros, 30|38 + 2*CL := MCL
586 | =======	======	======	===============================================
587 | 
588 | Column List Values
589 | ++++++++++++++++++
590 | 
591 | These values are 2 byte integers, with (CL-NCOL) zero values. Each nonzero value is unique, between -NCOL and NCOL. The significance of signedness and ordering is unknown. The values do not correspond to a sorting order of columns.
592 | 
593 | Compressed Binary Data Subheader
594 | --------------------------------
595 | 
596 | When a SAS7BDAT file is created by SAS with the option COMPRESS=CHAR or COMPRESS=YES, each row of data is compressed independently with a Run Length Encoding (RLE) structure.  This yields a variable length compressed row.  Each such row is stored in a single subheader in sequential order, indexed by the `subheader pointers`_.  A RLE compressed data row is identified by COMP=4 in the subheader pointer, and does not have a subheader signature.  If a particular row had highly variable data and yielded no compression, it is still stored in a subheader, but uncompressed with COMP=0 instead of COMP=4.  The test file ``compress_yes.sas7bdat`` has such highly variable (random) data and all its rows are in this COMP=0 form of subheaders.  It takes up more space than the uncompressed version ``compress_no.sas7bdat``, due to the extra length of the subheader pointers.  The final subheader on a page is usually COMP=1, which indicates a truncated row to be ignored; the complete data row appears on the next page.
597 | 
598 | The SAS option COMPRESS=BINARY apparently uses a RDC (Ross Data Compression) structure instead of RLE.  We need more test files to investigate this structure, and only document RLE at present.
599 | 
600 | Run Length Encoding
601 | +++++++++++++++++++
602 | 
603 | In RLE, the compressed row data is a series of control bytes, each optionally followed by data bytes.  The control byte specifies how the data bytes are interpreted, or is self contained.  The control byte has 2 parts - the upper 4 bits are the Command, and the lower 4 bits are the Length.  Each is a uint in the range 0-15.  For example, control byte 82 (hex) is Command 8 and Length 2, and control byte F4 (hex) is Command 15 (F hex) and Length 4.  We have identified the functions of the 11 different Command values which are observed in the test files.  The RLE structure was contributed by Clint Cummins.
604 | 
605 | =======	======	=============	============================
606 | Command	Length	Name		Function
607 | =======	======	=============	============================
608 | 0	0	Copy64		using the first byte as a uint length L (0-255), Copy the next N=64+L bytes from the input to the output (copies 64 to 319 bytes)
609 | 1	?	?		*????????????*  (not observed in test files)
610 | 2	?	?		*????????????*  (not observed in test files)
611 | 3	?	?		*????????????*  (not observed in test files)
612 | 4	?	?		*????????????*  (not observed in test files)
613 | 5	?	?		*????????????*  (not observed in test files)
614 | 6	0	InsertBlank17	using the first byte as a uint length L, Insert N=17+L blanks (decimal 32, hex 20) in the output (inserts 17 to 272 blanks)
615 | 7	0	InsertZero17	using the first byte as a uint length L, Insert N=17+L zero bytes in the output
616 | 8	L	Copy1		using the Length bits as a uint length L (0-15), Copy the next N=1+L bytes from the input to the output (copies 1 to 16 bytes)
617 | 9	L	Copy17		Copy the next N=17+L bytes from the input to the output (copies 17 to 32 bytes)
618 | 10 (A)	L	Copy33		Copy the next N=33+L bytes from the input to the output (copies 33 to 48 bytes)
619 | 11 (B)	L	Copy49		Copy the next N=49+L bytes from the input to the output (copies 49 to 64 bytes)
620 | 12 (C)	L	InsertByte3	Insert N=3+L copies of the next byte in the output (inserts 3 to 18 bytes)
621 | 13 (D)	L	Insert@2	Insert N=2+L @ (decimal 64, hex 40) bytes in the output (inserts 2 to 17 @ bytes)
622 | 14 (E)	L	InsertBlank2	Insert N=2+L blanks in the output
623 | 15 (F)	L	InsertZero2	Insert N=2+L zero bytes in the output
624 | =======	======	=============	============================
625 | 
626 | The most common Commands in ``obs_all_perf_1.sas7bdat`` are F and 8 (alternating).  This file is entirely 8 byte doubles, so the F commands often handle consecutive zero bytes in zero value doubles.
627 | 
628 | RLE Example 1
629 | +++++++++++++
630 | 
631 | Compressed data row:
632 | 
633 | ``87 A B C D E F G H F2 8A 1 2 3 4 5 6 7 8 9 A B D0 A1 a b c d e f g ... z``
634 | 
635 | ``CB -8-data-bytes-- CB CB --11-data-bytes------ CB CB --34-data-bytes--``
636 | 
637 | ``Copy1              InsertZero2                 Ins Copy33 next 34 bytes``
638 | 
639 | ``Next 8 bytes       4 00h bytes                 2 40h``
640 | 
641 | There are 5 Control Bytes (CB) in the above sequence.
642 | 
643 | 1. 87:  Copy1 next 8 bytes
644 | 2. F2:  InsertZero2 4 00h bytes
645 | 3. 8A:  Copy1 next 11 bytes
646 | 4. D0:  Insert@2 2 40h bytes
647 | 5. A1:  Copy33 next 34 bytes
648 | 
649 | Output uncompressed row:
650 | 
651 | ``A B C D E F G H 00 00 00 00 1 2 3 4 5 6 7 8 9 A B 40 40 a b c ... z``
652 | 
653 | RLE Example 2
654 | +++++++++++++
655 | 
656 | Compressed data row:
657 | 
658 | ``87 A B C D E F G H C1 99 A5 a b c ... z``
659 | 
660 | ``CB -8-data-bytes-- CB ar CB -last-bytes``
661 | 
662 | ``Copy1 8            InsBy Copy33 38 bytes``
663 | 
664 | Control Bytes in Example 2:
665 | 
666 | 1. 87:  Copy1 next 8 bytes
667 | 2. C1,99:  InsertByte3 4 99h bytes
668 | 3. A5:  Copy33 next 38 bytes
669 | 
670 | Output uncompressed row:
671 | 
672 | ``A B C D E F G H 99 99 99 99 a b c ... z``
673 | 
674 | Once a data row is uncompressed, use the `SAS7BDAT Packed Binary Data`_ description below to read the variables.
675 | 
676 | 
677 | 
678 | SAS7BDAT Packed Binary Data
679 | ===========================
680 | 
681 | SAS7BDAT packed binary data are uncompressed, and appear after any subheaders on the page; see the `Page Offset Table`_.  These data are stored by rows, where the size of a row (in bytes) is defined by the `row size subheader`_. When multiple rows occur on a single page, they are immediately adjacent. When a database contains many rows, it is typical that the collection of rows (i.e. their data) is evenly distributed to a number of 'data' pages. However, in test files, no single row's data is broken across two or more pages. A single data row is parsed by interpreting the binary data according to the collection of column attributes contained in the `column attributes subheader`_. Binary data can be interpreted in two ways, as ASCII characters, or as floating point numbers. The column width attribute specifies the number of bytes associated with a column. For character data, this interpretation is straight-forward. For numeric data, interpretation of the column width is more complex.
682 | 
683 | The common binary representation of floating point numbers has three parts; the sign (``s``), exponent (``e``), and mantissa (``m``). The corresponding floating point number is ``s * m * b ^ e``, where ``b`` is the base (2 for binary, 10 for decimal). Under the IEEE 754 floating point standard, the sign, exponent, and mantissa are encoded by 1, 11, and 52 bits respectively, totaling 8 bytes. In a SAS7BDAT file, numeric quantities can be 3, 4, 5, 6, 7, or 8 bytes in length. For numeric quantities of less than 8 bytes, the remaining number of bytes are truncated from the least significant part of the mantissa. Hence, the minimum and maximum numeric values are identical for all byte lengths, but shorter numeric values have reduced precision.
684 | 
685 | Reduction in precision is characterized by ``M``, the largest integer such that it and all smaller integers have an exact representation. At best, all integers greater than ``M`` are approximated to the nearest multiple of ``b``. The table of `numeric binary formats`_ below lists ``M`` values and describes how bits are distributed among the six possible column widths in SAS7BDAT files.
686 | 
687 | Numeric Binary Formats
688 | ----------------------
689 | 
690 | =====     =====  ====  ========  ========  ================
691 | size      bytes  sign  exponent  mantissa  ``M``	
692 | =====     =====  ====  ========  ========  ================
693 | 24bit     3      1     11        12                    8192
694 | 32bit     4      1     11        20                 2097152
695 | 40bit     5      1     11        28               536870912
696 | 48bit     6      1     11        36            137438953472
697 | 56bit     7      1     11        44          35184372088832
698 | 64bit     8      1     11        52        9007199254740992
699 | =====     =====  ====  ========  ========  ================
700 | 
701 | Dates, Currency, and Formatting
702 | -------------------------------
703 | 
704 | Column formatting information is encoded within the `Column Text Subheader`_ and `Column Format and Label Subheader`_. Columns with formatting information have special meaning and interpretation. For example, numeric values may represent dates, encoded as the number of seconds since midnight, January 1, 1960. The format string for fields encoded this way is "DATETIME". Using R, these values may be converted using the as.POSIXct or as.POSIXlt functions with argument ``origin="1960-01-01"``. The most common date format strings correspond to numeric fields, and are interpreted as follows:
705 | 
706 | ========  =======================================  ============
707 | Format    Interpretation                           R Function
708 | ========  =======================================  ============
709 | DATE      Number of days since January 1, 1960     chron::chron
710 | TIME      Number of seconds since midnight         as.POSIXct
711 | DATETIME  Number of seconds since January 1, 1960  as.POSIXct
712 | ========  =======================================  ============
713 | 
714 | There are many additional format strings for numeric and character fields.
715 | 
716 | Platform Differences
717 | ====================
718 | 
719 | The test files referenced in ``data/sas7bdat.sources.RData`` were examined over a period of time. Files with non-Microsoft Windows markings were only observed late into the writing of this document. Consequently (but not intentionally), the SAS7BDAT description above was first deduced for SAS datasets generated on the most commonly observed platform: Microsoft Windows. The extensions to SAS7BDAT files for **u64** and non-Intel formats were contributed a little later by Clint Cummins.
720 | 
721 | In particular, the files ``natlterr1994.sas7bdat``, ``natlterr2006.sas7bdat`` appear to be generated on the 'SunOS' platform (**u64**, non-Intel).  ``txzips.sas7bdat`` was created on a Linux 64-bit SAS server (**u64**, Intel).  ``eyecarex.sas7bdat`` is non-Intel, possibly 32-bit PowerPC.
722 | 
723 | The files ``cfrance2.sas7bdat``, ``cfrance.sas7bdat``, ``coutline.sas7bdat``,  ``gfrance2.sas7bdat``, ``gfrance.sas7bdat``, ``goutline.sas7bdat``, ``xfrance2.sas7bdat``, ``xfrance.sas7bdat``, ``xoutline.sas7bdat`` appear to be generated on a 32-bit 'Linux' Intel system.  They have the same format as Windows files, except for the (ignorable) OS strings in the first header.
724 | 
725 | Text may appear in non-ASCII compatible, partially ASCII compatible, or multi-byte encodings. In particular, Kasper Sorenson discovered some text that appears to be encoded using the Windows-1252 'code page'. 
726 | 
727 | **Key Test Files**
728 | 
729 | =================================	======================================
730 | filename				format features
731 | =================================	======================================
732 | ``acadindx.sas7bdat``			non-u64, Intel (most files are like this one)
733 | ``br.sas7bdat``				truncated doubles (widths 3,4,6; compare with br2 widths all 8)
734 | ``eyecarex.sas7bdat``			non-u64, non-Intel, written by Stat/Transfer
735 | ``txzips.sas7bdat``			u64, Intel
736 | ``natlterr1994.sas7bdat``		u64, non-Intel
737 | ``hltheds2006.sas7bdat``		2 Column Attributes subheaders
738 | ``moshim.sas7bdat``			3 Column Attributes subheaders
739 | ``flightdelays.sas7bdat``		2 Column Text subheaders
740 | ``ymcls_p2_long_040506.sas7bdat``	5 Column Text subheaders, first Column Attributes subheader is on page 6
741 | ``flightschedule.sas7bdat``		2+ Column Text subheaders
742 | ``internationalflight.sas7bdat``	2+ Column Text subheaders
743 | ``marchflights.sas7bdat``		2+ Column Text subheaders
744 | ``mechanicslevel1.sas7bdat``		2+ Column Text subheaders
745 | ``compress_yes.sas7bdat``		COMPRESS=CHAR, one PGTYPE=-28672, no RLE compression (COMP=0)
746 | ``obs_all_perf_1.sas7bdat``		COMPRESS=CHAR, many PGTYPE=16384, much RLE compression (COMP=4)
747 | =================================	======================================
748 | 
749 | 
750 | Compression Data
751 | ================
752 | 
753 | The table below presents the results of compression tests on a collection of 142 SAS7BDAT data files (sources in ``data/``). The 'type' field represents the type of compression, 'ctime' is the compression time (in seconds), 'dtime' is the decompression time, and the 'compression ratio' field holds the cumulative disk usage (in megabytes) before and after compression. Although the ``xz`` algorithm requires significantly more time to compress these data, the decompression time is on par with gzip.
754 | 
755 | =============	======	======	=========================
756 | type		ctime	dtime	compression ratio
757 | =============	======	====== 	=========================
758 | gzip -9		76.7s	2.6s	541M / 30.3M = 17.9
759 | bzip2 -9	92.7s	11.2s	541M / 19.0M = 28.5
760 | xz -9		434.2s	2.7s	541M / 12.8M = 42.3
761 | =============	======	======	=========================
762 | 
763 | 
764 | Software Prototype
765 | ==================
766 | 
767 | The prototype program for reading SAS7BDAT formatted files is implemented entirely in R (see file ``R/sas7bdat.R``). Files not recognized as having been generated under a Microsoft Windows platform are rejected (for now). The implementation of the ``read.sas7bdat`` function should be considered a 'reference implementation', and not one designed with performance in mind.
768 | 
769 | There are certain advantages and disadvantages to developing a prototype of this nature in R.
770 | 
771 | Advantages:
772 | 
773 | 1. R is an interpreted language with a built-in debugger. Hence, experimental routines may be implemented and debugged quickly and interactively, without the need for external compiler or debugger tools (e.g. gcc, gdb).
774 | 2. R programs are portable across a variety of computing platforms. This is especially important in the present context, because manipulating files stored on disk is a platform-specific task. Platform-specific operations are abstracted from the R user.
775 | 
776 | Disadvantages:
777 | 
778 | 1. Manipulating binary (raw) data in R is a relatively new capability. The best tools and practices for binary data operations are not as developed as those for other data types.
779 | 2. Interpreted code is often much less efficient than compiled code. This is not a major disadvantage for prototype implementations because human code development is far less efficient than the R interpreter. Gains made in efficient code development using an interpreted language far outweigh the benefits of compiled languages.
780 | 
781 | Another software implementation was made by Clint Cummins, in the TSP econometrics package (mainly as an independent platform for exploring the format).
782 | 
783 | ToDo
784 | ====
785 | 
786 | - obtain test files which use COMPRESS=BINARY, and develop identification and uncompression procedures
787 | - look for data which will reliably distinguish between structural subheaders (which have one of the known signatures) and uncompressed row data, which may have row data in the signature position that matches one of the known signatures.  Both use COMP=0.  Are NPSHD and NSHPL sufficient to do this?
788 | - obtain test files with more than 2.1 billion (and more than 4.2 billion) data rows, i.e. where 8 byte integer TRC in **u64** is apparently needed.  Do the non-u64 files handle this, with additional fields beyond the 4 byte TRC used for segmentation?  Is TRC a (signed) int or (unsigned) uint?
789 | - identify any SAS7BDAT encryption flag (this is not the same as 'cracking', or breaking encryption); we just identify if a file is encrypted and not readable without a key
790 | - experiment further with 'amendment page' concept
791 | - consider header bytes -by- SAS_host
792 | - check that only one page of type "mix" is observed. If so, insert "In all test cases (``data/sources.csv``), there are exactly zero or one pages of type 'mix'." under the `Page Offset Table`_ header.   [May not be needed, because the BC and SC fields in each Page Offset Table make the `MRC`_ field in the initial header unnecessary.]
793 | - identify all missing value representations: missing numeric values appear to be represented as '0000000000D1FFFF' (nan) for numeric 'double' quantities.
794 | - identify purpose of various unknown header quantities
795 | - determine purpose of Column List subheader
796 | - determine purpose and pattern of 'page sequence signature' fields.  Are they useful?
797 | - identify how non-ASCII encoding is specified
798 | - implement R options to read just header (and subheader) information without data, and an option to read just some data fields, and not all fields.  [The TSP implementation already does this, and can also read a subset of the data rows.]
799 | 


--------------------------------------------------------------------------------