├── .Rbuildignore ├── DESCRIPTION ├── NAMESPACE ├── R └── sas7bdat.R ├── README ├── data └── sas7bdat.sources.RData ├── man ├── read.sas7bdat.Rd └── sas7bdat.sources.Rd └── vignettes ├── reverse-engineering.Rnw ├── rst2Rnw.sh ├── sas7bdat.Rnw └── sas7bdat.rst /.Rbuildignore: -------------------------------------------------------------------------------- 1 | doc/CCNotes 2 | doc/reverse-engineering.Rnw 3 | doc/rst2Rnw.sh 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: sas7bdat 2 | Type: Package 3 | Title: sas7bdat Reverse Engineering Documentation 4 | Version: 0.8 5 | Date: 2024-08-28 6 | Authors@R: c(person("Matt", "Shotwell", role=c("aut", "cre"), 7 | email="matt.shotwell@vanderbilt.edu"), 8 | person("Clint", "Cummins", role="ctb", 9 | email="clint@stanford.edu")) 10 | Maintainer: Matt Shotwell 11 | Description: Documentation and prototypes for the earliest (circa 2010) open-source effort to reverse engineer the sas7bdat file format. The package includes a prototype reader for sas7bdat files. However, newer packages may contain more robust readers for sas7bdat files. 
12 | Depends: R (>= 2.10) 13 | License: GPL (>= 2) 14 | LazyLoad: yes 15 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | export(read.sas7bdat) 2 | importFrom("utils", "download.file") 3 | -------------------------------------------------------------------------------- /R/sas7bdat.R: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2015 Matt Shotwell, VUMC 2 | # 3 | # This program is free software; you can redistribute it and/or modify 4 | # it under the terms of the GNU General Public License as published by 5 | # the Free Software Foundation; either version 2 of the License, or 6 | # (at your option) any later version. 7 | # 8 | # This program is distributed in the hope that it will be useful, 9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 | # GNU General Public License for more details. 12 | # 13 | # You should have received a copy of the GNU General Public License along 14 | # with this program; if not, write to the Free Software Foundation, Inc., 15 | # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 

# Download all files listed in sas7bdat.sources
# ss - data frame of sources; the 'url', 'filename' and 'uncompressed'
#      columns are used
# path - where to save files (created, with parents, if it does not exist)
# max.size - limit on the size of downloaded files (bytes)
# Returns the integer status codes of download.file (0 means success),
# one per downloaded file, named by the row names of the selected rows.
download.sas7bdat.sources <-
function(ss, path=normalizePath("."), max.size=2^20) {
    # don't download zip files or files larger than max.size; the dot is
    # escaped so that only a literal ".zip" suffix is excluded
    ss <- subset(ss, !grepl("\\.zip$", ss$url) & ss$uncompressed < max.size)
    if(!file.exists(path))
        dir.create(path, recursive=TRUE)
    # iterate over row indices (not apply) so that an empty selection
    # downloads nothing; mode="wb" because sas7bdat files are binary
    status <- vapply(seq_len(nrow(ss)), function(i)
        as.integer(download.file(as.character(ss$url[i]),
            file.path(path, as.character(ss$filename[i])), mode="wb")),
        integer(1))
    names(status) <- rownames(ss)
    status
}

# Compress a file on disk
# desc - file path
# type - compression type ("gzip", "bzip2", "xz")
# The compressed copy is written next to the original, with the matching
# extension appended (".gz", ".bz2", ".xz"). Returns the path of the
# compressed file; stops if 'type' is not recognized.
file.compress <- function(desc, type = "gzip") {
    if(type == "gzip") {
        ext <- ".gz"; cfile <- gzfile
    } else if(type == "bzip2") {
        ext <- ".bz2"; cfile <- bzfile
    } else if(type == "xz") {
        ext <- ".xz"; cfile <- xzfile
    } else {
        stop("compression 'type' unrecognized")
    }
    out <- paste(desc, ext, sep="")
    # both connections are closed on every exit path, including errors
    inp <- file(desc, open="rb")
    on.exit(close(inp), add=TRUE)
    oup <- cfile(out, open="wb")
    on.exit(close(oup), add=TRUE)
    # copy in 8 KiB chunks
    while(length(dat <- readBin(inp, "raw", 2^13)) > 0)
        writeBin(dat, oup)
    return(out)
}

# Generate an entry for sas7bdat.sources
# fn - a local file name
# url - url of the file
# Downloads 'url' to 'fn', records the raw and compressed file sizes, and
# attempts to parse the file with read.sas7bdat.
# Returns FALSE when the download fails; otherwise a one-row data frame
# whose 'message' column is "OK" or the parser's error message.
generate.sas7bdat.source <- function(fn, url) {
    # mode="wb": text-mode transfers corrupt binary files on Windows
    dl <- try(download.file(url, fn, mode="wb"))
    if(inherits(dl, "try-error") || dl != 0)
        return(FALSE)
    sz <- file.info(fn)$size
    cat("gzip compress...")
    sz.gz <- file.info(file.compress(fn, "gzip"))$size
    cat("done\nbzip2 compress...")
    sz.bz2 <- file.info(file.compress(fn, "bzip2"))$size
    cat("done\nxz compress...")
    sz.xz <- file.info(file.compress(fn, "xz"))$size
    cat("done\nparsing file...")
    dat <- try(read.sas7bdat(fn))
    cat("done\n")
    if(!inherits(dat, "try-error")) {
        # header fields are attached to the result as attributes
        SAS_release <- attr(dat, 'SAS.release')
        SAS_host    <- attr(dat, 'SAS.host')
        OS_version  <- attr(dat, 'OS.version')
        OS_maker    <- attr(dat, 'OS.maker')
        OS_name     <- attr(dat, 'OS.name')
        endian      <- attr(dat, 'endian')
        winunix     <- attr(dat, 'winunix')
        msg <- "OK"
    } else {
        SAS_release <- ""
        SAS_host    <- ""
        OS_version  <- ""
        OS_maker    <- ""
        OS_name     <- ""
        endian      <- ""
        winunix     <- ""
        # indexing drops the try-error attributes, leaving the message text
        msg <- dat[1]
    }
    data.frame(
        filename = fn, accessed = Sys.time(), uncompressed = sz,
        gzip = sz.gz, bzip2 = sz.bz2, xz = sz.xz, url = url,
        PKGversion = VERSION, message = msg, SASrelease = SAS_release,
        SAShost = SAS_host, OSversion = OS_version, OSmaker = OS_maker,
        OSname = OS_name, endian = endian, winunix = winunix,
        stringsAsFactors=FALSE)
}

# Refresh a single row of sas7bdat.sources
# df - one-row data frame with 'filename' and 'url' columns
# Returns the regenerated row, or 'df' unchanged when the download failed.
update.sas7bdat.source <- function(df) {
    re <- generate.sas7bdat.source(df$filename, df$url)
    # generate.sas7bdat.source signals a failed download with FALSE
    if(is.logical(re))
        return(df)
    return(re)
}


# Update sas7bdat.sources
# ss - data frame of sources; every row is refreshed in turn
update.sas7bdat.sources <- function(ss) {
    # seq_len: an empty data frame is returned untouched
    for(i in seq_len(nrow(ss)))
        ss[i,] <- update.sas7bdat.source(ss[i,])
    return(ss)
}

# NOTE(review): DESCRIPTION declares package version 0.8 while this
# constant says 0.5 - confirm which value should be recorded in output
VERSION   <- "0.5"
BUGREPORT <- "please report bugs to maintainer"
CAUTION   <- "please verify data correctness"

# Subheader 'signatures'
SUBH_ROWSIZE <- as.raw(c(0xF7,0xF7,0xF7,0xF7))
SUBH_COLSIZE <- as.raw(c(0xF6,0xF6,0xF6,0xF6))
SUBH_COLTEXT <- as.raw(c(0xFD,0xFF,0xFF,0xFF))
SUBH_COLATTR <- as.raw(c(0xFC,0xFF,0xFF,0xFF))
SUBH_COLNAME <- as.raw(c(0xFF,0xFF,0xFF,0xFF))
SUBH_COLLABS <- as.raw(c(0xFE,0xFB,0xFF,0xFF))
SUBH_COLLIST <- as.raw(c(0xFE,0xFF,0xFF,0xFF))
SUBH_SUBHCNT <- as.raw(c(0x00,0xFC,0xFF,0xFF))

# Page types
PAGE_META <- 0
PAGE_DATA <- 256        #1<<8
PAGE_MIX  <- c(512,640) #1<<9,1<<9|1<<7
PAGE_AMD  <- 1024       #1<<10
PAGE_METC <- 16384      #1<<14 (compressed data)
PAGE_COMP <- -28672     #~(1<<14|1<<13|1<<12)
PAGE_MIX_DATA <- c(PAGE_MIX, PAGE_DATA)
PAGE_META_MIX_AMD <- c(PAGE_META, PAGE_MIX, PAGE_AMD)
PAGE_ANY  <- c(PAGE_META_MIX_AMD, PAGE_DATA, PAGE_METC, PAGE_COMP)

# Map a numeric page type code onto a short label
# ('meta', 'data', 'mix', 'amd', or 'unknown' for anything else)
page_type_strng <- function(type) {
    if(type %in% PAGE_META)
        return('meta')
    if(type %in% PAGE_DATA)
        return('data')
    if(type %in% PAGE_MIX)
        return('mix')
    if(type %in% PAGE_AMD)
        return('amd')
    return('unknown')
}

# Extract the subheaders referenced by one page
# page - list with 'page', 'type', 'subh_count' and 'data' (raw) elements
# u64 - TRUE for the 64-bit file layout
# Returns a list with one entry per subheader pointer (page, offset,
# length and, for non-empty subheaders, raw bytes and 4-byte signature).
# Pages that are not of type meta, mix or amd yield an empty list.
read_subheaders <- function(page, u64) {
    subhs <- list()
    if(!(page$type %in% PAGE_META_MIX_AMD))
        return(subhs)
    # page offset of subheader pointers
    oshp <- if(u64) 40 else 24
    # length of subheader pointers
    lshp <- if(u64) 24 else 12
    # length of first two subheader fields
    lshf <- if(u64) 8 else 4
    # seq_len: a page without subheaders must not be parsed at all
    # (1:0 would visit the indices 1 and 0 and read garbage pointers)
    for(i in seq_len(max(0, page$subh_count))) {
        base <- oshp + (i - 1) * lshp
        subh <- list()
        subh$page   <- page$page
        subh$offset <- read_int(page$data, base, lshf)
        subh$length <- read_int(page$data, base + lshf, lshf)
        if(subh$length > 0) {
            subh$raw <- read_raw(page$data, subh$offset, subh$length)
            subh$signature <- read_raw(subh$raw, 0, 4)
        }
        subhs[[i]] <- subh
    }
    return(subhs)
}

# Read the column names
# col_name - column name subheaders
# col_text - column text subheaders (hold the actual strings)
# u64 - TRUE for the 64-bit file layout
# Returns a list with one list(name=...) element per column.
read_column_names <- function(col_name, col_text, u64) {
    names <- list()
    name_count <- 0
    # strings are addressed relative to the text subheader signature
    offp <- if(u64) 8 else 4
    for(subh in col_name) {
        # number of 8-byte name pointers in this subheader
        cmax <- (subh$length - if(u64) 28 else 20)/8
        for(i in seq_len(max(0, floor(cmax)))) {
            name_count <- name_count + 1
            base <- (if(u64) 16 else 12) + (i-1) * 8
            # index of text subheader, offset and length of the name
            hdr <- read_int(subh$raw, base, 2)
            off <- read_int(subh$raw, base + 2, 2)
            len <- read_int(subh$raw, base + 4, 2)
            names[[name_count]] <- list(
                name = read_str(col_text[[hdr+1]]$raw, off + offp, len))
        }
    }
    return(names)
}

# Read the column formats and labels
# col_labs - column format/label subheaders (one per column)
# col_text - column text subheaders (hold the actual strings)
# u64 - TRUE for the 64-bit file layout
# Returns NULL when there are no such subheaders; otherwise a list with,
# per column, the text references (fhdr/foff/flen, lhdr/loff/llen) and,
# when the referenced string is non-empty, 'format' and/or 'label'.
read_column_labels_formats <- function(col_labs, col_text, u64) {
    if(length(col_labs) < 1)
        return(NULL)
    offp <- if(u64) 8 else 4
    # read one (text subheader index, offset, length) triple of 2-byte ints
    read_ref <- function(raw, base)
        list(hdr = read_int(raw, base, 2),
             off = read_int(raw, base + 2, 2),
             len = read_int(raw, base + 4, 2))
    # fetch the string a reference points to
    deref <- function(ref)
        read_str(col_text[[ref$hdr+1]]$raw, ref$off + offp, ref$len)
    labs <- vector("list", length(col_labs))
    for(i in seq_along(col_labs)) {
        lab <- list()
        fmt <- read_ref(col_labs[[i]]$raw, if(u64) 46 else 34)
        if(fmt$len > 0)
            lab$format <- deref(fmt)
        lab$fhdr <- fmt$hdr
        lab$foff <- fmt$off
        lab$flen <- fmt$len
        lbl <- read_ref(col_labs[[i]]$raw, if(u64) 52 else 40)
        if(lbl$len > 0)
            lab$label <- deref(lbl)
        lab$lhdr <- lbl$hdr
        lab$loff <- lbl$off
        lab$llen <- lbl$len
        labs[[i]] <- lab
    }
    return(labs)
}

# Read the column attributes
# col_attr - column attribute subheaders
# u64 - TRUE for the 64-bit file layout
# Returns a list with, per column, the 'offset' and 'length' read from
# the attribute vector and the 'type' ("numeric" when the type byte is 1,
# "character" otherwise).
read_column_attributes <- function(col_attr, u64) {
    info <- list()
    info_ct <- 0
    # length of one column attribute vector
    lcav <- if(u64) 16 else 12
    for(subh in col_attr) {
        cmax <- (subh$length - if(u64) 28 else 20)/lcav
        for(i in seq_len(max(0, floor(cmax)))) {
            info_ct <- info_ct + 1
            base <- lcav + (i-1) * lcav
            attrs <- list()
            attrs$offset <- read_int(subh$raw, base,
                                     if(u64) 8 else 4)
            attrs$length <- read_int(subh$raw,
                                     base + if(u64) 8 else 4,
                                     4)
            type_code <- read_int(subh$raw,
                                  base + if(u64) 14 else 10,
                                  1)
            attrs$type <- if(type_code == 1) "numeric" else "character"
            info[[info_ct]] <- attrs
        }
    }
    return(info)
}

# Magic number
MAGIC <- as.raw(c(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
                  0x0, 0x0, 0x0, 0x0, 0xc2,0xea,0x81,0x60,
                  0xb3,0x14,0x11,0xcf,0xbd,0x92,0x8, 0x0,
                  0x9, 0xc7,0x31,0x8c,0x18,0x1f,0x10,0x11))

# TRUE when 'data' (a raw vector) starts with the magic number
check_magic_number <- function(data)
    identical(data[seq_along(MAGIC)], MAGIC)

# These functions utilize offset + length addressing
# buf - raw vector, off - zero-based byte offset, len - number of bytes
# '...' is passed on to readBin (e.g. endian=)
# off + seq_len(len) selects no bytes at all when len is 0, whereas
# (off+1):(off+len) would select two
read_bin <- function(buf, off, len, type, ...)
    readBin(buf[off + seq_len(len)], type, 1, len, ...)
read_raw <- function(buf, off, len, ...)
    readBin(buf[off + seq_len(len)], "raw", len, 1, ...)
read_int <- function(buf, off, len, ...)
    read_bin(buf, off, len, "integer", ...)
read_str <- function(buf, off, len, ...)
    read_bin(buf, off, len, "character", ...)
read_flo <- function(buf, off, len, ...)
    read_bin(buf, off, len, "double", ...)

# Select the subheaders that carry a given signature
# subhs - list of subheaders as returned by read_subheaders
# signature - raw vector to match against each subheader's 'signature'
get_subhs <- function(subhs, signature) {
    # vapply keeps the result logical even when 'subhs' is empty
    keep <- vapply(subhs, function(subh) {
        identical(subh$signature, signature)
    }, logical(1))
    subhs[keep]
}

# Sometimes there is more than one column attribute subheader.
# In these cases, the column attribute data are spliced together
# so that they appear to have been in the same subheader
# col_attr - list of column attribute subheaders ('raw' and 'length')
# Returns list(raw=...): the first subheader without its last 8 bytes,
# followed by bytes 12 .. length-8 of every further subheader.
# NOTE(review): the offsets 12/20 look like the 32-bit layout; confirm
# before using this on u64 files.
splice_col_attr_subheaders <- function(col_attr) {
    raw <- read_raw(col_attr[[1]]$raw, 0, col_attr[[1]]$length - 8)
    # [-1] skips the first subheader and is empty when there is only one
    for(i in seq_along(col_attr)[-1])
        raw <- c(raw, read_raw(col_attr[[i]]$raw, 12,
                               col_attr[[i]]$length - 20))
    return(list(raw=raw))
}

# Read a sas7bdat file into a data frame
# file - path (or URL) of the file, or a connection opened for reading
# encoding - encoding to declare for character data (see ?Encoding)
# debug - if TRUE, attach this function's environment as attribute 'debug'
# Returns a data frame with the attributes 'pkg.version', 'column.info',
# 'date.created', 'date.modified', 'SAS.release', 'SAS.host',
# 'OS.version', 'OS.maker', 'OS.name', 'endian' and 'winunix'.
# Stops on big endian files, compressed data and malformed input; warns
# when the number of records found differs from the header's row count.
read.sas7bdat <- function(file, encoding="", debug=FALSE) {
    if(inherits(file, "connection") && isOpen(file, "read")) {
        # caller owns the connection; leave it open
        con <- file
    } else if (is.character(file)) {
        con <- file(file, "rb")
        # we opened it, so close it on every exit path, including errors
        on.exit(close(con), add=TRUE)
    } else {
        stop("invalid 'file' argument")
    }

    # Check magic number
    header <- readBin(con, "raw", 288, 1)
    if(length(header) < 288)
        stop("header too short (not a sas7bdat file?)")
    if(!check_magic_number(header))
        stop(paste("magic number mismatch", BUGREPORT))

    # Check for 32 or 64 bit alignment
    align1 <- if(identical(read_raw(header, 32, 1), as.raw(0x33))) 4 else 0

    # If align1 == 4, file is u64 type
    u64 <- align1 == 4

    align2 <- if(identical(read_raw(header, 35, 1), as.raw(0x33))) 4 else 0

    if(!identical(read_raw(header, 37, 1), as.raw(0x01)))
        stop("big endian files are not supported")
    endian <- "little"

    winunix <- read_str(header, 39, 1)
    if(identical(winunix, "1")) {
        winunix <- "unix"
    } else if(identical(winunix, "2")) {
        winunix <- "windows"
    } else {
        winunix <- "unknown"
    }

    # Timestamp is epoch 01/01/1960
    epoch <- as.POSIXct("1960/01/01", format="%Y/%m/%d")
    datecreated  <- read_flo(header, 164+align1, 8) + epoch
    datemodified <- read_flo(header, 172+align1, 8) + epoch

    # Read the remaining header
    header_length <- read_int(header, 196 + align2, 4)
    header <- c(header, readBin(con, "raw", header_length-288, 1))
    if(length(header) < header_length)
        stop("header too short (not a sas7bdat file?)")

    page_size <- read_int(header, 200 + align2, 4)
    if(page_size < 0)
        stop(paste("page size is negative", BUGREPORT))

    page_count <- read_int(header, 204 + align2, 4)
    if(page_count < 1)
        stop(paste("page count is not positive", BUGREPORT))

    SAS_release <- read_str(header, 216 + align1 + align2, 8)

    # SAS_host is a 16 byte field, but only the first eight are used
    # FIXME: It would be preferable to eliminate this check
    SAS_host <- read_str(header, 224 + align1 + align2, 8)

    OS_version <- read_str(header, 240 + align1 + align2, 16)
    OS_maker   <- read_str(header, 256 + align1 + align2, 16)
    OS_name    <- read_str(header, 272 + align1 + align2, 16)

    # Read pages
    pages <- vector("list", page_count)
    for(page_num in seq_len(page_count)) {
        pg <- list()
        pg$page <- page_num
        pg$data <- readBin(con, "raw", page_size, 1)
        pg$type <- read_int(pg$data, if(u64) 32 else 16, 2)
        pg$type_strng <- page_type_strng(pg$type)
        pg$blck_count <- read_int(pg$data, if(u64) 34 else 18, 2)
        pg$subh_count <- read_int(pg$data, if(u64) 36 else 20, 2)
        pages[[page_num]] <- pg
    }

    # Read all subheaders
    subhs <- list()
    for(pg in pages)
        subhs <- c(subhs, read_subheaders(pg, u64))

    # Parse row size subheader
    row_size <- get_subhs(subhs, SUBH_ROWSIZE)
    if(length(row_size) != 1)
        stop(paste("found", length(row_size),
            "row size subheaders where 1 expected", BUGREPORT))
    row_size <- row_size[[1]]
    # width of the integer fields in this subheader
    wint <- if(u64) 8 else 4
    row_length   <- read_int(row_size$raw, if(u64)  40 else 20, wint)
    row_count    <- read_int(row_size$raw, if(u64)  48 else 24, wint)
    col_count_p1 <- read_int(row_size$raw, if(u64)  72 else 36, wint)
    col_count_p2 <- read_int(row_size$raw, if(u64)  80 else 40, wint)
    # number of rows on the first mix page
    row_count_fp <- read_int(row_size$raw, if(u64) 120 else 60, wint)

    # Parse col size subheader
    col_size <- get_subhs(subhs, SUBH_COLSIZE)
    if(length(col_size) != 1)
        stop(paste("found", length(col_size),
            "column size subheaders where 1 expected", BUGREPORT))
    col_size <- col_size[[1]]
    col_count_6 <- read_int(col_size$raw, wint, wint)
    col_count <- col_count_6

    #if((col_count_p1 + col_count_p2) != col_count_6)
    #    warning(paste("column count mismatch" , CAUTION))

    # Read column information
    col_text <- get_subhs(subhs, SUBH_COLTEXT)
    if(length(col_text) < 1)
        stop(paste("no column text subheaders found", BUGREPORT))

    # Test for COMPRESS=CHAR compression
    # This test is done earlier at the page level
    #if("SASYZCRL" == read_str(col_text[[1]]$raw, 16, 8))
    #    stop(paste("file uses unsupported CHAR compression"))

    col_attr <- get_subhs(subhs, SUBH_COLATTR)
    if(length(col_attr) < 1)
        stop(paste("no column attribute subheaders found", BUGREPORT))

    col_attr <- read_column_attributes(col_attr, u64)
    if(length(col_attr) != col_count)
        stop(paste("found", length(col_attr),
            "column attributes where", col_count,
            "expected", BUGREPORT))

    col_name <- get_subhs(subhs, SUBH_COLNAME)
    if(length(col_name) < 1)
        stop(paste("no column name subheaders found", BUGREPORT))

    col_name <- read_column_names(col_name, col_text, u64)
    if(length(col_name) != col_count)
        stop(paste("found", length(col_name),
            "column names where", col_count, "expected", BUGREPORT))

    # Make column names unique, if not already
    col_name_uni <- make.unique(vapply(col_name, function(x) x$name,
                                       character(1)))
    for(i in seq_along(col_name_uni))
        col_name[[i]]$name <- col_name_uni[i]

    col_labs <- get_subhs(subhs, SUBH_COLLABS)
    col_labs <- read_column_labels_formats(col_labs, col_text, u64)
    # no format/label subheaders: use one empty placeholder per column
    # (list(length=col_count) would build a single-element list instead)
    if(is.null(col_labs))
        col_labs <- vector("list", col_count)
    if(length(col_labs) != col_count)
        stop(paste("found", length(col_labs),
            "column formats and labels", col_count, "expected", BUGREPORT))

    # Collate column information
    col_info <- vector("list", col_count)
    for(i in seq_len(col_count))
        col_info[[i]] <- c(col_name[[i]], col_attr[[i]], col_labs[[i]])

    # Check pages for known type
    for(pg in pages) {
        if(!(pg$type %in% PAGE_ANY))
            stop(paste("page", pg$page, "has unknown type:",
                pg$type, BUGREPORT))
        if(pg$type %in% c(PAGE_METC, PAGE_COMP))
            stop("file contains compressed data")
    }

    # Parse data
    data <- list()
    for(col in col_info)
        if(col$length > 0)
            data[[col$name]] <- vector(col$type, length=row_count)

    row <- 0
    for(page in pages) {
        #FIXME are there data on pages of type 4?
        if(!(page$type %in% PAGE_MIX_DATA))
            next
        base <- (if(u64) 32 else 16) + 8
        if(page$type %in% PAGE_MIX) {
            row_count_p <- row_count_fp
            # skip subheader pointers
            base <- base + page$subh_count * if(u64) 24 else 12
            base <- base + base %% 8
        } else {
            row_count_p <- read_int(page$data, if(u64) 34 else 18, 2)
        }
        # round up to 8-byte boundary
        base <- ((base+7) %/% 8) * 8 + base %% 8
        if(row_count_p > row_count)
            row_count_p <- row_count
        # seq_len: a page holding no rows contributes nothing, whereas
        # (row+1):(row+0) would read one phantom row
        for(k in seq_len(max(0, row_count_p))) {
            row <- row + 1
            for(col in col_info) {
                if(col$length > 0) {
                    raw <- read_raw(page$data, base + col$offset,
                                    col$length)
                    if(col$type == "numeric" && col$length < 8) {
                        # truncated doubles lack their low-order bytes;
                        # pad with zeros to restore the full 8 bytes
                        raw <- c(as.raw(rep(0x00, 8 - col$length)),raw)
                        col$length <- 8
                    }
                    data[[col$name]][row] <- readBin(raw, col$type, 1,
                                                     col$length)
                    if(col$type == "character") {
                        # Apply encoding
                        Encoding(data[[col$name]][row]) <- encoding
                        # Strip beginning and trailing spaces
                        data[[col$name]][row] <- gsub('^ +| +$', '',
                            data[[col$name]][row])
                    }
                }
            }
            base <- base + row_length
        }
    }

    if(row != row_count)
        warning(paste("found", row, "records where", row_count,
            "expected", BUGREPORT))

    data <- as.data.frame(data)
    attr(data, 'pkg.version')   <- VERSION
    attr(data, 'column.info')   <- col_info
    attr(data, 'date.created')  <- datecreated
    attr(data, 'date.modified') <- datemodified
    attr(data, 'SAS.release')   <- SAS_release
    attr(data, 'SAS.host')      <- SAS_host
    attr(data, 'OS.version')    <- OS_version
    attr(data, 'OS.maker')      <- OS_maker
    attr(data, 'OS.name')       <- OS_name
    attr(data, 'endian')        <- endian
    attr(data, 'winunix')       <- winunix
    # environment() is this call's own frame; sys.frame(1) would be the
    # outermost caller whenever read.sas7bdat is called from a function
    if(debug)
        attr(data, 'debug') <- environment()
    return(data)
}

-------------------------------------------------------------------------------- /README: -------------------------------------------------------------------------------- 1 | This package contains documents and software related to a compatibility study of the SAS7BDAT database file format. The 'data/sas7bdat.sources.RData' file references a collection of SAS7BDAT database files freely available from internet resources. The data are not redistributed due to licensing concerns. Files in the collection are used for testing and investigating the SAS7BDAT file format. 2 | 3 | The included vignette documents various aspects of the compatibility study, including a detailed description of the binary structure of SAS7BDAT formatted databases. 4 | 5 | The R/ directory contains R (www.r-project.org) code implementing a prototype SAS7BDAT file reader. 6 | -------------------------------------------------------------------------------- /data/sas7bdat.sources.RData: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/BioStatMatt/sas7bdat/46418b2892d8f30b55ef497f1eb81e9db4822076/data/sas7bdat.sources.RData -------------------------------------------------------------------------------- /man/read.sas7bdat.Rd: -------------------------------------------------------------------------------- 1 | \name{read.sas7bdat} 2 | \alias{read.sas7bdat} 3 | \title{ 4 | SAS Database Reader (experimental) 5 | } 6 | \description{ 7 | Read SAS files in the sas7bdat data format. 8 | } 9 | \usage{ 10 | read.sas7bdat(file, encoding="", debug=FALSE) 11 | } 12 | \arguments{ 13 | \item{file}{character: Path to a file or an URL.} 14 | \item{encoding}{character: Character encoding for strings} 15 | \item{debug}{logical: Save function environment as attribute of returned object.} 16 | } 17 | \section{Warning}{ 18 | The functionality in this package is EXPERIMENTAL. Use at your own risk. 
For the latest details, see the \sQuote{sas7bdat} vignette (\emph{i.e.}, \code{vignette('sas7bdat')}). 19 | } 20 | \value{ 21 | A data frame corresponding to the SAS database. The returned data frame has an \code{column.info} attribute and other attributes that contain additional information about each field in the data frame, respectively. The \code{column.info} attribute is a list of lists, containing each of the following: 22 | \item{name}{The field name} 23 | \item{offset}{The field offset in packed binary row data (bytes)} 24 | \item{length}{The field length (bytes)} 25 | \item{type}{The field type, either 'character' or 'numeric'} 26 | When the database specifies a field format and/or label, the following may also be present: 27 | \item{format}{The field display format} 28 | \item{label}{The field label (usually a longer description)} 29 | } 30 | \references{ 31 | http://biostatmatt.com/archives/tag/sas7bdat 32 | } 33 | \author{ 34 | Matt Shotwell 35 | } 36 | 37 | \examples{ 38 | ## see \code{data(sas7bdat.sources)} 39 | } 40 | -------------------------------------------------------------------------------- /man/sas7bdat.sources.Rd: -------------------------------------------------------------------------------- 1 | \name{sas7bdat.sources} 2 | \alias{sas7bdat.sources} 3 | \docType{data} 4 | \title{ 5 | Internet SAS Database Resources 6 | } 7 | \description{ 8 | These data are a collection of internet resources for SAS database files in the sas7bdat format. 
9 | } 10 | \usage{data(sas7bdat.sources)} 11 | \format{ 12 | A data frame with records on the following fields: 13 | \describe{ 14 | \item{\code{filename}}{character, the SAS database filename} 15 | \item{\code{accessed}}{POSIXct, the date last retrieved} 16 | \item{\code{uncompressed}}{numeric, file size (bytes)} 17 | \item{\code{gzip}}{numeric, gzip compressed file size (bytes)} 18 | \item{\code{bzip2}}{numeric, bzip2 compressed file size (bytes)} 19 | \item{\code{xz}}{numeric, xz compressed file size (bytes)} 20 | \item{\code{url}}{character, the Universal Resource Locator} 21 | \item{\code{PKGversion}}{character, the \pkg{sas7bdat} package version} 22 | \item{\code{message}}{character, message returned by \code{read.sas7bdat} (if any)} 23 | \item{\code{SASrelease}}{character, SAS release} 24 | \item{\code{SAShost}}{character, SAS host platform} 25 | \item{\code{OSversion}}{character, OS version} 26 | \item{\code{OSmaker}}{character, OS maker} 27 | \item{\code{OSname}}{character, OS name} 28 | \item{\code{endianness}}{character, endianness of header fields} 29 | \item{\code{winunix}}{character, platform type} 30 | } 31 | } 32 | \examples{ 33 | data(sas7bdat.sources) 34 | } 35 | \keyword{datasets} 36 | -------------------------------------------------------------------------------- /vignettes/reverse-engineering.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass{article} 2 | \usepackage{cmap} % fix search and cut-and-paste in Acrobat 3 | \usepackage{ifthen} 4 | \usepackage[T1]{fontenc} 5 | \usepackage[utf8]{inputenc} 6 | \usepackage{alltt} 7 | \setcounter{secnumdepth}{0} 8 | \usepackage{longtable,ltcaption,array} 9 | \setlength{\extrarowheight}{2pt} 10 | \newlength{\DUtablewidth} % internal use in tables 11 | \usepackage{natbib} 12 | %%% Fallback definitions for Docutils-specific commands 13 | 14 | % Provide a length variable and set default, if it is new 15 | \providecommand*{\DUprovidelength}[2]{ 16 | 
\ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{} 17 | } 18 | 19 | % line block environment 20 | \DUprovidelength{\DUlineblockindent}{2.5em} 21 | \ifthenelse{\isundefined{\DUlineblock}}{ 22 | \newenvironment{DUlineblock}[1]{% 23 | \list{}{\setlength{\partopsep}{\parskip} 24 | \addtolength{\partopsep}{\baselineskip} 25 | \setlength{\topsep}{0pt} 26 | \setlength{\itemsep}{0.15\baselineskip} 27 | \setlength{\parsep}{0pt} 28 | \setlength{\leftmargin}{#1}} 29 | \raggedright 30 | } 31 | {\endlist} 32 | }{} 33 | 34 | \ifthenelse{\isundefined{\hypersetup}}{ 35 | \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue]{hyperref} 36 | \usepackage{bookmark} 37 | \urlstyle{same} % normal text font (alternatives: tt, rm, sf) 38 | }{} 39 | \hypersetup{ 40 | pdftitle={Tools and Strategies for Reverse Engineering the Format of Statistical Data Files (Draft)}, 41 | } 42 | \begin{document} 43 | 44 | \title{Tools and Strategies for Reverse Engineering the Format of Statistical Data Files (Draft)} 45 | \author{} 46 | \date{} 47 | \maketitle 48 | 49 | by: 50 | 51 | \begin{quote} 52 | \begin{DUlineblock}{0em} 53 | \item[] Matthew S. Shotwell, PhD 54 | \item[] Associate Professor 55 | \item[] Department of Biostatistics 56 | \item[] Vanderbilt University 57 | \item[] \href{mailto:matt.shotwell@vanderbilt.edu}{matt.shotwell@vanderbilt.edu} 58 | \end{DUlineblock} 59 | \end{quote} 60 | \section{Introduction} 61 | Reverse engineering is much like forensics. At some point in the past, the process was conceived and implemented. In the present, the input and output are observable, but the process itself is a {\it black box}, that is, the details of its design and implementation have been lost, or concealed. Forensics, and reverse engineering attempt to recover the details. 62 | 63 | %Rewards and risks 64 | The risks of a reverse engineering venture may be considerable. Indeed, the principal risk is failure to recover the details of a process under study. 
This is compounded by the consequences that may arise regardless of success or failure. For instance, merely the attempt to reverse engineer certain computer software may be in violation of the associated end-user license agreement. In an extreme example, an attempt to ``crack'' encrypted communications may be illegal. 65 | 66 | %Baggerly and Coombes "forensic bioinformatics" 67 | 68 | %Statistical data files whose format descriptions are distributed under commercial license, or otherwise unpublished, impose a barrier to reproducible research. Because such formats generally require the user to purchase and learn to operate an associated software package, the barriers are financial and practical in nature. In the worst case, the necessary software may cease to be supported, or become unavailable, rendering formatted data inaccessible. 69 | 70 | %This article aims to facilitate the process of reverse engineering to identify the format of statistical data files. This content is appropriate for readers with programming skill that is typical of most statisticians. Indeed, the statistical programming environment R provides a familiar and useful framework for developing a reverse engineering toolkit. 71 | 72 | \section{Prerequisites} 73 | 74 | This discussion assumes familiarity with some modern computer concepts, such as file input and output, and how computers store data in bits and bytes. References are given throughout for further reading on key concepts. 75 | 76 | Statistical data files are regular computer files that contain structured data, such as a table of records and fields. A data file may also contain metadata, such as a record count or field labels. A file {\it format} is a specification that determines how structured data and metadata are organized into a computer file. Logically, a computer file is simply a sequence of eight-bit binary values (bytes). 
Hence, a file format describes how structured data ({\it e.g.} integers, text) are (1) represented and (2) serialized in a sequence of bytes. 77 | 78 | Formatted data may be human readable, that is, consisting of bytes that are interpreted as character strings ({\it e.g.}, comma separated values). Encoded data that are not human readable are generically said to have 'binary' formatting ({\it e.g.,} XBase and dBase formats, \cite{XBase2010}). 79 | 80 | %@misc{XBase2010, 81 | % author = {Erik Bachmann}, 82 | % title = {XBase (and dBase) File Format Description}, 83 | % publisher = {Clickety Click Software}, 84 | % year = {2010}, 85 | % note = {This is an electronic document. Date retrieved: August 12, 2011.}, 86 | % url = {http://www.clicketyclick.dk/databases/xbase/format/index.html} 87 | %} 88 | 89 | 90 | % Perhaps reference Knuth, The Art of Computer Programming (no, don't think this reference is useful) 91 | 92 | %Although \citep{Goldberg}'s popular discussion of computer representations of floating-point numbers is primarily concerned with rounding errors, the article introduces the topic nicely. 
93 | 94 | \section{Basics} 95 | %counting: decimal, hex, octal, binary 96 | %macro structure: header-data paradigm, offset-length 97 | 98 | 99 | %@article{Goldberg1991, 100 | % author = {Goldberg, David}, 101 | % title = {What every computer scientist should know about floating-point arithmetic}, 102 | % journal = {ACM Computing Surveys}, 103 | % number = {1}, 104 | % pages = {5--48}, 105 | % volume = {23}, 106 | % year = {1991} 107 | %} 108 | 109 | %visual and automated detection of floating point data 110 | %%floating point representations 111 | %%statistical properties 112 | 113 | %visual and automated detection of encoded character data 114 | %%character representations 115 | 116 | %alignment issues 117 | %%C structures can have different sizes, depending on the compiler and platform 118 | 119 | \section{The Knockout Strategy} 120 | %This is the strategy of modifying (breaking) fields that have unknown meaning, and then attempting to open the file using software design to open the file. The software may reveal the field's purpose. 121 | 122 | \section{Deducing Field Width Using Endianness} 123 | When a binary field encodes a multi-byte quantity, it may not be clear how many bytes contribute to the value. For instance, suppose that a multi-byte, unsigned integer field is suspected to be four bytes in length, but is only observed for values less than or equal to $2^{16}-1$. In this case, it is possible that the field is only two bytes in length, and the remaining bytes constitute a separate two-byte field. 124 | 125 | If the suspect field is subsequently observed in the opposite endianness, the field length becomes clear. To illustrate, consider the four bytes (in hexadecimal representation) {\tt 01 00 00 00}. Then suppose we observe the same field in opposite endianness. 
There are several possibilities: 1) {\tt 01 00 00 00} - the field is single byte, 2) {\tt 00 01 00 00} - the field is two-byte, 3) {\tt 00 00 01 00} - the field is three-byte, and 4) {\tt 00 00 00 01} - the field is four-byte. 126 | 127 | If there is concern that bytes at a particular offset may form incomplete parts of adjacent fields, then this test for field width may be misleading. If the byte values observed in opposite endianness cannot be obtained by reordering the original byte values, then these bytes must span two adjacent fields. %if you discover that some files are big endian, this can be used to confirm the length of multi-byte fields that were discovered in little-endian format. 128 | \section{Legality} 129 | Some material from: 130 | https://www.eff.org/issues/coders/reverse-engineering-faq 131 | 132 | Among intellectual property laws, including Copyright and fair use (17 U.S.C. 107), DMCA (17 U.S.C. section 1201), trade secrets (Uniform Trade Secrets Act with amendments 1985), Contract law (EULA, TOS, TOU, NDA, developer or API agreement), Electronic Communications Privacy Act (18 U.S.C. 2510 et seq.), trade secret law most directly impacts reverse engineering of statistical data files (since copyright (of code), cracking, are not involved, but be careful about TOS and EULAs). However, the UTSA makes explicit that reverse engineering is a {\it proper} means.
133 | 134 | \end{document} 135 | -------------------------------------------------------------------------------- /vignettes/rst2Rnw.sh: -------------------------------------------------------------------------------- 1 | rst2latex sas7bdat.rst > sas7bdat.tex 2 | sed -re '/^%%% User specified packages and stylesheets/a \ 3 | \\usepackage{fullpage}\ 4 | \\usepackage{Sweave}\ 5 | %\\VignetteIndexEntry{sas7bdat}' sas7bdat.tex > sas7bdat.Rnw 6 | rm sas7bdat.tex 7 | -------------------------------------------------------------------------------- /vignettes/sas7bdat.Rnw: -------------------------------------------------------------------------------- 1 | \documentclass[a4paper]{article} 2 | % generated by Docutils 3 | \usepackage{cmap} % fix search and cut-and-paste in Acrobat 4 | \usepackage{ifthen} 5 | \usepackage[T1]{fontenc} 6 | \usepackage[utf8]{inputenc} 7 | \usepackage{alltt} 8 | \setcounter{secnumdepth}{0} 9 | \usepackage{longtable,ltcaption,array} 10 | \setlength{\extrarowheight}{2pt} 11 | \newlength{\DUtablewidth} % internal use in tables 12 | 13 | %%% Custom LaTeX preamble 14 | % PDF Standard Fonts 15 | \usepackage{mathptmx} % Times 16 | \usepackage[scaled=.90]{helvet} 17 | \usepackage{courier} 18 | 19 | %%% User specified packages and stylesheets 20 | \usepackage{fullpage} 21 | \usepackage{Sweave} 22 | %\VignetteIndexEntry{sas7bdat} 23 | 24 | %%% Fallback definitions for Docutils-specific commands 25 | 26 | % Provide a length variable and set default, if it is new 27 | \providecommand*{\DUprovidelength}[2]{ 28 | \ifthenelse{\isundefined{#1}}{\newlength{#1}\setlength{#1}{#2}}{} 29 | } 30 | 31 | % line block environment 32 | \DUprovidelength{\DUlineblockindent}{2.5em} 33 | \ifthenelse{\isundefined{\DUlineblock}}{ 34 | \newenvironment{DUlineblock}[1]{% 35 | \list{}{\setlength{\partopsep}{\parskip} 36 | \addtolength{\partopsep}{\baselineskip} 37 | \setlength{\topsep}{0pt} 38 | \setlength{\itemsep}{0.15\baselineskip} 39 | \setlength{\parsep}{0pt} 40 | 
\setlength{\leftmargin}{#1}} 41 | \raggedright 42 | } 43 | {\endlist} 44 | }{} 45 | 46 | % hyperlinks: 47 | \ifthenelse{\isundefined{\hypersetup}}{ 48 | \usepackage[colorlinks=true,linkcolor=blue,urlcolor=blue]{hyperref} 49 | \usepackage{bookmark} 50 | \urlstyle{same} % normal text font (alternatives: tt, rm, sf) 51 | }{} 52 | \hypersetup{ 53 | pdftitle={SAS7BDAT Database Binary Format}, 54 | } 55 | 56 | %%% Body 57 | \begin{document} 58 | \title{SAS7BDAT Database Binary Format% 59 | \label{sas7bdat-database-binary-format}} 60 | \author{} 61 | \date{} 62 | \maketitle 63 | 64 | by: 65 | 66 | \begin{quote} 67 | \begin{DUlineblock}{0em} 68 | \item[] Matthew S. Shotwell, PhD 69 | \item[] Assistant Professor 70 | \item[] Department of Biostatistics 71 | \item[] Vanderbilt University 72 | \item[] \href{mailto:matt.shotwell@vanderbilt.edu}{matt.shotwell@vanderbilt.edu} 73 | \end{DUlineblock} 74 | \end{quote} 75 | 76 | 1/9/2013 update (\textbf{u64} format extensions, Row Size fields, and RLE compression) by: 77 | 78 | \begin{quote} 79 | \begin{DUlineblock}{0em} 80 | \item[] Clint Cummins, PhD 81 | \item[] \href{mailto:clint@stanford.edu}{clint@stanford.edu} 82 | \end{DUlineblock} 83 | \end{quote} 84 | 85 | Copyright (C) 2013 is retained by the authors listed above. This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit \url{http://creativecommons.org/licenses/by-sa/3.0/}. 
86 | 87 | 88 | \section{Contents% 89 | \label{contents}% 90 | } 91 | 92 | \begin{itemize} 93 | \item \hyperref[introduction]{Introduction} 94 | 95 | \item \hyperref[sas7bdat-header]{SAS7BDAT Header} 96 | 97 | \item \hyperref[sas7bdat-pages]{SAS7BDAT Pages} 98 | 99 | \item \hyperref[sas7bdat-subheaders]{SAS7BDAT Subheaders} 100 | 101 | \item \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data} 102 | 103 | \item \hyperref[platform-differences]{Platform Differences} 104 | 105 | \item \hyperref[compression-data]{Compression Data} 106 | 107 | \item \hyperref[software-prototype]{Software Prototype} 108 | 109 | \item \hyperref[todo]{ToDo} 110 | \end{itemize} 111 | 112 | 113 | \section{Introduction% 114 | \label{introduction}% 115 | } 116 | 117 | The SAS7BDAT file is a binary database storage file. At the time of this writing, no description of the SAS7BDAT file format was publicly available. Hence, users who wish to read and manipulate these files were required to obtain a license for the SAS software, or third party software with support for SAS7BDAT files. The purpose of this document is to promote interoperability between SAS and other popular statistical software packages, especially R (\url{http://www.r-project.org/}). 118 | 119 | The information below was deduced by examining the contents of many SAS7BDAT databases downloaded freely from internet resources (see \texttt{data/sas7bdat.sources.RData}). No guarantee is made regarding its accuracy. No SAS software, nor any other software requiring the purchase of a license was used. 120 | 121 | SAS7BDAT files consist of binary encoded data. Data files encoded in this format often have the extension '.sas7bdat'. The name 'SAS7BDAT' is not official, but is used throughout this document to refer to SAS database files formatted according to the descriptions below. 122 | 123 | There are significant differences in the SAS7BDAT format depending on the operating systems and computer hardware platforms (32bit vs. 
64bit). See the section on \hyperref[platform-differences]{platform differences} for more details. The format described below is sufficient to read the entire collection of test files referenced in \texttt{data/sas7bdat.sources.RData} (i.e. files associated with 32bit and some 64bit builds of SAS for Microsoft Windows, and \textbf{u64} SAS versions). This includes files created with COMPRESS=CHAR. The format described here is probably not sufficient to \textbf{write} SAS7BDAT format files, due to lingering uncertainties. 124 | 125 | The figure below illustrates the overall structure of the SAS7BDAT database. Each file consists of a header (length := HL bytes), followed by PC pages, each of length PL bytes (PC and PL are shorthand for 'page count' and 'page size' respectively, and are used to denote these quantities throughout this document): 126 | 127 | \begin{quote} 128 | \begin{alltt} 129 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 130 | | HL | header 131 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 132 | | PL | page 1 133 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 134 | | PL | page 2 135 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 136 | ... 137 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 138 | | PL | page PC 139 | -{}-{}-{}-{}-{}-{}-{}-{}-{}- 140 | \end{alltt} 141 | \end{quote} 142 | 143 | Throughout this document, hexadecimal digits are denoted with a preceding 'x', binary digits with a preceding 'b', and decimal digits with no preceding character. For example, see the \hyperref[table-of-hexadecimal-decimal-and-binary-values]{table of hexadecimal, decimal, and binary values} below. 144 | 145 | 146 | \section{SAS7BDAT Header% 147 | \label{sas7bdat-header}% 148 | } 149 | 150 | The SAS7BDAT file header contains a binary file identifier (\emph{i.e.}, a magic number), the dataset name, timestamp, the number of pages (PC), their size (PL) and a variety of other values that pertain to the database as a whole.
The purposes of many header fields remain unknown, but are likely to include specifications for data compression and encryption, password protection, and dates/times of creation and/or modification. Most files encountered encode multi-byte values little-endian (least significant byte first). However, some files have big-endian values. Hence, it appears that multi-byte values are encoded using the endianness of the platform where the file was written. See \hyperref[platform-differences]{Platform Differences} for a table of key test files which differ in several ways. 151 | 152 | The \emph{offset table} below describes the SAS7BDAT file header as a sequence of bytes. Information stored in the table is indexed by its byte offset (first column) in the header and its length (second column) in bytes. For example, the field at offset 0 has length 32 bytes. Hence, bytes 0,1,...,31 comprise the data for this field. Byte lengths having the form '\%n' should read: 'the number of bytes remaining up to, but not including byte n'. The fourth column gives a shorthand description of the data contained at the corresponding offset. For example, 'int, page size := PL' indicates that the data stored at the corresponding location is a signed integer representing the page size, which we denote PL. The description \emph{????????????} indicates that the meaning of data stored at the corresponding offset is unknown. The third column represents the author's confidence (low, medium, high) in the corresponding offset, length, and description. Each offset table in this document is formatted in a similar fashion. Variables defined in an offset table are sometimes used in subsequent tables.
153 | 154 | 155 | \subsection{Header Offset Table% 156 | \label{header-offset-table}% 157 | } 158 | 159 | \setlength{\DUtablewidth}{\linewidth}% 160 | \begin{longtable*}{|p{0.114\DUtablewidth}|p{0.056\DUtablewidth}|p{0.056\DUtablewidth}|p{0.724\DUtablewidth}|} 161 | \hline 162 | \textbf{% 163 | offset 164 | } & \textbf{% 165 | length 166 | } & \textbf{% 167 | conf. 168 | } & \textbf{% 169 | description 170 | } \\ 171 | \hline 172 | \endfirsthead 173 | \hline 174 | \textbf{% 175 | offset 176 | } & \textbf{% 177 | length 178 | } & \textbf{% 179 | conf. 180 | } & \textbf{% 181 | description 182 | } \\ 183 | \hline 184 | \endhead 185 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 186 | \endfoot 187 | \endlastfoot 188 | 189 | 0 190 | & 191 | 32 192 | & 193 | high 194 | & 195 | binary, \hyperref[magic-number]{magic number} 196 | \\ 197 | \hline 198 | 199 | 32 200 | & 201 | 1 202 | & 203 | high 204 | & 205 | binary, \hyperref[alignment]{Alignment}: if (byte==x33) a2=4 else a2=0 . \textbf{u64} is true if a2=4 (unix 64 bit format). 206 | \\ 207 | \hline 208 | 209 | 33 210 | & 211 | 2 212 | & 213 | low 214 | & 215 | \emph{????????????} 216 | \\ 217 | \hline 218 | 219 | 35 220 | & 221 | 1 222 | & 223 | high 224 | & 225 | binary, \hyperref[alignment]{Alignment} if (byte==x33) a1=4 else a1=0 226 | \\ 227 | \hline 228 | 229 | 36 230 | & 231 | 1 232 | & 233 | low 234 | & 235 | \emph{????????????} 236 | \\ 237 | \hline 238 | 239 | 37 240 | & 241 | 1 242 | & 243 | high 244 | & 245 | int, endianness (x01-little {[}Intel{]} x00-big) 246 | \\ 247 | \hline 248 | 249 | 38 250 | & 251 | 1 252 | & 253 | low 254 | & 255 | \emph{????????????} 256 | \\ 257 | \hline 258 | 259 | 39 260 | & 261 | 1 262 | & 263 | medium 264 | & 265 | ascii, OS type (1-UNIX or 2-WIN). Does not affect format except for the OS strings. 
266 | \\ 267 | \hline 268 | 269 | 40 270 | & 271 | 8 272 | & 273 | low 274 | & 275 | \emph{????????????} 276 | \\ 277 | \hline 278 | 279 | 48 280 | & 281 | 8 282 | & 283 | low 284 | & 285 | \emph{????????????} 286 | \\ 287 | \hline 288 | 289 | 56 290 | & 291 | 8 292 | & 293 | low 294 | & 295 | repeat of 32:32+8 296 | \\ 297 | \hline 298 | 299 | 64 300 | & 301 | 6 302 | & 303 | low 304 | & 305 | \emph{????????????} 306 | \\ 307 | \hline 308 | 309 | 70 310 | & 311 | 2 312 | & 313 | low 314 | & 315 | int, \hyperref[character-encoding]{Character Encoding} 316 | \\ 317 | \hline 318 | 319 | 72 320 | & 321 | 12 322 | & 323 | low 324 | & 325 | \emph{????????????} 326 | \\ 327 | \hline 328 | 329 | 84 330 | & 331 | 8 332 | & 333 | high 334 | & 335 | ascii 'SAS FILE' 336 | \\ 337 | \hline 338 | 339 | 92 340 | & 341 | 64 342 | & 343 | high 344 | & 345 | ascii, dataset name 346 | \\ 347 | \hline 348 | 349 | 156 350 | & 351 | 8 352 | & 353 | medium 354 | & 355 | ascii, file type, e.g. \texttt{'DATA ~ ~'} 356 | \\ 357 | \hline 358 | 359 | 164 360 | & 361 | a1 362 | & 363 | medium 364 | & 365 | zero padding when a1=4 . Aligns the double timestamps below on double word boundaries. 
366 | \\ 367 | \hline 368 | 369 | 164+a1 370 | & 371 | 8 372 | & 373 | high 374 | & 375 | double, timestamp, date created, secs since 1/1/60 (for SAS version 8.x and higher) 376 | \\ 377 | \hline 378 | 379 | 172+a1 380 | & 381 | 8 382 | & 383 | high 384 | & 385 | double, timestamp, date modified, secs since 1/1/60 (for SAS version 8.x and higher) 386 | \\ 387 | \hline 388 | 389 | 180+a1 390 | & 391 | 16 392 | & 393 | low 394 | & 395 | \emph{????????????} 396 | \\ 397 | \hline 398 | 399 | 196+a1 400 | & 401 | 4 402 | & 403 | high 404 | & 405 | int, length of SAS7BDAT header := HL 406 | \\ 407 | \hline 408 | 409 | 200+a1 410 | & 411 | 4 412 | & 413 | high 414 | & 415 | int, page size := % 416 | \phantomsection\label{pl}PL 417 | \\ 418 | \hline 419 | 420 | 204+a1 421 | & 422 | 4+a2 423 | & 424 | high 425 | & 426 | int, page count := PC . Length 4 or 8 (\textbf{u64}), henceforth denoted \textbf{4|8} 427 | \\ 428 | \hline 429 | 430 | 208+a1+a2 431 | & 432 | 8 433 | & 434 | low 435 | & 436 | \emph{????????????} 437 | \\ 438 | \hline 439 | 440 | 216+a1+a2 441 | & 442 | 8 443 | & 444 | high 445 | & 446 | ascii, SAS release (e.g. 9.0101M3 ) 447 | \\ 448 | \hline 449 | 450 | 224+a1+a2 451 | & 452 | 16 453 | & 454 | high 455 | & 456 | ascii, host (SAS server type, longest observed string has 9 bytes) 457 | \\ 458 | \hline 459 | 460 | 240+a1+a2 461 | & 462 | 16 463 | & 464 | high 465 | & 466 | ascii, OS version number (for UNIX, else null) 467 | \\ 468 | \hline 469 | 470 | 256+a1+a2 471 | & 472 | 16 473 | & 474 | high 475 | & 476 | ascii, OS maker or version (SUN, IBM, sometimes WIN) 477 | \\ 478 | \hline 479 | 480 | 272+a1+a2 481 | & 482 | 16 483 | & 484 | high 485 | & 486 | ascii, OS name (for UNIX, else null) 487 | \\ 488 | \hline 489 | 490 | 288+a1+a2 491 | & 492 | 32 493 | & 494 | low 495 | & 496 | \emph{????????????} 497 | \\ 498 | \hline 499 | 500 | 320+a1+a2 501 | & 502 | 4 503 | & 504 | low 505 | & 506 | int, page sequence signature? 
(value is close to the value at start of each Page Offset Table) 507 | \\ 508 | \hline 509 | 510 | 324+a1+a2 511 | & 512 | 4 513 | & 514 | low 515 | & 516 | \emph{????????????} 517 | \\ 518 | \hline 519 | 520 | 328+a1+a2 521 | & 522 | 8 523 | & 524 | medium 525 | & 526 | double, 3rd timestamp, sometimes zero 527 | \\ 528 | \hline 529 | 530 | 336+a1+a2 531 | & 532 | \%HL 533 | & 534 | medium 535 | & 536 | zeros 537 | \\ 538 | \hline 539 | 540 | 1024|8192 541 | & & 542 | medium 543 | & 544 | Total length of header (8192 for \textbf{u64}), HL 545 | \\ 546 | \hline 547 | \end{longtable*} 548 | 549 | The 8 bytes beginning at offset 32 hold information which affects the offset of the 'release' and 'host' information. In particular: 550 | 551 | \begin{enumerate} 552 | \item The byte at offset 32 defines the \textbf{u64} (unix 64 bit) file format, which affects many field and header lengths (usually via 4 vs. 8 byte integers). 553 | 554 | \item The byte at offset 35 controls an offset before the timestamps. 555 | 556 | \item The byte at offset 37 defines byte ordering of ints and doubles (most test files were created on Windows and use Intel byte ordering; little endian). 557 | 558 | \item The byte at offset 39 appears to distinguish the OS type, where '1' indicates that the file was generated on a UNIX-like system, such as Linux or SunOS, and '2' indicates the file was generated on a Microsoft Windows platform. However, this does not affect any important fields in the file format. 559 | \end{enumerate} 560 | 561 | The following table describes some of the possible polymorphisms for the 8 bytes at offset 32. The first field lists the name of the file where the sequence was found (see \texttt{data/sas7bdat.sources.RData}), the second lists the eight byte values (hexadecimal), the third field shows bytes 216-239 in ASCII ('.' represents a non-ASCII character or '0'), and the fourth field lists the SAS7BDAT sub-format. 
562 | 563 | \setlength{\DUtablewidth}{\linewidth}% 564 | \begin{longtable*}{|p{0.229\DUtablewidth}|p{0.294\DUtablewidth}|p{0.237\DUtablewidth}|p{0.189\DUtablewidth}|} 565 | \hline 566 | \textbf{% 567 | filename 568 | } & \textbf{% 569 | bytes 32-39 570 | } & \textbf{% 571 | bytes 216-239 572 | } & \textbf{% 573 | format 574 | } \\ 575 | \hline 576 | \endfirsthead 577 | \hline 578 | \textbf{% 579 | filename 580 | } & \textbf{% 581 | bytes 32-39 582 | } & \textbf{% 583 | bytes 216-239 584 | } & \textbf{% 585 | format 586 | } \\ 587 | \hline 588 | \endhead 589 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 590 | \endfoot 591 | \endlastfoot 592 | 593 | \texttt{compress\_no.sas7bdat} 594 | & 595 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32} 596 | & 597 | \texttt{9.0101M3NET\_ASRV........} 598 | & 599 | Windows Intel 600 | \\ 601 | \hline 602 | 603 | \texttt{compress\_yes.sas7bdat} 604 | & 605 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32} 606 | & 607 | \texttt{9.0101M3NET\_ASRV........} 608 | & 609 | Windows Intel 610 | \\ 611 | \hline 612 | 613 | \texttt{lowbwt\_i386.sas7bdat} 614 | & 615 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32} 616 | & 617 | \texttt{9.0202M0W32\_VSPRO.......} 618 | & 619 | Windows Intel 620 | \\ 621 | \hline 622 | 623 | \texttt{missing\_values.sas7bdat} 624 | & 625 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32} 626 | & 627 | \texttt{9.0202M0W32\_VSPRO.......} 628 | & 629 | Windows Intel 630 | \\ 631 | \hline 632 | 633 | \texttt{obs\_all\_perf\_1.sas7bdat} 634 | & 635 | \texttt{x22 x22 x00 x32 x22 x01 x02 x32} 636 | & 637 | \texttt{9.0101M3XP\_PRO..........} 638 | & 639 | Windows Intel 640 | \\ 641 | \hline 642 | 643 | \texttt{adsl.sas7bdat} 644 | & 645 | \texttt{x22 x22 x00 x33 x33 x01 x02 x32} 646 | & 647 | \texttt{....9.0202M3X64\_ESRV....} 648 | & 649 | Windows x64 Intel 650 | \\ 651 | \hline 652 | 653 | \texttt{eyecarex.sas7bdat} 654 | & 655 | \texttt{x22 x22 x00 x33 x22 x00 x02 x31} 656 | & 657 | \texttt{....9.0000M0WIN.........} 658 | & 659 | 
Unix non-Intel 660 | \\ 661 | \hline 662 | 663 | \texttt{lowbwt\_x64.sas7bdat} 664 | & 665 | \texttt{x22 x22 x00 x33 x33 x01 x02 x32} 666 | & 667 | \texttt{....9.0202M2X64\_VSPRO...} 668 | & 669 | Windows x64 Intel 670 | \\ 671 | \hline 672 | 673 | \texttt{natlterr1994.sas7bdat} 674 | & 675 | \texttt{x33 x22 x00 x33 x33 x00 x02 x31} 676 | & 677 | \texttt{........9.0101M3SunOS...} 678 | & 679 | u64 Unix non-Intel 680 | \\ 681 | \hline 682 | 683 | \texttt{natlterr2006.sas7bdat} 684 | & 685 | \texttt{x33 x22 x00 x33 x33 x00 x02 x31} 686 | & 687 | \texttt{........9.0101M3SunOS...} 688 | & 689 | u64 Unix non-Intel 690 | \\ 691 | \hline 692 | 693 | \texttt{txzips.sas7bdat} 694 | & 695 | \texttt{x33 x22 x00 x33 x33 x01 x02 x31} 696 | & 697 | \texttt{........9.0201M0Linux...} 698 | & 699 | u64 Unix Intel 700 | \\ 701 | \hline 702 | \end{longtable*} 703 | 704 | \phantomsection\label{table-of-hexadecimal-decimal-and-binary-values} 705 | The binary representation for the hexadecimal values present in the table above are given below. 706 | 707 | \setlength{\DUtablewidth}{\linewidth}% 708 | \begin{longtable*}{|p{0.145\DUtablewidth}|p{0.098\DUtablewidth}|p{0.168\DUtablewidth}|} 709 | \hline 710 | \textbf{% 711 | hexadecimal 712 | } & \textbf{% 713 | decimal 714 | } & \textbf{% 715 | binary 716 | } \\ 717 | \hline 718 | \endfirsthead 719 | \hline 720 | \textbf{% 721 | hexadecimal 722 | } & \textbf{% 723 | decimal 724 | } & \textbf{% 725 | binary 726 | } \\ 727 | \hline 728 | \endhead 729 | \multicolumn{3}{c}{\hfill ... 
continued on next page} \\ 730 | \endfoot 731 | \endlastfoot 732 | 733 | \texttt{x01} 734 | & 735 | \texttt{001} 736 | & 737 | \texttt{b00000001} 738 | \\ 739 | \hline 740 | 741 | \texttt{x02} 742 | & 743 | \texttt{002} 744 | & 745 | \texttt{b00000010} 746 | \\ 747 | \hline 748 | 749 | \texttt{x22} 750 | & 751 | \texttt{034} 752 | & 753 | \texttt{b00100010} 754 | \\ 755 | \hline 756 | 757 | \texttt{x31} 758 | & 759 | \texttt{049} 760 | & 761 | \texttt{b00110001} 762 | \\ 763 | \hline 764 | 765 | \texttt{x32} 766 | & 767 | \texttt{050} 768 | & 769 | \texttt{b00110010} 770 | \\ 771 | \hline 772 | 773 | \texttt{x33} 774 | & 775 | \texttt{051} 776 | & 777 | \texttt{b00110011} 778 | \\ 779 | \hline 780 | \end{longtable*} 781 | 782 | 783 | \subsubsection{Alignment% 784 | \label{alignment}% 785 | } 786 | 787 | In files generated by 64 bit builds of SAS, 'alignment' means that all data field offsets containing doubles or 8 byte ints should be a multiple of 8 bytes. For files generated by 32 bit builds of SAS, the alignment is 4 bytes. Because \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data} may contain double precision values, it appears that all data rows are 64 bit aligned, regardless of whether the file was written with a 32 bit or 64 bit build of SAS. Alignment of data structures according to the platform word length (4 bytes for 32 bit, and 8 bytes for 64 bit architectures) facilitates efficient operations on data stored in memory. It also suggests that parts of the SAS7BDAT data file format are platform dependent. One theory is that the SAS implementation utilizes a common C or C++ structure or class to reference data stored in memory. When compiled, these structures are aligned according to the word length of the target platform. Of course, when SAS was originally written, platform differences may not have been foreseeable. Hence, these inconsistencies may not have been intentional.
788 | 789 | 790 | \subsubsection{Magic Number% 791 | \label{magic-number}% 792 | } 793 | 794 | The SAS7BDAT magic number is the following 32 byte (hex) sequence.: 795 | 796 | \begin{quote} 797 | \begin{alltt} 798 | x00 x00 x00 x00 x00 x00 x00 x00 799 | x00 x00 x00 x00 xc2 xea x81 x60 800 | xb3 x14 x11 xcf xbd x92 x08 x00 801 | x09 xc7 x31 x8c x18 x1f x10 x11 802 | \end{alltt} 803 | \end{quote} 804 | 805 | In all test files except one (not listed in \texttt{data/sas7bdat.sources.RData}), the magic number above holds. The one anomalous file has the following magic number: 806 | 807 | \begin{quote} 808 | \begin{alltt} 809 | x00 x00 x00 x00 x00 x00 x00 x00 810 | x00 x00 x00 x00 x00 x00 x00 x00 811 | x00 x00 x00 x00 x00 x00 x00 x00 812 | x00 x00 x00 x00 x18 x1f x10 x11 813 | \end{alltt} 814 | \end{quote} 815 | 816 | In addition, the anomalous file is associated with the SAS release \textquotedbl{}3.2TK\textquotedbl{}. Indeed, this file may not have been written by SAS. Otherwise, the anomalous file appears to be formatted similarly to other test files. 817 | 818 | 819 | \subsubsection{Character Encoding% 820 | \label{character-encoding}% 821 | } 822 | 823 | The integer (one or two bytes) at header offset 70 (bytes) indicates the character encoding of string data. The table below lists the values that are known to occur and the associated character encoding. 824 | 825 | \setlength{\DUtablewidth}{\linewidth}% 826 | \begin{longtable*}{|p{0.179\DUtablewidth}|p{0.179\DUtablewidth}|p{0.168\DUtablewidth}|} 827 | \hline 828 | \textbf{% 829 | bytes 70-72 830 | } & \textbf{% 831 | SAS name 832 | } & \textbf{% 833 | iconv name 834 | } \\ 835 | \hline 836 | \endfirsthead 837 | \hline 838 | \textbf{% 839 | bytes 70-72 840 | } & \textbf{% 841 | SAS name 842 | } & \textbf{% 843 | iconv name 844 | } \\ 845 | \hline 846 | \endhead 847 | \multicolumn{3}{c}{\hfill ... 
continued on next page} \\ 848 | \endfoot 849 | \endlastfoot 850 | 851 | 0 852 | & 853 | (Unspecified) 854 | & 855 | (Unspecified) 856 | \\ 857 | \hline 858 | 859 | 20 860 | & 861 | utf-8 862 | & 863 | UTF-8 864 | \\ 865 | \hline 866 | 867 | 28 868 | & 869 | us-ascii 870 | & 871 | US-ASCII 872 | \\ 873 | \hline 874 | 875 | 29 876 | & 877 | latin1 878 | & 879 | ISO-8859-1 880 | \\ 881 | \hline 882 | 883 | 30 884 | & 885 | latin2 886 | & 887 | ISO-8859-2 888 | \\ 889 | \hline 890 | 891 | 31 892 | & 893 | latin3 894 | & 895 | ISO-8859-3 896 | \\ 897 | \hline 898 | 899 | 34 900 | & 901 | arabic 902 | & 903 | ISO-8859-6 904 | \\ 905 | \hline 906 | 907 | 36 908 | & 909 | hebrew 910 | & 911 | ISO-8859-8 912 | \\ 913 | \hline 914 | 915 | 39 916 | & 917 | thai 918 | & 919 | ISO-8859-11 920 | \\ 921 | \hline 922 | 923 | 40 924 | & 925 | latin5 926 | & 927 | ISO-8859-9 928 | \\ 929 | \hline 930 | 931 | 60 932 | & 933 | wlatin2 934 | & 935 | WINDOWS-1250 936 | \\ 937 | \hline 938 | 939 | 61 940 | & 941 | wcyrillic 942 | & 943 | WINDOWS-1251 944 | \\ 945 | \hline 946 | 947 | 62 948 | & 949 | wlatin1 950 | & 951 | WINDOWS-1252 952 | \\ 953 | \hline 954 | 955 | 63 956 | & 957 | wgreek 958 | & 959 | WINDOWS-1253 960 | \\ 961 | \hline 962 | 963 | 64 964 | & 965 | wturkish 966 | & 967 | WINDOWS-1254 968 | \\ 969 | \hline 970 | 971 | 65 972 | & 973 | whebrew 974 | & 975 | WINDOWS-1255 976 | \\ 977 | \hline 978 | 979 | 66 980 | & 981 | warabic 982 | & 983 | WINDOWS-1256 984 | \\ 985 | \hline 986 | 987 | 119 988 | & 989 | euc-tw 990 | & 991 | EUC-TW 992 | \\ 993 | \hline 994 | 995 | 123 996 | & 997 | big5 998 | & 999 | BIG-5 1000 | \\ 1001 | \hline 1002 | 1003 | 125 1004 | & 1005 | euc-cn 1006 | & 1007 | EUC-CN 1008 | \\ 1009 | \hline 1010 | 1011 | 134 1012 | & 1013 | euc-jp 1014 | & 1015 | EUC-JP 1016 | \\ 1017 | \hline 1018 | 1019 | 138 1020 | & 1021 | shift-jis 1022 | & 1023 | SHIFT-JIS 1024 | \\ 1025 | \hline 1026 | 1027 | 140 1028 | & 1029 | euc-kr 1030 | & 1031 | EUC-KR 1032 | \\ 
1033 | \hline 1034 | \end{longtable*} 1035 | 1036 | When the encoding is unspecified, the file uses the encoding of the SAS session that produced it (usually Windows-1252). 1037 | 1038 | 1039 | \section{SAS7BDAT Pages% 1040 | \label{sas7bdat-pages}% 1041 | } 1042 | 1043 | Following the SAS7BDAT header are pages of data. Each page can be one of (at least) four types. The first three are those that contain meta-information (e.g. field/column attributes), packed binary data, or a combination of both. These types are denoted 'meta', 'data', and 'mix' respectively. Meta-information is required to correctly interpret the packed binary information. Hence, this information must be parsed first. In test files, 'meta' and 'mix' pages always precede 'data' pages. In some test data files, there is a fourth page type, denoted 'amd' which appears to encode additional meta information. This page usually occurs last, and appears to contain amended meta information. 1044 | 1045 | The \hyperref[page-offset-table]{page offset table} below describes each page type. Byte offsets appended with one of '(meta/mix)', '(mix)', or '(data)' indicate that the corresponding length and description apply only to pages of the listed type. Provisionally, the internal structure of the 'amd' page type is considered identical to the 'meta' page type. 1046 | 1047 | 1048 | \subsection{Page Offset Table% 1049 | \label{page-offset-table}% 1050 | } 1051 | 1052 | \setlength{\DUtablewidth}{\linewidth}% 1053 | \begin{longtable*}{|p{0.146\DUtablewidth}|p{0.146\DUtablewidth}|p{0.071\DUtablewidth}|p{0.587\DUtablewidth}|} 1054 | \hline 1055 | \textbf{% 1056 | offset 1057 | } & \textbf{% 1058 | length 1059 | } & \textbf{% 1060 | conf. 1061 | } & \textbf{% 1062 | description 1063 | } \\ 1064 | \hline 1065 | \endfirsthead 1066 | \hline 1067 | \textbf{% 1068 | offset 1069 | } & \textbf{% 1070 | length 1071 | } & \textbf{% 1072 | conf. 
1073 | } & \textbf{% 1074 | description 1075 | } \\ 1076 | \hline 1077 | \endhead 1078 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 1079 | \endfoot 1080 | \endlastfoot 1081 | 1082 | 0 1083 | & 1084 | 4 1085 | & 1086 | low 1087 | & 1088 | int, page sequence signature? 1089 | \\ 1090 | \hline 1091 | 1092 | 4 1093 | & 1094 | 12|28 1095 | & 1096 | low 1097 | & 1098 | \emph{????????????} length 12 or 28 (\textbf{u64}) 1099 | \\ 1100 | \hline 1101 | 1102 | B 1103 | & 1104 | 2 1105 | & 1106 | medium 1107 | & 1108 | int, bit field \hyperref[page-type]{page type} := \_PGTYPE; B = 16|32 1109 | \\ 1110 | \hline 1111 | 1112 | B+2 1113 | & 1114 | 2 1115 | & 1116 | medium 1117 | & 1118 | int, data block count := % 1119 | \phantomsection\label{bc}BC 1120 | \\ 1121 | \hline 1122 | 1123 | B+4 1124 | & 1125 | 2 1126 | & 1127 | medium 1128 | & 1129 | int, \hyperref[subheader-pointers]{subheader pointers} count := % 1130 | \phantomsection\label{sc}SC <= \hyperref[bc]{BC} 1131 | \\ 1132 | \hline 1133 | 1134 | B+6 1135 | & 1136 | 2 1137 | & 1138 | low 1139 | & 1140 | \emph{????????????} 1141 | \\ 1142 | \hline 1143 | 1144 | B+8 1145 | & 1146 | SC*SL 1147 | & 1148 | medium 1149 | & 1150 | SC \hyperref[subheader-pointers]{subheader pointers}, SL = 12|24 1151 | \\ 1152 | \hline 1153 | 1154 | B+8+SC*SL 1155 | & 1156 | DL 1157 | & 1158 | medium 1159 | & 1160 | if NRD>0, 8 byte alignment; DL = (8 - (B+8+SC*SL) \% 8) \% 8 1161 | \\ 1162 | \hline 1163 | 1164 | B+8+SC*SL+DL 1165 | & 1166 | RC*RL 1167 | & 1168 | medium 1169 | & 1170 | \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT packed binary data} data row count := RC = (BC-SC) 1171 | \\ 1172 | \hline 1173 | 1174 | C 1175 | & 1176 | \%\hyperref[pl]{PL} 1177 | & 1178 | medium 1179 | & 1180 | subheader data and/or filler; C = (B+8+SC*SL+DL+RC*RL) 1181 | \\ 1182 | \hline 1183 | \end{longtable*} 1184 | 1185 | 1186 | \subsubsection{Page Type% 1187 | \label{page-type}% 1188 | } 1189 | 1190 | \setlength{\DUtablewidth}{\linewidth}% 1191 |
\begin{longtable*}{|p{0.070\DUtablewidth}|p{0.051\DUtablewidth}|p{0.107\DUtablewidth}|p{0.386\DUtablewidth}|p{0.340\DUtablewidth}|} 1192 | \hline 1193 | \textbf{% 1194 | PGTYPE 1195 | } & \textbf{% 1196 | name 1197 | } & \textbf{% 1198 | subheaders 1199 | } & \textbf{% 1200 | uncompressed row data (after subheaders) 1201 | } & \textbf{% 1202 | compressed row data (in subheaders) 1203 | } \\ 1204 | \hline 1205 | \endfirsthead 1206 | \hline 1207 | \textbf{% 1208 | PGTYPE 1209 | } & \textbf{% 1210 | name 1211 | } & \textbf{% 1212 | subheaders 1213 | } & \textbf{% 1214 | uncompressed row data (after subheaders) 1215 | } & \textbf{% 1216 | compressed row data (in subheaders) 1217 | } \\ 1218 | \hline 1219 | \endhead 1220 | \multicolumn{5}{c}{\hfill ... continued on next page} \\ 1221 | \endfoot 1222 | \endlastfoot 1223 | 1224 | 0 1225 | & 1226 | meta 1227 | & 1228 | yes (SC>0) 1229 | & 1230 | no (BC=SC) 1231 | & 1232 | yes 1233 | \\ 1234 | \hline 1235 | 1236 | 256 1237 | & 1238 | data 1239 | & 1240 | no (SC=0) 1241 | & 1242 | yes (RC=BC) 1243 | & 1244 | no 1245 | \\ 1246 | \hline 1247 | 1248 | 512 1249 | & 1250 | mix 1251 | & 1252 | yes (SC>0) 1253 | & 1254 | yes (RC=BC-SC) 1255 | & 1256 | no 1257 | \\ 1258 | \hline 1259 | 1260 | 1024 1261 | & 1262 | amd 1263 | & 1264 | yes? 1265 | & 1266 | yes? 1267 | & 1268 | no? 1269 | \\ 1270 | \hline 1271 | 1272 | 16384 1273 | & 1274 | meta 1275 | & 1276 | yes (SC>0) 1277 | & 1278 | no (BC=SC) 1279 | & 1280 | yes 1281 | \\ 1282 | \hline 1283 | 1284 | -28672 1285 | & 1286 | comp 1287 | & 1288 | no 1289 | & 1290 | no 1291 | & 1292 | no 1293 | \\ 1294 | \hline 1295 | \end{longtable*} 1296 | 1297 | There are at least four page types 'meta', 'data', 'mix', and 'amd'. These types are encoded in the most significant byte of a two byte bit field at page offset 16|32. If no bit is set, the following page is of type 'meta'. If the first, second, or third bits are set, then the page is of type 'data', 'mix', or 'amd', respectively. 
Hence, if the two bytes are interpreted as an unsigned integer, then the 'meta', 'data', 'mix', and 'amd' types correspond to 0, 256, 512, and 1024, respectively. In compressed files, other bits (and sometimes multiple bits) have been set (e.g., \texttt{1 <{}< 15 | 1 <{}< 12}, which is \texttt{-28672} signed, or \texttt{36864} unsigned). However, the pattern is unclear. 1298 | 1299 | If a page is of type 'meta', 'mix', or 'amd', data beginning at offset byte 24|40 are a sequence of SC SL-byte \hyperref[subheader-pointers]{subheader pointers}, which point to an offset farther down the page. \hyperref[sas7bdat-subheaders]{SAS7BDAT Subheaders} stored at these offsets hold meta information about the database, including the column names, labels, and types. 1300 | If a page is of type 'mix', then \textbf{packed binary data begin at the next 8 byte boundary following the last subheader pointer}. In this case, the data begin at offset B+8+SC*SL+DL, where DL = (8 - (B+8+SC*SL) \% 8) \% 8 is the number of padding bytes, and '\%' is the modulo operator. 1301 | 1302 | If a page is of type 'data', then packed binary data begin at offset 24|40. 1303 | 1304 | The 'comp' page was observed as page 2 of the compress\_yes.sas7bdat test file (not distributed with the \texttt{sas7bdat} package). It has BC and SC fields, but no subheader pointers. It contains some initial data and 2 tables. The first table has many rows of length 24; its purpose is unknown. The second table has one entry per data page with the page number and the number of data rows on the page for SC pages. It could be used to access a particular row without reading all preceding data pages. 1305 | 1306 | 1307 | \subsubsection{Subheader Pointers% 1308 | \label{subheader-pointers}% 1309 | } 1310 | 1311 | The subheader pointers encode information about the offset and length of subheaders relative to the beginning of the page where the subheader pointer is located.
The purpose of the last four bytes (eight for \textbf{u64}) of the subheader pointer is uncertain, but they may indicate that additional subheader pointers are to be found on the next page, or that the corresponding subheader is not crucial. 1312 | 1313 | \setlength{\DUtablewidth}{\linewidth}% 1314 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.633\DUtablewidth}|} 1315 | \hline 1316 | \textbf{% 1317 | offset 1318 | } & \textbf{% 1319 | length 1320 | } & \textbf{% 1321 | conf. 1322 | } & \textbf{% 1323 | description 1324 | } \\ 1325 | \hline 1326 | \endfirsthead 1327 | \hline 1328 | \textbf{% 1329 | offset 1330 | } & \textbf{% 1331 | length 1332 | } & \textbf{% 1333 | conf. 1334 | } & \textbf{% 1335 | description 1336 | } \\ 1337 | \hline 1338 | \endhead 1339 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 1340 | \endfoot 1341 | \endlastfoot 1342 | 1343 | 0 1344 | & 1345 | 4|8 1346 | & 1347 | high 1348 | & 1349 | int, offset from page start to subheader 1350 | \\ 1351 | \hline 1352 | 1353 | 4|8 1354 | & 1355 | 4|8 1356 | & 1357 | high 1358 | & 1359 | int, length of subheader := % 1360 | \phantomsection\label{ql}QL 1361 | \\ 1362 | \hline 1363 | 1364 | 8|16 1365 | & 1366 | 1 1367 | & 1368 | medium 1369 | & 1370 | int, compression := % 1371 | \phantomsection\label{comp}COMP 1372 | \\ 1373 | \hline 1374 | 1375 | 9|17 1376 | & 1377 | 1 1378 | & 1379 | low 1380 | & 1381 | int, subheader type := ST 1382 | \\ 1383 | \hline 1384 | 1385 | 10|18 1386 | & 1387 | 2|6 1388 | & 1389 | low 1390 | & 1391 | zeroes 1392 | \\ 1393 | \hline 1394 | 1395 | 12|24 1396 | & & 1397 | high 1398 | & 1399 | Total length of subheader pointer 12|24 (\textbf{u64}), SL 1400 | \\ 1401 | \hline 1402 | \end{longtable*} 1403 | 1404 | QL is sometimes zero, which indicates that no data is referenced by the corresponding subheader pointer. When this occurs, the subheader pointer may be ignored.
1405 | 1406 | \setlength{\DUtablewidth}{\linewidth}% 1407 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.493\DUtablewidth}|} 1408 | \hline 1409 | \textbf{% 1410 | \hyperref[comp]{COMP} 1411 | } & \textbf{% 1412 | description 1413 | } \\ 1414 | \hline 1415 | \endfirsthead 1416 | \hline 1417 | \textbf{% 1418 | \hyperref[comp]{COMP} 1419 | } & \textbf{% 1420 | description 1421 | } \\ 1422 | \hline 1423 | \endhead 1424 | \multicolumn{2}{c}{\hfill ... continued on next page} \\ 1425 | \endfoot 1426 | \endlastfoot 1427 | 1428 | 0 1429 | & 1430 | uncompressed 1431 | \\ 1432 | \hline 1433 | 1434 | 1 1435 | & 1436 | truncated (ignore data) 1437 | \\ 1438 | \hline 1439 | 1440 | 4 1441 | & 1442 | RLE compressed row data with control byte 1443 | \\ 1444 | \hline 1445 | \end{longtable*} 1446 | 1447 | \setlength{\DUtablewidth}{\linewidth}% 1448 | \begin{longtable*}{|p{0.056\DUtablewidth}|p{0.884\DUtablewidth}|} 1449 | \hline 1450 | \textbf{% 1451 | ST 1452 | } & \textbf{% 1453 | subheaders 1454 | } \\ 1455 | \hline 1456 | \endfirsthead 1457 | \hline 1458 | \textbf{% 1459 | ST 1460 | } & \textbf{% 1461 | subheaders 1462 | } \\ 1463 | \hline 1464 | \endhead 1465 | \multicolumn{2}{c}{\hfill ... continued on next page} \\ 1466 | \endfoot 1467 | \endlastfoot 1468 | 1469 | 0 1470 | & 1471 | Row Size, Column Size, Subheader Counts, Column Format and Label, in Uncompressed file 1472 | \\ 1473 | \hline 1474 | 1475 | 1 1476 | & 1477 | Column Text, Column Names, Column Attributes, Column List 1478 | \\ 1479 | \hline 1480 | 1481 | 1 1482 | & 1483 | all subheaders (including row data), in Compressed file. 1484 | \\ 1485 | \hline 1486 | \end{longtable*} 1487 | 1488 | 1489 | \section{SAS7BDAT Subheaders% 1490 | \label{sas7bdat-subheaders}% 1491 | } 1492 | 1493 | Subheaders contain meta information regarding the SAS7BDAT database, including row and column counts, column names, labels, and types. 
Each subheader is associated with a four-byte (eight-byte for \textbf{u64}) 'signature' that identifies the subheader type, and hence, how it should be parsed. 1494 | 1495 | 1496 | \subsection{Row Size Subheader% 1497 | \label{row-size-subheader}% 1498 | } 1499 | 1500 | The row size subheader holds the row length (in bytes), the total row count, and the maximum row count on a page of type 'mix'. Fields at offset 28|56 and higher are not needed to read the file, but are documented here for completeness. The four test files used for example data in the higher fields are \texttt{eyecarex.sas7bdat}, \texttt{acadindx.sas7bdat}, \texttt{natlterr1994.sas7bdat}, \texttt{txzips.sas7bdat} (non-Intel/Intel x regular/u64). 1501 | 1502 | \setlength{\DUtablewidth}{\linewidth}% 1503 | \begin{longtable*}{|p{0.076\DUtablewidth}|p{0.076\DUtablewidth}|p{0.055\DUtablewidth}|p{0.743\DUtablewidth}|} 1504 | \hline 1505 | \textbf{% 1506 | offset 1507 | } & \textbf{% 1508 | length 1509 | } & \textbf{% 1510 | conf. 1511 | } & \textbf{% 1512 | description 1513 | } \\ 1514 | \hline 1515 | \endfirsthead 1516 | \hline 1517 | \textbf{% 1518 | offset 1519 | } & \textbf{% 1520 | length 1521 | } & \textbf{% 1522 | conf. 1523 | } & \textbf{% 1524 | description 1525 | } \\ 1526 | \hline 1527 | \endhead 1528 | \multicolumn{4}{c}{\hfill ... 
continued on next page} \\ 1529 | \endfoot 1530 | \endlastfoot 1531 | 1532 | 0 1533 | & 1534 | 4|8 1535 | & 1536 | high 1537 | & 1538 | binary, signature xF7F7F7F7|xF7F7F7F700000000 1539 | \\ 1540 | \hline 1541 | 1542 | 4|8 1543 | & 1544 | 16|32 1545 | & 1546 | low 1547 | & 1548 | \emph{????????????} 1549 | \\ 1550 | \hline 1551 | 1552 | 20|40 1553 | & 1554 | 4|8 1555 | & 1556 | high 1557 | & 1558 | int, row length (in bytes) := % 1559 | \phantomsection\label{rl}RL 1560 | \\ 1561 | \hline 1562 | 1563 | 24|48 1564 | & 1565 | 4|8 1566 | & 1567 | high 1568 | & 1569 | int, total row count := TRC 1570 | \\ 1571 | \hline 1572 | 1573 | 28|56 1574 | & 1575 | 8|16 1576 | & 1577 | low 1578 | & 1579 | \emph{????????????} 1580 | \\ 1581 | \hline 1582 | 1583 | 36|72 1584 | & 1585 | 4|8 1586 | & 1587 | medium 1588 | & 1589 | int, number of \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader} on first page where they appear := % 1590 | \phantomsection\label{ncfl1}NCFL1 1591 | \\ 1592 | \hline 1593 | 1594 | 40|80 1595 | & 1596 | 4|8 1597 | & 1598 | medium 1599 | & 1600 | int, number of \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader} on second page where they appear (or 0) := % 1601 | \phantomsection\label{ncfl2}NCFL2 1602 | \\ 1603 | \hline 1604 | 1605 | 44|88 1606 | & 1607 | 8|16 1608 | & 1609 | low 1610 | & 1611 | \emph{????????????} 1612 | \\ 1613 | \hline 1614 | 1615 | 52|104 1616 | & 1617 | 4|8 1618 | & 1619 | medium 1620 | & 1621 | int, page size, equals PL 1622 | \\ 1623 | \hline 1624 | 1625 | 56|112 1626 | & 1627 | 4|8 1628 | & 1629 | low 1630 | & 1631 | \emph{????????????} 1632 | \\ 1633 | \hline 1634 | 1635 | 60|120 1636 | & 1637 | 4|8 1638 | & 1639 | medium 1640 | & 1641 | int, max row count on \textquotedbl{}mix\textquotedbl{} page := % 1642 | \phantomsection\label{mrc}MRC 1643 | \\ 1644 | \hline 1645 | 1646 | 64|128 1647 | & 1648 | 8|16 1649 | & 1650 | medium 1651 | & 1652 | sequence of 8|16 FF, end of initial 
header 1653 | \\ 1654 | \hline 1655 | 1656 | 72|144 1657 | & 1658 | 148|296 1659 | & 1660 | medium 1661 | & 1662 | zeroes 1663 | \\ 1664 | \hline 1665 | 1666 | 220|440 1667 | & 1668 | 4 1669 | & 1670 | low 1671 | & 1672 | int, page sequence signature (equals current page sequence signature) 1673 | \\ 1674 | \hline 1675 | 1676 | 224|444 1677 | & 1678 | 40|68 1679 | & 1680 | low 1681 | & 1682 | zeroes 1683 | \\ 1684 | \hline 1685 | 1686 | 264|512 1687 | & 1688 | 4|8 1689 | & 1690 | low 1691 | & 1692 | int, value 1 observed in 4 test files 1693 | \\ 1694 | \hline 1695 | 1696 | 268|520 1697 | & 1698 | 2 1699 | & 1700 | low 1701 | & 1702 | int, value 2 observed 1703 | \\ 1704 | \hline 1705 | 1706 | 270|522 1707 | & 1708 | 2|6 1709 | & 1710 | low 1711 | & 1712 | zeroes (pads length of 3 fields to 8|16) 1713 | \\ 1714 | \hline 1715 | 1716 | 272|528 1717 | & 1718 | 4|8 1719 | & 1720 | medium 1721 | & 1722 | int, number of pages with subheader data := NPSHD 1723 | \\ 1724 | \hline 1725 | 1726 | 276|536 1727 | & 1728 | 2 1729 | & 1730 | medium 1731 | & 1732 | int, number of subheaders with positive length on last page with subheader data := NSHPL 1733 | \\ 1734 | \hline 1735 | 1736 | 278|538 1737 | & 1738 | 2|6 1739 | & 1740 | low 1741 | & 1742 | zeroes 1743 | \\ 1744 | \hline 1745 | 1746 | 280|544 1747 | & 1748 | 4|8 1749 | & 1750 | low 1751 | & 1752 | int, values equal to NPSHD observed 1753 | \\ 1754 | \hline 1755 | 1756 | 284|552 1757 | & 1758 | 2 1759 | & 1760 | low 1761 | & 1762 | int, values equal to NSHPL+2 observed 1763 | \\ 1764 | \hline 1765 | 1766 | 286|554 1767 | & 1768 | 2|6 1769 | & 1770 | low 1771 | & 1772 | zeroes 1773 | \\ 1774 | \hline 1775 | 1776 | 288|560 1777 | & 1778 | 4|8 1779 | & 1780 | medium 1781 | & 1782 | int, number of pages in file, equals PC 1783 | \\ 1784 | \hline 1785 | 1786 | 292|568 1787 | & 1788 | 2 1789 | & 1790 | low 1791 | & 1792 | int, values 22,26,9,56 observed 1793 | \\ 1794 | \hline 1795 | 1796 | 294|570 1797 | & 1798 | 2|6 1799 | 
& 1800 | low 1801 | & 1802 | zeroes 1803 | \\ 1804 | \hline 1805 | 1806 | 296|576 1807 | & 1808 | 4|8 1809 | & 1810 | low 1811 | & 1812 | int, value 1 observed 1813 | \\ 1814 | \hline 1815 | 1816 | 300|584 1817 | & 1818 | 2 1819 | & 1820 | low 1821 | & 1822 | int, values 7|8 observed 1823 | \\ 1824 | \hline 1825 | 1826 | 302|586 1827 | & 1828 | 2|6 1829 | & 1830 | low 1831 | & 1832 | zeroes 1833 | \\ 1834 | \hline 1835 | 1836 | 304|592 1837 | & 1838 | 40|80 1839 | & 1840 | low 1841 | & 1842 | zeroes 1843 | \\ 1844 | \hline 1845 | 1846 | 344|672 1847 | & 1848 | 2 1849 | & 1850 | low 1851 | & 1852 | int, value 0 1853 | \\ 1854 | \hline 1855 | 1856 | 346|674 1857 | & 1858 | 2 1859 | & 1860 | low 1861 | & 1862 | int, values 0|8 1863 | \\ 1864 | \hline 1865 | 1866 | 348|676 1867 | & 1868 | 2 1869 | & 1870 | low 1871 | & 1872 | int, value 4 1873 | \\ 1874 | \hline 1875 | 1876 | 350|678 1877 | & 1878 | 2 1879 | & 1880 | low 1881 | & 1882 | int, value 0 1883 | \\ 1884 | \hline 1885 | 1886 | 352|680 1887 | & 1888 | 2 1889 | & 1890 | low 1891 | & 1892 | int, values 12,32|0 1893 | \\ 1894 | \hline 1895 | 1896 | 354|682 1897 | & 1898 | 2 1899 | & 1900 | low 1901 | & 1902 | int, length of Creator Software string := LCS 1903 | \\ 1904 | \hline 1905 | 1906 | 356|684 1907 | & 1908 | 2 1909 | & 1910 | low 1911 | & 1912 | int, value 0 1913 | \\ 1914 | \hline 1915 | 1916 | 358|686 1917 | & 1918 | 2 1919 | & 1920 | low 1921 | & 1922 | int, value 20 1923 | \\ 1924 | \hline 1925 | 1926 | 360|688 1927 | & 1928 | 2 1929 | & 1930 | low 1931 | & 1932 | int, value of 8 indicates MXNAM and MXLAB valid := IMAXN 1933 | \\ 1934 | \hline 1935 | 1936 | 362|690 1937 | & 1938 | 8 1939 | & 1940 | low 1941 | & 1942 | zeroes 1943 | \\ 1944 | \hline 1945 | 1946 | 370|698 1947 | & 1948 | 2 1949 | & 1950 | low 1951 | & 1952 | int, value 12 1953 | \\ 1954 | \hline 1955 | 1956 | 372|700 1957 | & 1958 | 2 1959 | & 1960 | low 1961 | & 1962 | int, value 8 1963 | \\ 1964 | \hline 1965 | 1966 | 374|702 1967 | & 
1968 | 2 1969 | & 1970 | low 1971 | & 1972 | int, value 0 1973 | \\ 1974 | \hline 1975 | 1976 | 376|704 1977 | & 1978 | 2 1979 | & 1980 | low 1981 | & 1982 | int, value 28 1983 | \\ 1984 | \hline 1985 | 1986 | 378|706 1987 | & 1988 | 2 1989 | & 1990 | low 1991 | & 1992 | int, length of Creator PROC step name := LCP 1993 | \\ 1994 | \hline 1995 | 1996 | 380|708 1997 | & 1998 | 36 1999 | & 2000 | low 2001 | & 2002 | zeroes 2003 | \\ 2004 | \hline 2005 | 2006 | 416|744 2007 | & 2008 | 2 2009 | & 2010 | low 2011 | & 2012 | int, value 4 2013 | \\ 2014 | \hline 2015 | 2016 | 418|746 2017 | & 2018 | 2 2019 | & 2020 | low 2021 | & 2022 | int, value 1 2023 | \\ 2024 | \hline 2025 | 2026 | 420|748 2027 | & 2028 | 2 2029 | & 2030 | low 2031 | & 2032 | int, number of Column Text subheaders in file := % 2033 | \phantomsection\label{nct}NCT 2034 | \\ 2035 | \hline 2036 | 2037 | 422|750 2038 | & 2039 | 2 2040 | & 2041 | low 2042 | & 2043 | int, max length of column names := MXNAM (see IMAXN) 2044 | \\ 2045 | \hline 2046 | 2047 | 424|752 2048 | & 2049 | 2 2050 | & 2051 | low 2052 | & 2053 | int, max length of column labels := MXLAB (see IMAXN) 2054 | \\ 2055 | \hline 2056 | 2057 | 426|754 2058 | & 2059 | 12 2060 | & 2061 | low 2062 | & 2063 | zeroes 2064 | \\ 2065 | \hline 2066 | 2067 | 438|766 2068 | & 2069 | 2 2070 | & 2071 | medium 2072 | & 2073 | int, number of data rows on a full page INT{[}(PL - 24|40)/\hyperref[rl]{RL}{]}; 0 for compressed file 2074 | \\ 2075 | \hline 2076 | 2077 | 440|768 2078 | & 2079 | 27 2080 | & 2081 | low 2082 | & 2083 | zeroes 2084 | \\ 2085 | \hline 2086 | 2087 | 467|795 2088 | & 2089 | 1 2090 | & 2091 | low 2092 | & 2093 | int, bit field, values 1,5 2094 | \\ 2095 | \hline 2096 | 2097 | 468|796 2098 | & 2099 | 12 2100 | & 2101 | low 2102 | & 2103 | zeroes 2104 | \\ 2105 | \hline 2106 | 2107 | 480|808 2108 | & & 2109 | medium 2110 | & 2111 | Total length of subheader, QL 2112 | \\ 2113 | \hline 2114 | \end{longtable*} 2115 | 2116 | 2117 | 
\subsection{Column Size Subheader% 2118 | \label{column-size-subheader}% 2119 | } 2120 | 2121 | The \hyperref[column-size-subheader]{column size subheader} holds the number of columns (variables). 2122 | 2123 | \setlength{\DUtablewidth}{\linewidth}% 2124 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.540\DUtablewidth}|} 2125 | \hline 2126 | \textbf{% 2127 | offset 2128 | } & \textbf{% 2129 | length 2130 | } & \textbf{% 2131 | conf. 2132 | } & \textbf{% 2133 | description 2134 | } \\ 2135 | \hline 2136 | \endfirsthead 2137 | \hline 2138 | \textbf{% 2139 | offset 2140 | } & \textbf{% 2141 | length 2142 | } & \textbf{% 2143 | conf. 2144 | } & \textbf{% 2145 | description 2146 | } \\ 2147 | \hline 2148 | \endhead 2149 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2150 | \endfoot 2151 | \endlastfoot 2152 | 2153 | 0 2154 | & 2155 | 4|8 2156 | & 2157 | high 2158 | & 2159 | binary, signature xF6F6F6F6|xF6F6F6F600000000 2160 | \\ 2161 | \hline 2162 | 2163 | 4|8 2164 | & 2165 | 4|8 2166 | & 2167 | high 2168 | & 2169 | int, number of columns := NCOL 2170 | \\ 2171 | \hline 2172 | 2173 | 8|16 2174 | & 2175 | 4|8 2176 | & 2177 | low 2178 | & 2179 | \emph{????????????} usually zeroes 2180 | \\ 2181 | \hline 2182 | 2183 | 12|24 2184 | & & 2185 | medium 2186 | & 2187 | Total length of subheader, QL 2188 | \\ 2189 | \hline 2190 | \end{longtable*} 2191 | 2192 | 2193 | \subsection{Subheader Counts Subheader% 2194 | \label{subheader-counts-subheader}% 2195 | } 2196 | 2197 | This subheader contains information on the first and last appearances of at least 7 common subheader types. Any of these subheaders may appear once or more. Multiple instances of a subheader provide information for an exclusive subset of columns. The order in which data is read from multiple subheaders corresponds to the reading order (left to right) of columns. The structure of this subheader was deduced and reported by Clint Cummins. 
2198 | 2199 | \setlength{\DUtablewidth}{\linewidth}% 2200 | \begin{longtable*}{|p{0.108\DUtablewidth}|p{0.088\DUtablewidth}|p{0.077\DUtablewidth}|p{0.677\DUtablewidth}|} 2201 | \hline 2202 | \textbf{% 2203 | offset 2204 | } & \textbf{% 2205 | length 2206 | } & \textbf{% 2207 | conf. 2208 | } & \textbf{% 2209 | description 2210 | } \\ 2211 | \hline 2212 | \endfirsthead 2213 | \hline 2214 | \textbf{% 2215 | offset 2216 | } & \textbf{% 2217 | length 2218 | } & \textbf{% 2219 | conf. 2220 | } & \textbf{% 2221 | description 2222 | } \\ 2223 | \hline 2224 | \endhead 2225 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2226 | \endfoot 2227 | \endlastfoot 2228 | 2229 | 0 2230 | & 2231 | 4|8 2232 | & 2233 | high 2234 | & 2235 | int, signature -1024 (x00FCFFFF|x00FCFFFFFFFFFFFF) 2236 | \\ 2237 | \hline 2238 | 2239 | 4|8 2240 | & 2241 | 4|8 2242 | & 2243 | low 2244 | & 2245 | int, length or offset, usually >= 48 2246 | \\ 2247 | \hline 2248 | 2249 | 8|16 2250 | & 2251 | 4|8 2252 | & 2253 | low 2254 | & 2255 | int, usually 4 2256 | \\ 2257 | \hline 2258 | 2259 | 12|24 2260 | & 2261 | 2 2262 | & 2263 | low 2264 | & 2265 | int, usually 7 (number of nonzero SCVs?) 2266 | \\ 2267 | \hline 2268 | 2269 | 14|26 2270 | & 2271 | 50|94 2272 | & 2273 | low 2274 | & 2275 | \emph{????????????} 2276 | \\ 2277 | \hline 2278 | 2279 | 64|120 2280 | & 2281 | 12*LSCV 2282 | & 2283 | medium 2284 | & 2285 | 12 \hyperref[subheader-count-vectors]{subheader count vectors}, length := LSCV = 20|40 bytes each 2286 | \\ 2287 | \hline 2288 | 2289 | 304|600 2290 | & & 2291 | medium 2292 | & 2293 | Total length of subheader, QL 2294 | \\ 2295 | \hline 2296 | \end{longtable*} 2297 | 2298 | 2299 | \subsubsection{Subheader Count Vectors% 2300 | \label{subheader-count-vectors}% 2301 | } 2302 | 2303 | The subheader count vectors encode information for each of 4 common subheader types, and potentially 12 total subheader types. 
2304 | 2305 | \setlength{\DUtablewidth}{\linewidth}% 2306 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.633\DUtablewidth}|} 2307 | \hline 2308 | \textbf{% 2309 | offset 2310 | } & \textbf{% 2311 | length 2312 | } & \textbf{% 2313 | conf. 2314 | } & \textbf{% 2315 | description 2316 | } \\ 2317 | \hline 2318 | \endfirsthead 2319 | \hline 2320 | \textbf{% 2321 | offset 2322 | } & \textbf{% 2323 | length 2324 | } & \textbf{% 2325 | conf. 2326 | } & \textbf{% 2327 | description 2328 | } \\ 2329 | \hline 2330 | \endhead 2331 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2332 | \endfoot 2333 | \endlastfoot 2334 | 2335 | 0 2336 | & 2337 | 4|8 2338 | & 2339 | high 2340 | & 2341 | int, signature (see list below) 2342 | \\ 2343 | \hline 2344 | 2345 | 4|8 2346 | & 2347 | 4|8 2348 | & 2349 | medium 2350 | & 2351 | int, page where this subheader first appears := PAGE1 2352 | \\ 2353 | \hline 2354 | 2355 | 8|16 2356 | & 2357 | 2 2358 | & 2359 | medium 2360 | & 2361 | int, position of subheader pointer in PAGE1 := LOC1 2362 | \\ 2363 | \hline 2364 | 2365 | 10|18 2366 | & 2367 | 2|6 2368 | & 2369 | low 2370 | & 2371 | \emph{????????????} zero padding 2372 | \\ 2373 | \hline 2374 | 2375 | 12|24 2376 | & 2377 | 4|8 2378 | & 2379 | medium 2380 | & 2381 | int, page where this subheader last appears := PAGEL 2382 | \\ 2383 | \hline 2384 | 2385 | 16|32 2386 | & 2387 | 2 2388 | & 2389 | medium 2390 | & 2391 | int, position of subheader pointer in PAGEL := LOCL 2392 | \\ 2393 | \hline 2394 | 2395 | 18|34 2396 | & 2397 | 2|6 2398 | & 2399 | low 2400 | & 2401 | \emph{????????????} zero padding 2402 | \\ 2403 | \hline 2404 | 2405 | 20|40 2406 | & & 2407 | medium 2408 | & 2409 | Total length of subheader count vector, LSCV 2410 | \\ 2411 | \hline 2412 | \end{longtable*} 2413 | 2414 | The LOC1 and LOCL give the positions of the corresponding subheader pointer in PAGE1 and PAGEL, respectively. 
That is, if there are SC subheader pointers on page PAGE1, then the corresponding subheader pointer first occurs at the LOC1'th position in this array, enumerating from 1. If PAGE1=0, the subheader is not present. If PAGE1=PAGEL and LOC1=LOCL, the subheader appears exactly once. If PAGE1!=PAGEL or LOC1!=LOCL, the subheader appears 2 or more times. In all test files, PAGE1 <= PAGEL, and the corresponding subheaders appear only once per page. The variable \hyperref[nct]{NCT} in the \hyperref[row-size-subheader]{Row Size Subheader} should be used to ensure that all Column Text subheaders are located (and to avoid scanning through all pages in the file when all subheaders are already located). 2415 | 2416 | The first 7 binary signatures in the \hyperref[subheader-count-vectors]{Subheader Count Vectors} array are always: 2417 | 2418 | \setlength{\DUtablewidth}{\linewidth}% 2419 | \begin{longtable*}{|p{0.121\DUtablewidth}|p{0.249\DUtablewidth}|} 2420 | \hline 2421 | \textbf{% 2422 | signature 2423 | } & \textbf{% 2424 | description 2425 | } \\ 2426 | \hline 2427 | \endfirsthead 2428 | \hline 2429 | \textbf{% 2430 | signature 2431 | } & \textbf{% 2432 | description 2433 | } \\ 2434 | \hline 2435 | \endhead 2436 | \multicolumn{2}{c}{\hfill ... continued on next page} \\ 2437 | \endfoot 2438 | \endlastfoot 2439 | 2440 | -4 2441 | & 2442 | Column Attributes 2443 | \\ 2444 | \hline 2445 | 2446 | -3 2447 | & 2448 | Column Text 2449 | \\ 2450 | \hline 2451 | 2452 | -1 2453 | & 2454 | Column Names 2455 | \\ 2456 | \hline 2457 | 2458 | -2 2459 | & 2460 | Column List 2461 | \\ 2462 | \hline 2463 | 2464 | -5 2465 | & 2466 | unknown signature \#1 2467 | \\ 2468 | \hline 2469 | 2470 | -6 2471 | & 2472 | unknown signature \#2 2473 | \\ 2474 | \hline 2475 | 2476 | -7 2477 | & 2478 | unknown signature \#3 2479 | \\ 2480 | \hline 2481 | \end{longtable*} 2482 | 2483 | The remaining 5 out of 12 signatures are zeros in the observed source files. 
Presumably, these are for subheaders not yet defined, or not present in the collection of test files. 2484 | 2485 | \hyperref[column-format-and-label-subheader]{Column Format and Label Subheaders} may appear on multiple pages, but they are not indexed in Subheader Counts. The variables NCFL1 and NCFL2 in the \hyperref[row-size-subheader]{Row Size subheader} may be helpful if you want to know in advance if these appear across multiple pages. 2486 | 2487 | 2488 | \subsection{Column Text Subheader% 2489 | \label{column-text-subheader}% 2490 | } 2491 | 2492 | The column text subheader contains a block of text associated with columns, including the column names, labels, and formats. However, this subheader alone is not sufficient to parse this information. Other subheaders (e.g., the \hyperref[column-name-subheader]{column name subheader}), which point to specific elements within this subheader, are also needed. 2493 | 2494 | \setlength{\DUtablewidth}{\linewidth}% 2495 | \begin{longtable*}{|p{0.092\DUtablewidth}|p{0.081\DUtablewidth}|p{0.081\DUtablewidth}|p{0.697\DUtablewidth}|} 2496 | \hline 2497 | \textbf{% 2498 | offset 2499 | } & \textbf{% 2500 | length 2501 | } & \textbf{% 2502 | conf. 2503 | } & \textbf{% 2504 | description 2505 | } \\ 2506 | \hline 2507 | \endfirsthead 2508 | \hline 2509 | \textbf{% 2510 | offset 2511 | } & \textbf{% 2512 | length 2513 | } & \textbf{% 2514 | conf. 2515 | } & \textbf{% 2516 | description 2517 | } \\ 2518 | \hline 2519 | \endhead 2520 | \multicolumn{4}{c}{\hfill ... 
continued on next page} \\ 2521 | \endfoot 2522 | \endlastfoot 2523 | 2524 | 0 2525 | & 2526 | 4|8 2527 | & 2528 | high 2529 | & 2530 | int, signature -3 (xFDFFFFFF|xFDFFFFFFFFFFFFFF) 2531 | \\ 2532 | \hline 2533 | 2534 | 4|8 2535 | & 2536 | 2 2537 | & 2538 | medium 2539 | & 2540 | int, size of text block (QL - 16|20) 2541 | \\ 2542 | \hline 2543 | 2544 | 6|10 2545 | & 2546 | 2 2547 | & 2548 | low 2549 | & 2550 | \emph{????????????} 2551 | \\ 2552 | \hline 2553 | 2554 | 8|12 2555 | & 2556 | 2 2557 | & 2558 | low 2559 | & 2560 | \emph{????????????} 2561 | \\ 2562 | \hline 2563 | 2564 | 10|14 2565 | & 2566 | 2 2567 | & 2568 | low 2569 | & 2570 | \emph{????????????} 2571 | \\ 2572 | \hline 2573 | 2574 | 12|16 2575 | & 2576 | 2 2577 | & 2578 | low 2579 | & 2580 | \emph{????????????} 2581 | \\ 2582 | \hline 2583 | 2584 | 14|18 2585 | & 2586 | 2 2587 | & 2588 | low 2589 | & 2590 | \emph{????????????} 2591 | \\ 2592 | \hline 2593 | 2594 | 16|20 2595 | & 2596 | varies 2597 | & 2598 | medium 2599 | & 2600 | ascii, compression \& Creator PROC step name that generated data 2601 | \\ 2602 | \hline 2603 | 2604 | varies 2605 | & 2606 | \%QL 2607 | & 2608 | high 2609 | & 2610 | ascii, combined column names, labels, formats 2611 | \\ 2612 | \hline 2613 | \end{longtable*} 2614 | 2615 | This subheader sometimes appears more than once; each is a separate array. If so, the \textquotedbl{}column name index\textquotedbl{} field in \hyperref[column-name-pointers]{column name pointers} selects a particular text array - 0 for the first array, 1 for the second, etc. Similarly, \textquotedbl{}column format index\textquotedbl{} and \textquotedbl{}column label index\textquotedbl{} fields also select a text array. Offsets to strings within the text array are multiples of 4, so the column names and labels section of the array often contains many nulls for padding. 
2616 | 2617 | The variables LCS and LCP from the \hyperref[row-size-subheader]{Row Size subheader} refer to a text field at the start of the text array (at offset 16|20) in the first Column Text subheader (before the column name strings). This text field also contains compression information. The following logic decodes this initial field: 2618 | 2619 | \begin{enumerate} 2620 | \item If the first 8 bytes of the field are blank, file is not compressed, and set LCS=0. The Creator PROC step name is the LCP bytes starting at offset 16. 2621 | 2622 | \item If LCS > 0 (still), the file is not compressed, the first LCS bytes are the Creator Software string (padded with nulls). Set LCP=0. Stat/Transfer files use this pattern. 2623 | 2624 | \item If the first 8 bytes of the field are \texttt{SASYZCRL}, the file is compressed with Run Length Encoding. The Creator PROC step name is the LCP bytes starting at offset 24. 2625 | 2626 | \item If the first 8 bytes are nonblank and options 2 or 3 above are not used, this probably indicates COMPRESS=BINARY. We need test files to confirm this, though. 2627 | \end{enumerate} 2628 | 2629 | 2630 | \subsection{Column Name Subheader% 2631 | \label{column-name-subheader}% 2632 | } 2633 | 2634 | Column name subheaders contain a sequence of \hyperref[column-name-pointers]{column name pointers} to the offset of each column name \textbf{relative to a} \hyperref[column-text-subheader]{column text subheader}. There may be multiple column name subheaders, indexing into multiple column text subheaders. 2635 | 2636 | \setlength{\DUtablewidth}{\linewidth}% 2637 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.644\DUtablewidth}|} 2638 | \hline 2639 | \textbf{% 2640 | offset 2641 | } & \textbf{% 2642 | length 2643 | } & \textbf{% 2644 | conf. 
2645 | } & \textbf{% 2646 | description 2647 | } \\ 2648 | \hline 2649 | \endfirsthead 2650 | \hline 2651 | \textbf{% 2652 | offset 2653 | } & \textbf{% 2654 | length 2655 | } & \textbf{% 2656 | conf. 2657 | } & \textbf{% 2658 | description 2659 | } \\ 2660 | \hline 2661 | \endhead 2662 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2663 | \endfoot 2664 | \endlastfoot 2665 | 2666 | 0 2667 | & 2668 | 4|8 2669 | & 2670 | high 2671 | & 2672 | int, signature -1 (xFFFFFFFF|xFFFFFFFFFFFFFFFF) 2673 | \\ 2674 | \hline 2675 | 2676 | 4|8 2677 | & 2678 | 2 2679 | & 2680 | medium 2681 | & 2682 | int, length of remaining subheader (QL - 16|20) 2683 | \\ 2684 | \hline 2685 | 2686 | 6|10 2687 | & 2688 | 2 2689 | & 2690 | low 2691 | & 2692 | \emph{????????????} 2693 | \\ 2694 | \hline 2695 | 2696 | 8|12 2697 | & 2698 | 2 2699 | & 2700 | low 2701 | & 2702 | \emph{????????????} 2703 | \\ 2704 | \hline 2705 | 2706 | 10|14 2707 | & 2708 | 2 2709 | & 2710 | low 2711 | & 2712 | \emph{????????????} 2713 | \\ 2714 | \hline 2715 | 2716 | 12|16 2717 | & 2718 | 8*CMAX 2719 | & 2720 | medium 2721 | & 2722 | \hyperref[column-name-pointers]{column name pointers} (see below), CMAX=(QL-20|28)/8 2723 | \\ 2724 | \hline 2725 | 2726 | MCN 2727 | & 2728 | 8|12 2729 | & 2730 | low 2731 | & 2732 | zeros, 12|16 + 8*CMAX := MCN 2733 | \\ 2734 | \hline 2735 | \end{longtable*} 2736 | 2737 | Each column name subheader holds CMAX column name pointers. When there are multiple column name subheaders, CMAX will be less than NCOL. 2738 | 2739 | 2740 | \subsubsection{Column Name Pointers% 2741 | \label{column-name-pointers}% 2742 | } 2743 | 2744 | \setlength{\DUtablewidth}{\linewidth}% 2745 | \begin{longtable*}{|p{0.061\DUtablewidth}|p{0.061\DUtablewidth}|p{0.061\DUtablewidth}|p{0.767\DUtablewidth}|} 2746 | \hline 2747 | \textbf{% 2748 | offset 2749 | } & \textbf{% 2750 | length 2751 | } & \textbf{% 2752 | conf. 
2753 | } & \textbf{% 2754 | description 2755 | } \\ 2756 | \hline 2757 | \endfirsthead 2758 | \hline 2759 | \textbf{% 2760 | offset 2761 | } & \textbf{% 2762 | length 2763 | } & \textbf{% 2764 | conf. 2765 | } & \textbf{% 2766 | description 2767 | } \\ 2768 | \hline 2769 | \endhead 2770 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2771 | \endfoot 2772 | \endlastfoot 2773 | 2774 | 0 2775 | & 2776 | 2 2777 | & 2778 | high 2779 | & 2780 | int, column name index to select \hyperref[column-text-subheader]{Column Text Subheader} 2781 | \\ 2782 | \hline 2783 | 2784 | 2 2785 | & 2786 | 2 2787 | & 2788 | high 2789 | & 2790 | int, column name offset w.r.t. end of selected Column Text signature. Always a multiple of 4. 2791 | \\ 2792 | \hline 2793 | 2794 | 4 2795 | & 2796 | 2 2797 | & 2798 | high 2799 | & 2800 | int, column name length 2801 | \\ 2802 | \hline 2803 | 2804 | 6 2805 | & 2806 | 2 2807 | & 2808 | low 2809 | & 2810 | zeros 2811 | \\ 2812 | \hline 2813 | 2814 | 8 2815 | & & 2816 | high 2817 | & 2818 | Total length of column name pointer 2819 | \\ 2820 | \hline 2821 | \end{longtable*} 2822 | 2823 | 2824 | \subsection{Column Attributes Subheader% 2825 | \label{column-attributes-subheader}% 2826 | } 2827 | 2828 | The column attribute subheader holds information regarding the column offsets within a data row, the column widths, and the column types (either numeric or character). The column attribute subheader sometimes occurs more than once (in test data). In these cases, column attributes are applied in the order they are parsed. 2829 | 2830 | \setlength{\DUtablewidth}{\linewidth}% 2831 | \begin{longtable*}{|p{0.080\DUtablewidth}|p{0.099\DUtablewidth}|p{0.071\DUtablewidth}|p{0.700\DUtablewidth}|} 2832 | \hline 2833 | \textbf{% 2834 | offset 2835 | } & \textbf{% 2836 | length 2837 | } & \textbf{% 2838 | conf. 
2839 | } & \textbf{% 2840 | description 2841 | } \\ 2842 | \hline 2843 | \endfirsthead 2844 | \hline 2845 | \textbf{% 2846 | offset 2847 | } & \textbf{% 2848 | length 2849 | } & \textbf{% 2850 | conf. 2851 | } & \textbf{% 2852 | description 2853 | } \\ 2854 | \hline 2855 | \endhead 2856 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2857 | \endfoot 2858 | \endlastfoot 2859 | 2860 | 0 2861 | & 2862 | 4|8 2863 | & 2864 | high 2865 | & 2866 | int, signature -4 (hex xFCFFFFFF|xFCFFFFFFFFFFFFFF) 2867 | \\ 2868 | \hline 2869 | 2870 | 4|8 2871 | & 2872 | 2 2873 | & 2874 | medium 2875 | & 2876 | int, length of remaining subheader 2877 | \\ 2878 | \hline 2879 | 2880 | 6|10 2881 | & 2882 | 2 2883 | & 2884 | low 2885 | & 2886 | \emph{????????????} 2887 | \\ 2888 | \hline 2889 | 2890 | 8|12 2891 | & 2892 | 2 2893 | & 2894 | low 2895 | & 2896 | \emph{????????????} 2897 | \\ 2898 | \hline 2899 | 2900 | 10|14 2901 | & 2902 | 2 2903 | & 2904 | low 2905 | & 2906 | \emph{????????????} 2907 | \\ 2908 | \hline 2909 | 2910 | 12|16 2911 | & 2912 | LCAV*CMAX 2913 | & 2914 | high 2915 | & 2916 | \hyperref[column-attribute-vectors]{column attribute vectors} (see below), CMAX=(QL-20|28)/LCAV, LCAV=12|16 2917 | \\ 2918 | \hline 2919 | 2920 | MCA 2921 | & 2922 | 8|12 2923 | & 2924 | low 2925 | & 2926 | MCA = 12|16 + LCAV*CMAX 2927 | \\ 2928 | \hline 2929 | \end{longtable*} 2930 | 2931 | 2932 | \subsubsection{Column Attribute Vectors% 2933 | \label{column-attribute-vectors}% 2934 | } 2935 | 2936 | \setlength{\DUtablewidth}{\linewidth}% 2937 | \begin{longtable*}{|p{0.179\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.563\DUtablewidth}|} 2938 | \hline 2939 | \textbf{% 2940 | offset 2941 | } & \textbf{% 2942 | length 2943 | } & \textbf{% 2944 | conf. 2945 | } & \textbf{% 2946 | description 2947 | } \\ 2948 | \hline 2949 | \endfirsthead 2950 | \hline 2951 | \textbf{% 2952 | offset 2953 | } & \textbf{% 2954 | length 2955 | } & \textbf{% 2956 | conf. 
2957 | } & \textbf{% 2958 | description 2959 | } \\ 2960 | \hline 2961 | \endhead 2962 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 2963 | \endfoot 2964 | \endlastfoot 2965 | 2966 | 0 2967 | & 2968 | 4|8 2969 | & 2970 | high 2971 | & 2972 | int, column offset in data row (in bytes) 2973 | \\ 2974 | \hline 2975 | 2976 | 4|8 2977 | & 2978 | 4 2979 | & 2980 | high 2981 | & 2982 | int, column width 2983 | \\ 2984 | \hline 2985 | 2986 | 8|12 2987 | & 2988 | 2 2989 | & 2990 | low 2991 | & 2992 | name length flag 2993 | \\ 2994 | \hline 2995 | 2996 | 10|14 2997 | & 2998 | 1 2999 | & 3000 | high 3001 | & 3002 | int, column type (1 = numeric, 2 = character) 3003 | \\ 3004 | \hline 3005 | 3006 | 11|15 3007 | & 3008 | 1 3009 | & 3010 | low 3011 | & 3012 | \emph{????????????} 3013 | \\ 3014 | \hline 3015 | 3016 | 12|16 3017 | & & 3018 | high 3019 | & 3020 | Total length of column attribute vector, LCAV 3021 | \\ 3022 | \hline 3023 | \end{longtable*} 3024 | 3025 | Observed values of name length flag in the source files: 3026 | 3027 | \setlength{\DUtablewidth}{\linewidth}% 3028 | \begin{longtable*}{|p{0.183\DUtablewidth}|p{0.757\DUtablewidth}|} 3029 | \hline 3030 | \textbf{% 3031 | name length flag 3032 | } & \textbf{% 3033 | description 3034 | } \\ 3035 | \hline 3036 | \endfirsthead 3037 | \hline 3038 | \textbf{% 3039 | name length flag 3040 | } & \textbf{% 3041 | description 3042 | } \\ 3043 | \hline 3044 | \endhead 3045 | \multicolumn{2}{c}{\hfill ... 
continued on next page} \\ 3046 | \endfoot 3047 | \endlastfoot 3048 | 3049 | 4 3050 | & 3051 | name length <= 8 3052 | \\ 3053 | \hline 3054 | 3055 | 1024 3056 | & 3057 | usually means name length <= 8 , but sometimes the length is 9-12 3058 | \\ 3059 | \hline 3060 | 3061 | 2048 3062 | & 3063 | name length > 8 3064 | \\ 3065 | \hline 3066 | 3067 | 2560 3068 | & 3069 | name length > 8 3070 | \\ 3071 | \hline 3072 | \end{longtable*} 3073 | 3074 | 3075 | \subsection{Column Format and Label Subheader% 3076 | \label{column-format-and-label-subheader}% 3077 | } 3078 | 3079 | The column format and label subheader contains pointers to a column format and label \textbf{relative to a} \hyperref[column-text-subheader]{column text subheader}. Since the column label subheader only contains information regarding a single column, there are typically as many of these subheaders as columns. The structure of column format pointers was contributed by Clint Cummins. 3080 | 3081 | \setlength{\DUtablewidth}{\linewidth}% 3082 | \begin{longtable*}{|p{0.071\DUtablewidth}|p{0.071\DUtablewidth}|p{0.063\DUtablewidth}|p{0.746\DUtablewidth}|} 3083 | \hline 3084 | \textbf{% 3085 | offset 3086 | } & \textbf{% 3087 | length 3088 | } & \textbf{% 3089 | conf. 3090 | } & \textbf{% 3091 | description 3092 | } \\ 3093 | \hline 3094 | \endfirsthead 3095 | \hline 3096 | \textbf{% 3097 | offset 3098 | } & \textbf{% 3099 | length 3100 | } & \textbf{% 3101 | conf. 3102 | } & \textbf{% 3103 | description 3104 | } \\ 3105 | \hline 3106 | \endhead 3107 | \multicolumn{4}{c}{\hfill ... 
continued on next page} \\ 3108 | \endfoot 3109 | \endlastfoot 3110 | 3111 | 0 3112 | & 3113 | 4|8 3114 | & 3115 | high 3116 | & 3117 | int, signature -1026 (hex FEFB \& 2 or 6 FFs) 3118 | \\ 3119 | \hline 3120 | 3121 | 4|8 3122 | & 3123 | 30|38 3124 | & 3125 | low 3126 | & 3127 | \emph{????????????} 3128 | \\ 3129 | \hline 3130 | 3131 | 34|46 3132 | & 3133 | 2 3134 | & 3135 | high 3136 | & 3137 | int, column format index to select \hyperref[column-text-subheader]{Column Text Subheader} 3138 | \\ 3139 | \hline 3140 | 3141 | 36|48 3142 | & 3143 | 2 3144 | & 3145 | high 3146 | & 3147 | int, column format offset w.r.t. end of selected Column Text signature. A multiple of 4. 3148 | \\ 3149 | \hline 3150 | 3151 | 38|50 3152 | & 3153 | 2 3154 | & 3155 | high 3156 | & 3157 | int, column format length 3158 | \\ 3159 | \hline 3160 | 3161 | 40|52 3162 | & 3163 | 2 3164 | & 3165 | high 3166 | & 3167 | int, column label index to select \hyperref[column-text-subheader]{Column Text Subheader} 3168 | \\ 3169 | \hline 3170 | 3171 | 42|54 3172 | & 3173 | 2 3174 | & 3175 | high 3176 | & 3177 | int, column label offset w.r.t. end of selected Column Text signature. A multiple of 4. 3178 | \\ 3179 | \hline 3180 | 3181 | 44|56 3182 | & 3183 | 2 3184 | & 3185 | high 3186 | & 3187 | int, column label length 3188 | \\ 3189 | \hline 3190 | 3191 | 46|58 3192 | & 3193 | 6 3194 | & 3195 | low 3196 | & 3197 | \emph{????????????} 3198 | \\ 3199 | \hline 3200 | 3201 | 52|64 3202 | & & 3203 | medium 3204 | & 3205 | Total length of subheader, QL 3206 | \\ 3207 | \hline 3208 | \end{longtable*} 3209 | 3210 | 3211 | \subsection{Column List Subheader% 3212 | \label{column-list-subheader}% 3213 | } 3214 | 3215 | The purpose of this subheader is not clear. But the structure is partly identified. Information related to this subheader was contributed by Clint Cummins. eyecarex (created by Stat/Transfer) does not have this subheader. 
3216 | 3217 | \setlength{\DUtablewidth}{\linewidth}% 3218 | \begin{longtable*}{|p{0.098\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.610\DUtablewidth}|} 3219 | \hline 3220 | \textbf{% 3221 | offset 3222 | } & \textbf{% 3223 | length 3224 | } & \textbf{% 3225 | conf. 3226 | } & \textbf{% 3227 | description 3228 | } \\ 3229 | \hline 3230 | \endfirsthead 3231 | \hline 3232 | \textbf{% 3233 | offset 3234 | } & \textbf{% 3235 | length 3236 | } & \textbf{% 3237 | conf. 3238 | } & \textbf{% 3239 | description 3240 | } \\ 3241 | \hline 3242 | \endhead 3243 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 3244 | \endfoot 3245 | \endlastfoot 3246 | 3247 | 0 3248 | & 3249 | 4|8 3250 | & 3251 | high 3252 | & 3253 | int, signature -2 (hex FE \& 3 or 7 FFs) 3254 | \\ 3255 | \hline 3256 | 3257 | 4|8 3258 | & 3259 | 2 3260 | & 3261 | low 3262 | & 3263 | int, value close to offset in subheader pointer 3264 | \\ 3265 | \hline 3266 | 3267 | 6|10 3268 | & 3269 | 6 3270 | & 3271 | low 3272 | & 3273 | \emph{????????????} 3274 | \\ 3275 | \hline 3276 | 3277 | 12|16 3278 | & 3279 | 4|8 3280 | & 3281 | medium 3282 | & 3283 | int, length of remaining subheader 3284 | \\ 3285 | \hline 3286 | 3287 | 16|24 3288 | & 3289 | 2 3290 | & 3291 | low 3292 | & 3293 | int, usually equals NCOL 3294 | \\ 3295 | \hline 3296 | 3297 | 18|26 3298 | & 3299 | 2 3300 | & 3301 | medium 3302 | & 3303 | int, length of column list := CL, usually CL > NCOL 3304 | \\ 3305 | \hline 3306 | 3307 | 20|28 3308 | & 3309 | 2 3310 | & 3311 | low 3312 | & 3313 | int, usually 1 3314 | \\ 3315 | \hline 3316 | 3317 | 22|30 3318 | & 3319 | 2 3320 | & 3321 | low 3322 | & 3323 | int, usually equals NCOL 3324 | \\ 3325 | \hline 3326 | 3327 | 24|32 3328 | & 3329 | 2 3330 | & 3331 | low 3332 | & 3333 | int, usually 3 equal values 3334 | \\ 3335 | \hline 3336 | 3337 | 26|34 3338 | & 3339 | 2 3340 | & 3341 | low 3342 | & 3343 | int, usually 3 equal values 3344 | \\ 3345 | \hline 3346 | 3347 | 28|36 3348 | & 
3349 | 2 3350 | & 3351 | low 3352 | & 3353 | int, usually 3 equal values 3354 | \\ 3355 | \hline 3356 | 3357 | 30|38 3358 | & 3359 | 2*CL 3360 | & 3361 | medium 3362 | & 3363 | \hyperref[column-list-values]{column list values} (see below) 3364 | \\ 3365 | \hline 3366 | 3367 | MCL 3368 | & 3369 | 8 3370 | & 3371 | low 3372 | & 3373 | usually zeros, 30|38 + 2*CL := MCL 3374 | \\ 3375 | \hline 3376 | \end{longtable*} 3377 | 3378 | 3379 | \subsubsection{Column List Values% 3380 | \label{column-list-values}% 3381 | } 3382 | 3383 | These values are 2 byte integers, with (CL-NCOL) zero values. Each nonzero value is unique, between -NCOL and NCOL. The significance of signedness and ordering is unknown. The values do not correspond to a sorting order of columns. 3384 | 3385 | 3386 | \subsection{Compressed Binary Data Subheader% 3387 | \label{compressed-binary-data-subheader}% 3388 | } 3389 | 3390 | When a SAS7BDAT file is created by SAS with the option COMPRESS=CHAR or COMPRESS=YES, each row of data is compressed independently with a Run Length Encoding (RLE) structure. This yields a variable length compressed row. Each such row is stored in a single subheader in sequential order, indexed by the \hyperref[subheader-pointers]{subheader pointers}. A RLE compressed data row is identified by COMP=4 in the subheader pointer, and does not have a subheader signature. If a particular row had highly variable data and yielded no compression, it is still stored in a subheader, but uncompressed with COMP=0 instead of COMP=4. The test file \texttt{compress\_yes.sas7bdat} has such highly variable (random) data and all its rows are in this COMP=0 form of subheaders. It takes up more space than the uncompressed version \texttt{compress\_no.sas7bdat}, due to the extra length of the subheader pointers. The final subheader on a page is usually COMP=1, which indicates a truncated row to be ignored; the complete data row appears on the next page. 
3391 | 3392 | The SAS option COMPRESS=BINARY apparently uses an RDC (Ross Data Compression) structure instead of RLE. We need more test files to investigate this structure, and only document RLE at present. 3393 | 3394 | 3395 | \subsubsection{Run Length Encoding% 3396 | \label{run-length-encoding}% 3397 | } 3398 | 3399 | In RLE, the compressed row data is a series of control bytes, each optionally followed by data bytes. The control byte specifies how the data bytes are interpreted, or is self contained. The control byte has 2 parts - the upper 4 bits are the Command, and the lower 4 bits are the Length. Each is a uint in the range 0-15. For example, control byte 82 (hex) is Command 8 and Length 2, and control byte F4 (hex) is Command 15 (F hex) and Length 4. We have identified the functions of the 11 different Command values which are observed in the test files. The RLE structure was contributed by Clint Cummins. 3400 | 3401 | \setlength{\DUtablewidth}{\linewidth}% 3402 | \begin{longtable*}{|p{0.052\DUtablewidth}|p{0.046\DUtablewidth}|p{0.087\DUtablewidth}|p{0.765\DUtablewidth}|} 3403 | \hline 3404 | \textbf{% 3405 | Command 3406 | } & \textbf{% 3407 | Length 3408 | } & \textbf{% 3409 | Name 3410 | } & \textbf{% 3411 | Function 3412 | } \\ 3413 | \hline 3414 | \endfirsthead 3415 | \hline 3416 | \textbf{% 3417 | Command 3418 | } & \textbf{% 3419 | Length 3420 | } & \textbf{% 3421 | Name 3422 | } & \textbf{% 3423 | Function 3424 | } \\ 3425 | \hline 3426 | \endhead 3427 | \multicolumn{4}{c}{\hfill ... continued on next page} \\ 3428 | \endfoot 3429 | \endlastfoot 3430 | 3431 | 0 3432 | & 3433 | 0 3434 | & 3435 | Copy64 3436 | & 3437 | using the first byte as a uint length L (0-255), Copy the next N=64+L bytes from the input to the output (copies 64 to 319 bytes) 3438 | \\ 3439 | \hline 3440 | 3441 | 1 3442 | & 3443 | ? 3444 | & 3445 | ? 3446 | & 3447 | \emph{????????????} (not observed in test files) 3448 | \\ 3449 | \hline 3450 | 3451 | 2 3452 | & 3453 | ? 
3454 | & 3455 | ? 3456 | & 3457 | \emph{????????????} (not observed in test files) 3458 | \\ 3459 | \hline 3460 | 3461 | 3 3462 | & 3463 | ? 3464 | & 3465 | ? 3466 | & 3467 | \emph{????????????} (not observed in test files) 3468 | \\ 3469 | \hline 3470 | 3471 | 4 3472 | & 3473 | ? 3474 | & 3475 | ? 3476 | & 3477 | \emph{????????????} (not observed in test files) 3478 | \\ 3479 | \hline 3480 | 3481 | 5 3482 | & 3483 | ? 3484 | & 3485 | ? 3486 | & 3487 | \emph{????????????} (not observed in test files) 3488 | \\ 3489 | \hline 3490 | 3491 | 6 3492 | & 3493 | 0 3494 | & 3495 | InsertBlank17 3496 | & 3497 | using the first byte as a uint length L, Insert N=17+L blanks (decimal 32, hex 20) in the output (inserts 17 to 272 blanks) 3498 | \\ 3499 | \hline 3500 | 3501 | 7 3502 | & 3503 | 0 3504 | & 3505 | InsertZero17 3506 | & 3507 | using the first byte as a uint length L, Insert N=17+L zero bytes in the output 3508 | \\ 3509 | \hline 3510 | 3511 | 8 3512 | & 3513 | L 3514 | & 3515 | Copy1 3516 | & 3517 | using the Length bits as a uint length L (0-15), Copy the next N=1+L bytes from the input to the output (copies 1 to 16 bytes) 3518 | \\ 3519 | \hline 3520 | 3521 | 9 3522 | & 3523 | L 3524 | & 3525 | Copy17 3526 | & 3527 | Copy the next N=17+L bytes from the input to the output (copies 17 to 32 bytes) 3528 | \\ 3529 | \hline 3530 | 3531 | 10 (A) 3532 | & 3533 | L 3534 | & 3535 | Copy33 3536 | & 3537 | Copy the next N=33+L bytes from the input to the output (copies 33 to 48 bytes) 3538 | \\ 3539 | \hline 3540 | 3541 | 11 (B) 3542 | & 3543 | L 3544 | & 3545 | Copy49 3546 | & 3547 | Copy the next N=49+L bytes from the input to the output (copies 49 to 64 bytes) 3548 | \\ 3549 | \hline 3550 | 3551 | 12 (C) 3552 | & 3553 | L 3554 | & 3555 | InsertByte3 3556 | & 3557 | Insert N=3+L copies of the next byte in the output (inserts 3 to 18 bytes) 3558 | \\ 3559 | \hline 3560 | 3561 | 13 (D) 3562 | & 3563 | L 3564 | & 3565 | Insert@2 3566 | & 3567 | Insert N=2+L @ (decimal 64, hex 
40) bytes in the output (inserts 2 to 17 @ bytes) 3568 | \\ 3569 | \hline 3570 | 3571 | 14 (E) 3572 | & 3573 | L 3574 | & 3575 | InsertBlank2 3576 | & 3577 | Insert N=2+L blanks in the output 3578 | \\ 3579 | \hline 3580 | 3581 | 15 (F) 3582 | & 3583 | L 3584 | & 3585 | InsertZero2 3586 | & 3587 | Insert N=2+L zero bytes in the output 3588 | \\ 3589 | \hline 3590 | \end{longtable*} 3591 | 3592 | The most common Commands in \texttt{obs\_all\_perf\_1.sas7bdat} are F and 8 (alternating). This file is entirely 8 byte doubles, so the F commands often handle consecutive zero bytes in zero value doubles. 3593 | 3594 | 3595 | \subsubsection{RLE Example 1% 3596 | \label{rle-example-1}% 3597 | } 3598 | 3599 | Compressed data row: 3600 | 3601 | \texttt{87 A B C D E F G H F2 8A 1 2 3 4 5 6 7 8 9 A B D0 A1 a b c d e f g ... z} 3602 | 3603 | \texttt{CB -8-data-bytes-{}- CB CB -{}-11-data-bytes-{}-{}-{}-{}-{}- CB CB -{}-34-data-bytes-{}-} 3604 | 3605 | \texttt{Copy1 ~ ~ ~ ~ ~ ~ ~InsertZero2 ~ ~ ~ ~ ~ ~ ~ ~ Ins Copy33 next 34 bytes} 3606 | 3607 | \texttt{Next 8 bytes ~ ~ ~ 4 00h bytes ~ ~ ~ ~ ~ ~ ~ ~ 2 40h} 3608 | 3609 | There are 5 Control Bytes (CB) in the above sequence. 3610 | 3611 | \begin{enumerate} 3612 | \item 87: Copy1 next 8 bytes 3613 | 3614 | \item F2: InsertZero2 4 00h bytes 3615 | 3616 | \item 8A: Copy1 next 11 bytes 3617 | 3618 | \item D0: Insert@2 2 40h bytes 3619 | 3620 | \item A1: Copy33 next 34 bytes 3621 | \end{enumerate} 3622 | 3623 | Output uncompressed row: 3624 | 3625 | \texttt{A B C D E F G H 00 00 00 00 1 2 3 4 5 6 7 8 9 A B 40 40 a b c ... z} 3626 | 3627 | 3628 | \subsubsection{RLE Example 2% 3629 | \label{rle-example-2}% 3630 | } 3631 | 3632 | Compressed data row: 3633 | 3634 | \texttt{87 A B C D E F G H C1 99 A5 a b c ... 
z} 3635 | 3636 | \texttt{CB -8-data-bytes-{}- CB ar CB -last-bytes} 3637 | 3638 | \texttt{Copy1 8 ~ ~ ~ ~ ~ ~InsBy Copy33 38 bytes} 3639 | 3640 | Control Bytes in Example 2: 3641 | 3642 | \begin{enumerate} 3643 | \item 87: Copy1 next 8 bytes 3644 | 3645 | \item C1,99: InsertByte3 4 99h bytes 3646 | 3647 | \item A5: Copy33 next 38 bytes 3648 | \end{enumerate} 3649 | 3650 | Output uncompressed row: 3651 | 3652 | \texttt{A B C D E F G H 99 99 99 99 a b c ... z} 3653 | 3654 | Once a data row is uncompressed, use the \hyperref[sas7bdat-packed-binary-data]{SAS7BDAT Packed Binary Data} description below to read the variables. 3655 | 3656 | 3657 | \section{SAS7BDAT Packed Binary Data% 3658 | \label{sas7bdat-packed-binary-data}% 3659 | } 3660 | 3661 | SAS7BDAT packed binary data are uncompressed, and appear after any subheaders on the page; see the \hyperref[page-offset-table]{Page Offset Table}. These data are stored by rows, where the size of a row (in bytes) is defined by the \hyperref[row-size-subheader]{row size subheader}. When multiple rows occur on a single page, they are immediately adjacent. When a database contains many rows, it is typical that the collection of rows (i.e. their data) is evenly distributed to a number of 'data' pages. However, in test files, no single row's data is broken across two or more pages. A single data row is parsed by interpreting the binary data according to the collection of column attributes contained in the \hyperref[column-attributes-subheader]{column attributes subheader}. Binary data can be interpreted in two ways, as ASCII characters, or as floating point numbers. The column width attribute specifies the number of bytes associated with a column. For character data, this interpretation is straight-forward. For numeric data, interpretation of the column width is more complex. 3662 | 3663 | The common binary representation of floating point numbers has three parts: the sign (\texttt{s}), exponent (\texttt{e}), and mantissa (\texttt{m}). 
The corresponding floating point number is \texttt{s * m * b \textasciicircum{} e}, where \texttt{b} is the base (2 for binary, 10 for decimal). Under the IEEE 754 floating point standard, the sign, exponent, and mantissa are encoded by 1, 11, and 52 bits respectively, totaling 8 bytes. In SAS7BDAT files, numeric quantities can be 3, 4, 5, 6, 7, or 8 bytes in length. For numeric quantities of less than 8 bytes, the remaining number of bytes are truncated from the least significant part of the mantissa. Hence, the minimum and maximum numeric values are identical for all byte lengths, but shorter numeric values have reduced precision. 3664 | 3665 | Reduction in precision is characterized by the largest integer such that it and all smaller integers have an exact representation, denoted \texttt{M}. At best, all integers greater than \texttt{M} are approximated to the nearest multiple of \texttt{b}. The table of \hyperref[numeric-binary-formats]{numeric binary formats} below lists \texttt{M} values and describes how bits are distributed among the six possible column widths in SAS7BDAT files. 
3666 | 3667 | 3668 | \subsection{Numeric Binary Formats% 3669 | \label{numeric-binary-formats}% 3670 | } 3671 | 3672 | \setlength{\DUtablewidth}{\linewidth}% 3673 | \begin{longtable*}{|p{0.075\DUtablewidth}|p{0.075\DUtablewidth}|p{0.063\DUtablewidth}|p{0.110\DUtablewidth}|p{0.110\DUtablewidth}|p{0.203\DUtablewidth}|} 3674 | \hline 3675 | \textbf{% 3676 | size 3677 | } & \textbf{% 3678 | bytes 3679 | } & \textbf{% 3680 | sign 3681 | } & \textbf{% 3682 | exponent 3683 | } & \textbf{% 3684 | mantissa 3685 | } & \textbf{% 3686 | \texttt{M} 3687 | } \\ 3688 | \hline 3689 | \endfirsthead 3690 | \hline 3691 | \textbf{% 3692 | size 3693 | } & \textbf{% 3694 | bytes 3695 | } & \textbf{% 3696 | sign 3697 | } & \textbf{% 3698 | exponent 3699 | } & \textbf{% 3700 | mantissa 3701 | } & \textbf{% 3702 | \texttt{M} 3703 | } \\ 3704 | \hline 3705 | \endhead 3706 | \multicolumn{6}{c}{\hfill ... continued on next page} \\ 3707 | \endfoot 3708 | \endlastfoot 3709 | 3710 | 24bit 3711 | & 3712 | 3 3713 | & 3714 | 1 3715 | & 3716 | 11 3717 | & 3718 | 12 3719 | & 3720 | 8192 3721 | \\ 3722 | \hline 3723 | 3724 | 32bit 3725 | & 3726 | 4 3727 | & 3728 | 1 3729 | & 3730 | 11 3731 | & 3732 | 20 3733 | & 3734 | 2097152 3735 | \\ 3736 | \hline 3737 | 3738 | 40bit 3739 | & 3740 | 5 3741 | & 3742 | 1 3743 | & 3744 | 11 3745 | & 3746 | 28 3747 | & 3748 | 536870912 3749 | \\ 3750 | \hline 3751 | 3752 | 48bit 3753 | & 3754 | 6 3755 | & 3756 | 1 3757 | & 3758 | 11 3759 | & 3760 | 36 3761 | & 3762 | 137438953472 3763 | \\ 3764 | \hline 3765 | 3766 | 56bit 3767 | & 3768 | 7 3769 | & 3770 | 1 3771 | & 3772 | 11 3773 | & 3774 | 44 3775 | & 3776 | 35184372088832 3777 | \\ 3778 | \hline 3779 | 3780 | 64bit 3781 | & 3782 | 8 3783 | & 3784 | 1 3785 | & 3786 | 11 3787 | & 3788 | 52 3789 | & 3790 | 9007199254740992 3791 | \\ 3792 | \hline 3793 | \end{longtable*} 3794 | 3795 | 3796 | \subsection{Dates, Currency, and Formatting% 3797 | \label{dates-currency-and-formatting}% 3798 | } 3799 | 3800 | Column 
formatting information is encoded within the \hyperref[column-text-subheader]{Column Text Subheader} and \hyperref[column-format-and-label-subheader]{Column Format and Label Subheader}. Columns with formatting information have special meaning and interpretation. For example, numeric values may represent dates, encoded as the number of seconds since midnight, January 1, 1960. The format string for fields encoded this way is \textquotedbl{}DATETIME\textquotedbl{}. Using R, these values may be converted using the as.POSIXct or as.POSIXlt functions with argument \texttt{origin=\textquotedbl{}1960-01-01\textquotedbl{}}. The most common date format strings correspond to numeric fields, and are interpreted as follows: 3801 | 3802 | \setlength{\DUtablewidth}{\linewidth}% 3803 | \begin{longtable*}{|p{0.110\DUtablewidth}|p{0.470\DUtablewidth}|p{0.156\DUtablewidth}|} 3804 | \hline 3805 | \textbf{% 3806 | Format 3807 | } & \textbf{% 3808 | Interpretation 3809 | } & \textbf{% 3810 | R Function 3811 | } \\ 3812 | \hline 3813 | \endfirsthead 3814 | \hline 3815 | \textbf{% 3816 | Format 3817 | } & \textbf{% 3818 | Interpretation 3819 | } & \textbf{% 3820 | R Function 3821 | } \\ 3822 | \hline 3823 | \endhead 3824 | \multicolumn{3}{c}{\hfill ... continued on next page} \\ 3825 | \endfoot 3826 | \endlastfoot 3827 | 3828 | DATE 3829 | & 3830 | Number of days since January 1, 1960 3831 | & 3832 | chron::chron 3833 | \\ 3834 | \hline 3835 | 3836 | TIME 3837 | & 3838 | Number of seconds since midnight 3839 | & 3840 | as.POSIXct 3841 | \\ 3842 | \hline 3843 | 3844 | DATETIME 3845 | & 3846 | Number of seconds since January 1, 1960 3847 | & 3848 | as.POSIXct 3849 | \\ 3850 | \hline 3851 | \end{longtable*} 3852 | 3853 | There are many additional format strings for numeric and character fields. 
3854 | 3855 | 3856 | \section{Platform Differences% 3857 | \label{platform-differences}% 3858 | } 3859 | 3860 | The test files referenced in \texttt{data/sas7bdat.sources.RData} were examined over a period of time. Files with non-Microsoft Windows markings were only observed late into the writing of this document. Consequently (but not intentionally), the SAS7BDAT description above was first deduced for SAS datasets generated on the most commonly observed platform: Microsoft Windows. The extensions to SAS7BDAT files for \textbf{u64} and non-Intel formats were contributed a little later by Clint Cummins. 3861 | 3862 | In particular, the files \texttt{natlterr1994.sas7bdat}, \texttt{natlterr2006.sas7bdat} appear to be generated on the 'SunOS' platform (\textbf{u64}, non-Intel). \texttt{txzips.sas7bdat} was created on Linux 64-bit SAS server (\textbf{u64}, Intel). \texttt{eyecarex.sas7bdat} is non-Intel, possibly 32-bit PowerPC. 3863 | 3864 | The files \texttt{cfrance2.sas7bdat}, \texttt{cfrance.sas7bdat}, \texttt{coutline.sas7bdat}, \texttt{gfrance2.sas7bdat}, \texttt{gfrance.sas7bdat}, \texttt{goutline.sas7bdat}, \texttt{xfrance2.sas7bdat}, \texttt{xfrance.sas7bdat}, \texttt{xoutline.sas7bdat} appear to be generated on a 32-bit 'Linux' Intel system. They have the same format as Windows files, except for the (ignorable) OS strings in the first header. 3865 | 3866 | Text may appear in non-ASCII compatible, partially ASCII compatible, or multi-byte encodings. In particular, Kasper Sorenson discovered some text that appears to be encoded using the Windows-1252 'code page'. 
3867 | 3868 | \textbf{Key Test Files} 3869 | 3870 | \setlength{\DUtablewidth}{\linewidth}% 3871 | \begin{longtable*}{|p{0.301\DUtablewidth}|p{0.639\DUtablewidth}|} 3872 | \hline 3873 | \textbf{% 3874 | filename 3875 | } & \textbf{% 3876 | format features 3877 | } \\ 3878 | \hline 3879 | \endfirsthead 3880 | \hline 3881 | \textbf{% 3882 | filename 3883 | } & \textbf{% 3884 | format features 3885 | } \\ 3886 | \hline 3887 | \endhead 3888 | \multicolumn{2}{c}{\hfill ... continued on next page} \\ 3889 | \endfoot 3890 | \endlastfoot 3891 | 3892 | \texttt{acadindx.sas7bdat} 3893 | & 3894 | non-u64, Intel (most files are like this one) 3895 | \\ 3896 | \hline 3897 | 3898 | \texttt{br.sas7bdat} 3899 | & 3900 | truncated doubles (widths 3,4,6; compare with br2 widths all 8) 3901 | \\ 3902 | \hline 3903 | 3904 | \texttt{eyecarex.sas7bdat} 3905 | & 3906 | non-u64, non-Intel, written by Stat/Transfer 3907 | \\ 3908 | \hline 3909 | 3910 | \texttt{txzips.sas7bdat} 3911 | & 3912 | u64, Intel 3913 | \\ 3914 | \hline 3915 | 3916 | \texttt{natlterr1994.sas7bdat} 3917 | & 3918 | u64, non-Intel 3919 | \\ 3920 | \hline 3921 | 3922 | \texttt{hltheds2006.sas7bdat} 3923 | & 3924 | 2 Column Attributes subheaders 3925 | \\ 3926 | \hline 3927 | 3928 | \texttt{moshim.sas7bdat} 3929 | & 3930 | 3 Column Attributes subheaders 3931 | \\ 3932 | \hline 3933 | 3934 | \texttt{flightdelays.sas7bdat} 3935 | & 3936 | 2 Column Text subheaders 3937 | \\ 3938 | \hline 3939 | 3940 | \texttt{ymcls\_p2\_long\_040506.sas7bdat} 3941 | & 3942 | 5 Column Text subheaders, first Column Attributes subheader is on page 6 3943 | \\ 3944 | \hline 3945 | 3946 | \texttt{flightschedule.sas7bdat} 3947 | & 3948 | 2+ Column Text subheaders 3949 | \\ 3950 | \hline 3951 | 3952 | \texttt{internationalflight.sas7bdat} 3953 | & 3954 | 2+ Column Text subheaders 3955 | \\ 3956 | \hline 3957 | 3958 | \texttt{marchflights.sas7bdat} 3959 | & 3960 | 2+ Column Text subheaders 3961 | \\ 3962 | \hline 3963 | 3964 | 
\texttt{mechanicslevel1.sas7bdat} 3965 | & 3966 | 2+ Column Text subheaders 3967 | \\ 3968 | \hline 3969 | 3970 | \texttt{compress\_yes.sas7bdat} 3971 | & 3972 | COMPRESS=CHAR, one PGTYPE=-28672, no RLE compression (COMP=0) 3973 | \\ 3974 | \hline 3975 | 3976 | \texttt{obs\_all\_perf\_1.sas7bdat} 3977 | & 3978 | COMPRESS=CHAR, many PGTYPE=16384, much RLE compression (COMP=4) 3979 | \\ 3980 | \hline 3981 | \end{longtable*} 3982 | 3983 | 3984 | \section{Compression Data% 3985 | \label{compression-data}% 3986 | } 3987 | 3988 | The table below presents the results of compression tests on a collection of 142 SAS7BDAT data files (sources in \texttt{data/}). The 'type' field represents the type of compression, 'ctime' is the compression time (in seconds), 'dtime' is the decompression time, and the 'compression ratio' field holds the cumulative disk usage (in megabytes) before and after compression. Although the \texttt{xz} algorithm requires significantly more time to compress these data, the decompression time is on par with gzip. 3989 | 3990 | \setlength{\DUtablewidth}{\linewidth}% 3991 | \begin{longtable*}{|p{0.168\DUtablewidth}|p{0.086\DUtablewidth}|p{0.086\DUtablewidth}|p{0.307\DUtablewidth}|} 3992 | \hline 3993 | \textbf{% 3994 | type 3995 | } & \textbf{% 3996 | ctime 3997 | } & \textbf{% 3998 | dtime 3999 | } & \textbf{% 4000 | compression ratio 4001 | } \\ 4002 | \hline 4003 | \endfirsthead 4004 | \hline 4005 | \textbf{% 4006 | type 4007 | } & \textbf{% 4008 | ctime 4009 | } & \textbf{% 4010 | dtime 4011 | } & \textbf{% 4012 | compression ratio 4013 | } \\ 4014 | \hline 4015 | \endhead 4016 | \multicolumn{4}{c}{\hfill ... 
continued on next page} \\ 4017 | \endfoot 4018 | \endlastfoot 4019 | 4020 | gzip -9 4021 | & 4022 | 76.7s 4023 | & 4024 | 2.6s 4025 | & 4026 | 541M / 30.3M = 17.9 4027 | \\ 4028 | \hline 4029 | 4030 | bzip2 -9 4031 | & 4032 | 92.7s 4033 | & 4034 | 11.2s 4035 | & 4036 | 541M / 19.0M = 28.5 4037 | \\ 4038 | \hline 4039 | 4040 | xz -9 4041 | & 4042 | 434.2s 4043 | & 4044 | 2.7s 4045 | & 4046 | 541M / 12.8M = 42.3 4047 | \\ 4048 | \hline 4049 | \end{longtable*} 4050 | 4051 | 4052 | \section{Software Prototype% 4053 | \label{software-prototype}% 4054 | } 4055 | 4056 | The prototype program for reading SAS7BDAT formatted files is implemented entirely in R (see file \texttt{R/sas7bdat.R}). Files not recognized as having been generated under a Microsoft Windows platform are rejected (for now). Implementation of the \texttt{read.sas7bdat} function should be considered a 'reference implementation', and not one designed with performance in mind. 4057 | 4058 | There are certain advantages and disadvantages to developing a prototype of this nature in R. 4059 | 4060 | Advantages: 4061 | 4062 | \begin{enumerate} 4063 | \item R is an interpreted language with a built-in debugger. Hence, experimental routines may be implemented and debugged quickly and interactively, without the need of external compiler or debugger tools (e.g. gcc, gdb). 4064 | 4065 | \item R programs are portable across a variety of computing platforms. This is especially important in the present context, because manipulating files stored on disk is a platform-specific task. Platform-specific operations are abstracted from the R user. 4066 | \end{enumerate} 4067 | 4068 | Disadvantages: 4069 | 4070 | \begin{enumerate} 4071 | \item Manipulating binary (raw) data in R is a relatively new capability. The best tools and practices for binary data operations are not as developed as those for other data types. 4072 | 4073 | \item Interpreted code is often much less efficient than compiled code. 
This is not a major disadvantage for prototype implementations because human code development is far less efficient than the R interpreter. Gains made in efficient code development using an interpreted language far outweigh the benefits of compiled languages. 4074 | \end{enumerate} 4075 | 4076 | Another software implementation was made by Clint Cummins, in the TSP econometrics package (mainly as an independent platform for exploring the format). 4077 | 4078 | 4079 | \section{ToDo% 4080 | \label{todo}% 4081 | } 4082 | 4083 | \begin{itemize} 4084 | \item obtain test files which use COMPRESS=BINARY, and develop identification and uncompression procedures 4085 | 4086 | \item look for data which will reliably distinguish between structural subheaders (which have one of the known signatures) and uncompressed row data, which may have row data in the signature position that matches one of the known signatures. Both use COMP=0. Are NPSHD and NSHPL sufficient to do this? 4087 | 4088 | \item obtain test files with more than 2.1 billion (and more than 4.2 billion) data rows, i.e. where 8 byte integer TRC in \textbf{u64} is apparently needed. Do the non-u64 files handle this, with additional fields beyond the 4 byte TRC used for segmentation? Is TRC a (signed) int or (unsigned) uint? 4089 | 4090 | \item identify any SAS7BDAT encryption flag (this is not the same as 'cracking', or breaking encryption); we just identify if a file is encrypted and not readable without a key 4091 | 4092 | \item experiment further with 'amendment page' concept 4093 | 4094 | \item consider header bytes -by- SAS\_host 4095 | 4096 | \item check that only one page of type \textquotedbl{}mix\textquotedbl{} is observed. If so insert \textquotedbl{}In all test cases (\texttt{data/sources.csv}), there are exactly zero or one pages of type 'mix'.\textquotedbl{} under the \hyperref[page-offset-table]{Page Offset Table} header. 
{[}May not be needed, because the BC and SC fields in each Page Offset Table make the \hyperref[mrc]{MRC} field in the initial header unnecessary.{]} 4097 | 4098 | \item identify all missing value representations: missing numeric values appear to be represented as '0000000000D1FFFF' (nan) for numeric 'double' quantities. 4099 | 4100 | \item identify purpose of various unknown header quantities 4101 | 4102 | \item determine purpose of Column List subheader 4103 | 4104 | \item determine purpose and pattern of 'page sequence signature' fields. Are they useful? 4105 | 4106 | \item identify how non-ASCII encoding is specified 4107 | 4108 | \item implement R options to read just header (and subheader) information without data, and an option to read just some data fields, and not all fields. {[}The TSP implementation already does this, and can also read a subset of the data rows.{]} 4109 | \end{itemize} 4110 | 4111 | \end{document} 4112 | -------------------------------------------------------------------------------- /vignettes/sas7bdat.rst: -------------------------------------------------------------------------------- 1 | =============================== 2 | SAS7BDAT Database Binary Format 3 | =============================== 4 | 5 | by: 6 | 7 | | Matthew S. Shotwell, PhD 8 | | Assistant Professor 9 | | Department of Biostatistics 10 | | Vanderbilt University 11 | | matt.shotwell@vanderbilt.edu 12 | 13 | 1/9/2013 update (**u64** format extensions, Row Size fields, and RLE compression) by: 14 | 15 | | Clint Cummins, PhD 16 | | clint@stanford.edu 17 | 18 | 19 | Copyright (C) 2013 is retained by the authors listed above. This work is licensed under the Creative Commons Attribution-ShareAlike 3.0 Unported License. To view a copy of this license, visit http://creativecommons.org/licenses/by-sa/3.0/. 
20 | 21 | Contents 22 | ======== 23 | 24 | - `Introduction`_ 25 | - `SAS7BDAT Header`_ 26 | - `SAS7BDAT Pages`_ 27 | - `SAS7BDAT Subheaders`_ 28 | - `SAS7BDAT Packed Binary Data`_ 29 | - `Platform Differences`_ 30 | - `Compression Data`_ 31 | - `Software Prototype`_ 32 | - `ToDo`_ 33 | 34 | Introduction 35 | ============ 36 | 37 | The SAS7BDAT file is a binary database storage file. At the time of this writing, no description of the SAS7BDAT file format was publicly available. Hence, users who wish to read and manipulate these files were required to obtain a license for the SAS software, or third party software with support for SAS7BDAT files. The purpose of this document is to promote interoperability between SAS and other popular statistical software packages, especially R (http://www.r-project.org/). 38 | 39 | The information below was deduced by examining the contents of many SAS7BDAT databases downloaded freely from internet resources (see ``data/sas7bdat.sources.RData``). No guarantee is made regarding its accuracy. No SAS software, nor any other software requiring the purchase of a license was used. 40 | 41 | SAS7BDAT files consist of binary encoded data. Data files encoded in this format often have the extension '.sas7bdat'. The name 'SAS7BDAT' is not official, but is used throughout this document to refer to SAS database files formatted according to the descriptions below. 42 | 43 | There are significant differences in the SAS7BDAT format depending on the operating systems and computer hardware platforms (32bit vs. 64bit). See the section on `platform differences`_ for more details. The format described below is sufficient to read the entire collection of test files referenced in ``data/sas7bdat.sources.RData`` (i.e. files associated with 32bit and some 64bit builds of SAS for Microsoft Windows, and **u64** SAS versions). This includes files created with COMPRESS=CHAR. 
The format described here is probably not sufficient to **write** SAS7BDAT format files, due to lingering uncertainties. 44 | 45 | The figure below illustrates the overall structure of the SAS7BDAT database. Each file consists of a header (length := HL bytes), followed by PC pages, each of length PL bytes (PC and PL are shorthand for 'page count' and 'page size' respectively, and are used to denote these quantities throughout this document).:: 46 | 47 | ---------- 48 | | HL | header 49 | ---------- 50 | | PL | page 1 51 | ---------- 52 | | PL | page 2 53 | ---------- 54 | ... 55 | ---------- 56 | | PL | page PC 57 | ---------- 58 | 59 | Throughout this document, hexadecimal digits are denoted with a preceding 'x', binary digits with a preceding 'b', and decimal digits with no preceding character. For example, see the below `table of hexadecimal, decimal, and binary values`_. 60 | 61 | SAS7BDAT Header 62 | =============== 63 | 64 | The SAS7BDAT file header contains a binary file identifier (*i.e.*, a magic number), the dataset name, timestamp, the number of pages (PC), their size (PL) and a variety of other values that pertain to the database as a whole. The purposes of many header fields remain unknown, but are likely to include specifications for data compression and encryption, password protection, and dates/times of creation and/or modification. Most files encountered encode multi-byte values little-endian (least significant byte first). However, some files have big-endian values. Hence, it appears that multi-byte values are encoded using the endianness of the platform where the file was written. See `Platform Differences`_ for a table of key test files which differ in several ways. 65 | 66 | The *offset table* below describes the SAS7BDAT file header as a sequence of bytes. Information stored in the table is indexed by its byte offset (first column) in the header and its length (second column) in bytes. For example, the field at offset 0 has length 32 bytes.
Hence, bytes 0,1,...,31 comprise the data for this field. Byte lengths having the form '%n' should read: 'the number of bytes remaining up to, but not including byte n'. The fourth column gives a shorthand description of the data contained at the corresponding offset. For example, 'int, page size := PL' indicates that the data stored at the corresponding location is a signed integer representing the page size, which we denote PL. The description *????????????* indicates that the meaning of data stored at the corresponding offset is unknown. The third column represents the author's confidence (low, medium, high) in the corresponding offset, length, and description. Each offset table in this document is formatted in a similar fashion. Variables defined in an offset table are sometimes used in subsequent tables. 67 | 68 | Header Offset Table 69 | ------------------- 70 | 71 | ============== ====== ====== =============================================== 72 | offset length conf. description 73 | ============== ====== ====== =============================================== 74 | 0 32 high binary, `magic number`_ 75 | 32 1 high binary, Alignment_: if (byte==x33) a2=4 else a2=0 . **u64** is true if a2=4 (unix 64 bit format). 76 | 33 2 low *????????????* 77 | 35 1 high binary, Alignment_ if (byte==x33) a1=4 else a1=0 78 | 36 1 low *????????????* 79 | 37 1 high int, endianness (x01-little [Intel] x00-big) 80 | 38 1 low *????????????* 81 | 39 1 medium ascii, OS type (1-UNIX or 2-WIN). Does not affect format except for the OS strings. 82 | 40 8 low *????????????* 83 | 48 8 low *????????????* 84 | 56 8 low repeat of 32:32+8 85 | 64 6 low *????????????* 86 | 70 2 low int, `Character Encoding`_ 87 | 72 12 low *????????????* 88 | 84 8 high ascii 'SAS FILE' 89 | 92 64 high ascii, dataset name 90 | 156 8 medium ascii, file type, e.g. ``'DATA '`` 91 | 164 a1 medium zero padding when a1=4 . Aligns the double timestamps below on double word boundaries. 
92 | 164+a1 8 high double, timestamp, date created, secs since 1/1/60 (for SAS version 8.x and higher) 93 | 172+a1 8 high double, timestamp, date modified, secs since 1/1/60 (for SAS version 8.x and higher) 94 | 180+a1 16 low *????????????* 95 | 196+a1 4 high int, length of SAS7BDAT header := HL 96 | 200+a1 4 high int, page size := _`PL` 97 | 204+a1 4+a2 high int, page count := PC . Length 4 or 8 (**u64**), henceforth denoted **4|8** 98 | 208+a1+a2 8 low *????????????* 99 | 216+a1+a2 8 high ascii, SAS release (e.g. 9.0101M3 ) 100 | 224+a1+a2 16 high ascii, host (SAS server type, longest observed string has 9 bytes) 101 | 240+a1+a2 16 high ascii, OS version number (for UNIX, else null) 102 | 256+a1+a2 16 high ascii, OS maker or version (SUN, IBM, sometimes WIN) 103 | 272+a1+a2 16 high ascii, OS name (for UNIX, else null) 104 | 288+a1+a2 32 low *????????????* 105 | 320+a1+a2 4 low int, page sequence signature? (value is close to the value at start of each Page Offset Table) 106 | 324+a1+a2 4 low *????????????* 107 | 328+a1+a2 8 medium double, 3rd timestamp, sometimes zero 108 | 336+a1+a2 %HL medium zeros 109 | 1024|8192 medium Total length of header (8192 for **u64**), HL 110 | ============== ====== ====== =============================================== 111 | 112 | The 8 bytes beginning at offset 32 hold information which affects the offset of the 'release' and 'host' information. In particular: 113 | 114 | 1. The byte at offset 32 defines the **u64** (unix 64 bit) file format, which affects many field and header lengths (usually via 4 vs. 8 byte integers). 115 | 2. The byte at offset 35 controls an offset before the timestamps. 116 | 3. The byte at offset 37 defines byte ordering of ints and doubles (most test files were created on Windows and use Intel byte ordering; little endian). 117 | 4. 
The byte at offset 39 appears to distinguish the OS type, where '1' indicates that the file was generated on a UNIX-like system, such as Linux or SunOS, and '2' indicates the file was generated on a Microsoft Windows platform. However, this does not affect any important fields in the file format. 118 | 119 | The following table describes some of the possible polymorphisms for the 8 bytes at offset 32. The first field lists the name of the file where the sequence was found (see ``data/sas7bdat.sources.RData``), the second lists the eight byte values (hexadecimal), the third field shows bytes 216-239 in ASCII ('.' represents a non-ASCII character or '\0'), and the fourth field lists the SAS7BDAT sub-format. 120 | 121 | =========================== =================================== ============================ ====================== 122 | filename bytes 32-39 bytes 216-239 format 123 | =========================== =================================== ============================ ====================== 124 | ``compress_no.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3NET_ASRV........`` Windows Intel 125 | ``compress_yes.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3NET_ASRV........`` Windows Intel 126 | ``lowbwt_i386.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0202M0W32_VSPRO.......`` Windows Intel 127 | ``missing_values.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0202M0W32_VSPRO.......`` Windows Intel 128 | ``obs_all_perf_1.sas7bdat`` ``x22 x22 x00 x32 x22 x01 x02 x32`` ``9.0101M3XP_PRO..........`` Windows Intel 129 | ``adsl.sas7bdat`` ``x22 x22 x00 x33 x33 x01 x02 x32`` ``....9.0202M3X64_ESRV....`` Windows x64 Intel 130 | ``eyecarex.sas7bdat`` ``x22 x22 x00 x33 x22 x00 x02 x31`` ``....9.0000M0WIN.........`` Unix non-Intel 131 | ``lowbwt_x64.sas7bdat`` ``x22 x22 x00 x33 x33 x01 x02 x32`` ``....9.0202M2X64_VSPRO...`` Windows x64 Intel 132 | ``natlterr1994.sas7bdat`` ``x33 x22 x00 x33 x33 x00 x02 x31`` ``........9.0101M3SunOS...`` u64 
Unix non-Intel 133 | ``natlterr2006.sas7bdat`` ``x33 x22 x00 x33 x33 x00 x02 x31`` ``........9.0101M3SunOS...`` u64 Unix non-Intel 134 | ``txzips.sas7bdat`` ``x33 x22 x00 x33 x33 x01 x02 x31`` ``........9.0201M0Linux...`` u64 Unix Intel 135 | =========================== =================================== ============================ ====================== 136 | 137 | .. _`table of hexadecimal, decimal, and binary values`: 138 | 139 | The binary representations for the hexadecimal values present in the table above are given below. 140 | 141 | =========== ======= ============= 142 | hexadecimal decimal binary 143 | =========== ======= ============= 144 | ``x01`` ``001`` ``b00000001`` 145 | ``x02`` ``002`` ``b00000010`` 146 | ``x22`` ``034`` ``b00100010`` 147 | ``x31`` ``049`` ``b00110001`` 148 | ``x32`` ``050`` ``b00110010`` 149 | ``x33`` ``051`` ``b00110011`` 150 | =========== ======= ============= 151 | 152 | Alignment 153 | +++++++++ 154 | 155 | In files generated by 64 bit builds of SAS, 'alignment' means that all data field offsets containing doubles or 8 byte ints should be a multiple of 8 bytes. For files generated by 32 bit builds of SAS, the alignment is 4 bytes. Because `SAS7BDAT Packed Binary Data`_ may contain double precision values, it appears that all data rows are 64 bit aligned, regardless of whether the file was written with a 32 bit or 64 bit build of SAS. Alignment of data structures according to the platform word length (4 bytes for 32 bit, and 8 bytes for 64 bit architectures) facilitates efficient operations on data stored in memory. It also suggests that parts of the SAS7BDAT data file format are platform dependent. One theory is that the SAS implementation utilizes a common C or C++ structure or class to reference data stored in memory. When compiled, these structures are aligned according to the word length of the target platform. Of course, when SAS was originally written, platform differences may not have been foreseeable.
Hence, these inconsistencies may not have been intentional. 156 | 157 | Magic Number 158 | ++++++++++++ 159 | 160 | The SAS7BDAT magic number is the following 32 byte (hex) sequence.:: 161 | 162 | x00 x00 x00 x00 x00 x00 x00 x00 163 | x00 x00 x00 x00 xc2 xea x81 x60 164 | xb3 x14 x11 xcf xbd x92 x08 x00 165 | x09 xc7 x31 x8c x18 x1f x10 x11 166 | 167 | In all test files except one (not listed in ``data/sas7bdat.sources.RData``), the magic number above holds. The one anomalous file has the following magic number:: 168 | 169 | x00 x00 x00 x00 x00 x00 x00 x00 170 | x00 x00 x00 x00 x00 x00 x00 x00 171 | x00 x00 x00 x00 x00 x00 x00 x00 172 | x00 x00 x00 x00 x18 x1f x10 x11 173 | 174 | In addition, the anomalous file is associated with the SAS release "3.2TK". Indeed, this file may not have been written by SAS. Otherwise, the anomalous file appears to be formatted similarly to other test files. 175 | 176 | Character Encoding 177 | ++++++++++++++++++ 178 | 179 | The integer (one or two bytes) at header offset 70 (bytes) indicates the character encoding of string data. The table below lists the values that are known to occur and the associated character encoding. 
180 | 181 | ============== ============== ============= 182 | bytes 70-72 SAS name iconv name 183 | ============== ============== ============= 184 | 0 (Unspecified) (Unspecified) 185 | 20 utf-8 UTF-8 186 | 28 us-ascii US-ASCII 187 | 29 latin1 ISO-8859-1 188 | 30 latin2 ISO-8859-2 189 | 31 latin3 ISO-8859-3 190 | 34 arabic ISO-8859-6 191 | 36 hebrew ISO-8859-8 192 | 39 thai ISO-8859-11 193 | 40 latin5 ISO-8859-9 194 | 60 wlatin2 WINDOWS-1250 195 | 61 wcyrillic WINDOWS-1251 196 | 62 wlatin1 WINDOWS-1252 197 | 63 wgreek WINDOWS-1253 198 | 64 wturkish WINDOWS-1254 199 | 65 whebrew WINDOWS-1255 200 | 66 warabic WINDOWS-1256 201 | 119 euc-tw EUC-TW 202 | 123 big5 BIG-5 203 | 125 euc-cn EUC-CN 204 | 134 euc-jp EUC-JP 205 | 138 shift-jis SHIFT-JIS 206 | 140 euc-kr EUC-KR 207 | ============== ============== ============= 208 | 209 | When the encoding is unspecified, the file uses the encoding of the SAS session that produced it (usually Windows-1252). 210 | 211 | SAS7BDAT Pages 212 | ============== 213 | 214 | Following the SAS7BDAT header are pages of data. Each page can be one of (at least) four types. The first three are those that contain meta-information (e.g. field/column attributes), packed binary data, or a combination of both. These types are denoted 'meta', 'data', and 'mix' respectively. Meta-information is required to correctly interpret the packed binary information. Hence, this information must be parsed first. In test files, 'meta' and 'mix' pages always precede 'data' pages. In some test data files, there is a fourth page type, denoted 'amd' which appears to encode additional meta information. This page usually occurs last, and appears to contain amended meta information. 215 | 216 | The `page offset table`_ below describes each page type. Byte offsets appended with one of '(meta/mix)', '(mix)', or '(data)' indicate that the corresponding length and description apply only to pages of the listed type. 
Provisionally, the internal structure of the 'amd' page type is considered identical to the 'meta' page type. 217 | 218 | Page Offset Table 219 | ----------------- 220 | 221 | ============== ============== ====== =============================================== 222 | offset length conf. description 223 | ============== ============== ====== =============================================== 224 | 0 4 low int, page sequence signature? 225 | 4 12|28 low *????????????* length 12 or 28 (**u64**) 226 | B 2 medium int, bit field `page type`_ := _PGTYPE; B = 16|32 227 | B+2 2 medium int, data block count := _`BC` 228 | B+4 2 medium int, `subheader pointers`_ count := _`SC` <= `BC`_ 229 | B+6 2 low *????????????* 230 | B+8 SC*SL medium SC `subheader pointers`_, SL = 12|24 231 | B+8+SC*SL DL medium if NRD>0, 8 byte alignment; DL = (B+8+SC*SL+7) % 8 * 8 232 | B+8+SC*SL+DL RC*`RL`_ medium `SAS7BDAT packed binary data`_ data row count := RC = (BC-SC) 233 | C %`PL`_ medium subheader data and/or filler; C = (B+8+SC*SL+DL+RC*RL) 234 | ============== ============== ====== =============================================== 235 | 236 | Page Type 237 | +++++++++ 238 | 239 | ====== ==== ========== ======================================== =================== 240 | PGTYPE name subheaders uncompressed row data (after subheaders) compressed row data (in subheaders) 241 | ====== ==== ========== ======================================== =================== 242 | 0 meta yes (SC>0) no (BC=SC) yes 243 | 256 data no (SC=0) yes (RC=BC) no 244 | 512 mix yes (SC>0) yes (RC=BC-SC) no 245 | 1024 amd yes? yes? no? 246 | 16384 meta yes (SC>0) no (BC=SC) yes 247 | -28672 comp no no no 248 | ====== ==== ========== ======================================== =================== 249 | 250 | There are at least four page types 'meta', 'data', 'mix', and 'amd'. These types are encoded in the most significant byte of a two byte bit field at page offset 16|32. If no bit is set, the following page is of type 'meta'. 
If the first, second, or third bits are set, then the page is of type 'data', 'mix', or 'amd', respectively. Hence, if the two bytes are interpreted as an unsigned integer, then the 'meta', 'data', 'mix', and 'amd' types correspond to 0, 256, 512, and 1024, respectively. In compressed files, other bits (and sometimes multiple bits) have been set (e.g., ``1 << 15 | 1 << 12``, which is ``-28672`` signed, or ``36864`` unsigned). However, the pattern is unclear. 251 | 252 | If a page is of type 'meta', 'mix', or 'amd', data beginning at offset byte 24|40 are a sequence of SC SL-byte `subheader pointers`_, which point to an offset farther down the page. `SAS7BDAT Subheaders`_ stored at these offsets hold meta information about the database, including the column names, labels, and types. 253 | If a page is of type 'mix', then **packed binary data begin at the next 8 byte boundary following the last subheader pointer**. In this case, the data begin at offset B+8+SC*SL+DL, where DL = (B+8+SC*SL+7) % 8 * 8, and '%' is the modulo operator. 254 | 255 | If a page is of type 'data', then packed binary data begin at offset 24|40. 256 | 257 | The 'comp' page was observed as page 2 of the compress_yes.sas7bdat test file (not distributed with the ``sas7bdat`` package). It has BC and SC fields, but no subheader pointers. It contains some initial data and 2 tables. The first table has many rows of length 24; its purpose is unknown. The second table has one entry per data page with the page number and the number of data rows on the page for SC pages. It could be used to access a particular row without reading all preceding data pages. 258 | 259 | Subheader Pointers 260 | ++++++++++++++++++ 261 | 262 | The subheader pointers encode information about the offset and length of subheaders relative to the beginning of the page where the subheader pointer is located.
The purpose of the last four bytes of the subheader pointer are uncertain, but may indicate that additional subheader pointers are to be found on the next page, or that the corresponding subheader is not crucial. 263 | 264 | ======= ====== ====== =============================================== 265 | offset length conf. description 266 | ======= ====== ====== =============================================== 267 | 0 4|8 high int, offset from page start to subheader 268 | 4|8 4|8 high int, length of subheader := _`QL` 269 | 8|16 1 medium int, compression := _`COMP` 270 | 9|17 1 low int, subheader type := ST 271 | 10|18 2|6 low zeroes 272 | 12|24 high Total length of subheader pointer 12|24 (**u64**), SL 273 | ======= ====== ====== =============================================== 274 | 275 | QL is sometimes zero, which indicates that no data is referenced by the corresponding subheader pointer. When this occurs, the subheader pointer may be ignored. 276 | 277 | ======= ============ 278 | `COMP`_ description 279 | ======= ============ 280 | 0 uncompressed 281 | 1 truncated (ignore data) 282 | 4 RLE compressed row data with control byte 283 | ======= ============ 284 | 285 | ==== ============ 286 | ST subheaders 287 | ==== ============ 288 | 0 Row Size, Column Size, Subheader Counts, Column Format and Label, in Uncompressed file 289 | 1 Column Text, Column Names, Column Attributes, Column List 290 | 1 all subheaders (including row data), in Compressed file. 291 | ==== ============ 292 | 293 | 294 | SAS7BDAT Subheaders 295 | =================== 296 | 297 | Subheaders contain meta information regarding the SAS7BDAT database, including row and column counts, column names, labels, and types. Each subheader is associated with a four- or eight-byte 'signature' (**u64**) that identifies the subheader type, and hence, how it should be parsed. 
298 | 299 | Row Size Subheader 300 | ------------------ 301 | 302 | The row size subheader holds information about row length (in bytes), their total count, and their count on a page of type 'mix'. Fields at offset 28|56 and higher are not needed to read the file, but are documented here for completeness. The four test files used for example data in the higher fields are ``eyecarex.sas7bdat``, ``acadindx.sas7bdat``, ``natlterr1994.sas7bdat``, ``txzips.sas7bdat`` (non-Intel/Intel x regular/u64). 303 | 304 | ========= ========= ====== =============================================== 305 | offset length conf. description 306 | ========= ========= ====== =============================================== 307 | 0 4|8 high binary, signature xF7F7F7F7|xF7F7F7F700000000 308 | 4|8 16|32 low *????????????* 309 | 20|40 4|8 high int, row length (in bytes) := _`RL` 310 | 24|48 4|8 high int, total row count := TRC 311 | 28|56 8|16 low *????????????* 312 | 36|72 4|8 medium int, number of `Column Format and Label Subheader`_ on first page where they appear := _`NCFL1` 313 | 40|80 4|8 medium int, number of `Column Format and Label Subheader`_ on second page where they appear (or 0) := _`NCFL2` 314 | 44|88 8|16 low *????????????* 315 | 52|104 4|8 medium int, page size, equals PL 316 | 56|112 4|8 low *????????????* 317 | 60|120 4|8 medium int, max row count on "mix" page := _`MRC` 318 | 64|128 8|16 medium sequence of 8|16 FF, end of initial header 319 | 72|144 148|296 medium zeroes 320 | 220|440 4 low int, page sequence signature (equals current page sequence signature) 321 | 224|444 40|68 low zeroes 322 | 264|512 4|8 low int, value 1 observed in 4 test files 323 | 268|520 2 low int, value 2 observed 324 | 270|522 2|6 low zeroes (pads length of 3 fields to 8|16) 325 | 272|528 4|8 medium int, number of pages with subheader data := NPSHD 326 | 276|536 2 medium int, number of subheaders with positive length on last page with subheader data := NSHPL 327 | 278|538 2|6 low zeroes 328 | 280|544 
4|8 low int, values equal to NPSHD observed 329 | 284|552 2 low int, values equal to NSHPL+2 observed 330 | 286|554 2|6 low zeroes 331 | 288|560 4|8 medium int, number of pages in file, equals PC 332 | 292|568 2 low int, values 22,26,9,56 observed 333 | 294|570 2|6 low zeroes 334 | 296|576 4|8 low int, value 1 observed 335 | 300|584 2 low int, values 7|8 observed 336 | 302|586 2|6 low zeroes 337 | 304|592 40|80 low zeroes 338 | 344|672 2 low int, value 0 339 | 346|674 2 low int, values 0|8 340 | 348|676 2 low int, value 4 341 | 350|678 2 low int, value 0 342 | 352|680 2 low int, values 12,32|0 343 | 354|682 2 low int, length of Creator Software string := LCS 344 | 356|684 2 low int, value 0 345 | 358|686 2 low int, value 20 346 | 360|688 2 low int, value of 8 indicates MXNAM and MXLAB valid := IMAXN 347 | 362|690 8 low zeroes 348 | 370|698 2 low int, value 12 349 | 372|700 2 low int, value 8 350 | 374|702 2 low int, value 0 351 | 376|704 2 low int, value 28 352 | 378|706 2 low int, length of Creator PROC step name := LCP 353 | 380|708 36 low zeroes 354 | 416|744 2 low int, value 4 355 | 418|746 2 low int, value 1 356 | 420|748 2 low int, number of Column Text subheaders in file := _`NCT` 357 | 422|750 2 low int, max length of column names := MXNAM (see IMAXN) 358 | 424|752 2 low int, max length of column labels := MXLAB (see IMAXN) 359 | 426|754 12 low zeroes 360 | 438|766 2 medium int, number of data rows on a full page INT[(PL - 24 / 40)/`RL`_]; 0 for compressed file 361 | 440|768 27 low zeroes 362 | 467|795 1 low int, bit field, values 1,5 363 | 468|796 12 low zeroes 364 | 480|808 medium Total length of subheader, QL 365 | ========= ========= ====== =============================================== 366 | 367 | 368 | 369 | Column Size Subheader 370 | --------------------- 371 | 372 | The `column size subheader`_ holds the number of columns (variables). 373 | 374 | ======= ====== ====== ================================= 375 | offset length conf. 
description 376 | ======= ====== ====== ================================= 377 | 0 4|8 high binary, signature xF6F6F6F6|xF6F6F6F600000000 378 | 4|8 4|8 high int, number of columns := NCOL 379 | 8|16 4|8 low *????????????* usually zeroes 380 | 12|24 medium Total length of subheader, QL 381 | ======= ====== ====== ================================= 382 | 383 | 384 | Subheader Counts Subheader 385 | -------------------------- 386 | 387 | This subheader contains information on the first and last appearances of at least 7 common subheader types. Any of these subheaders may appear once or more. Multiple instances of a subheader provide information for an exclusive subset of columns. The order in which data is read from multiple subheaders corresponds to the reading order (left to right) of columns. The structure of this subheader was deduced and reported by Clint Cummins. 388 | 389 | ========= ======= ====== =============================================== 390 | offset length conf. description 391 | ========= ======= ====== =============================================== 392 | 0 4|8 high int, signature -1024 (x00FCFFFF|x00FCFFFFFFFFFFFF) 393 | 4|8 4|8 low int, length or offset, usually >= 48 394 | 8|16 4|8 low int, usually 4 395 | 12|24 2 low int, usually 7 (number of nonzero SCVs?) 396 | 14|26 50|94 low *????????????* 397 | 64|120 12*LSCV medium 12 `subheader count vectors`_, length := LSCV = 20|40 bytes each 398 | 304|600 medium Total length of subheader, QL 399 | ========= ======= ====== =============================================== 400 | 401 | Subheader Count Vectors 402 | +++++++++++++++++++++++ 403 | 404 | The subheader count vectors encode information for each of 4 common subheader types, and potentially 12 total subheader types. 405 | 406 | ======= ====== ====== ===================================================== 407 | offset length conf. 
description 408 | ======= ====== ====== ===================================================== 409 | 0 4|8 high int, signature (see list below) 410 | 4|8 4|8 medium int, page where this subheader first appears := PAGE1 411 | 8|16 2 medium int, position of subheader pointer in PAGE1 := LOC1 412 | 10|18 2|6 low *????????????* zero padding 413 | 12|24 4|8 medium int, page where this subheader last appears := PAGEL 414 | 16|32 2 medium int, position of subheader pointer in PAGEL := LOCL 415 | 18|34 2|6 low *????????????* zero padding 416 | 20|40 medium Total length of subheader count vector, LSCV 417 | ======= ====== ====== ===================================================== 418 | 419 | The LOC1 and LOCL give the positions of the corresponding subheader pointer in PAGE1 and PAGEL, respectively. That is, if there are SC subheader pointers on page PAGE1, then the corresponding subheader pointer first occurs at the LOC1'th position in this array, enumerating from 1. If PAGE1=0, the subheader is not present. If PAGE1=PAGEL and LOC1=LOCL, the subheader appears exactly once. If PAGE1!=PAGEL or LOC1!=LOCL, the subheader appears 2 or more times. In all test files, PAGE1 <= PAGEL, and the corresponding subheaders appear only once per page. The variable `NCT`_ in the `Row Size Subheader`_ should be used to ensure that all Column Text subheaders are located (and to avoid scanning through all pages in the file when all subheaders are already located). 420 | 421 | The first 7 binary signatures in the `Subheader Count Vectors`_ array are always: 422 | 423 | ========= ==================== 424 | signature description 425 | ========= ==================== 426 | -4 Column Attributes 427 | -3 Column Text 428 | -1 Column Names 429 | -2 Column List 430 | -5 unknown signature #1 431 | -6 unknown signature #2 432 | -7 unknown signature #3 433 | ========= ==================== 434 | 435 | The remaining 5 out of 12 signatures are zeros in the observed source files. 
Presumably, these are for subheaders not yet defined, or not present in the collection of test files. 436 | 437 | A `Column Format and Label Subheader`_ may appear on multiple pages, but are not indexed in Subheader Counts. The variables NCFL1 and NCFL2 in the `Row Size subheader`_ may be helpful if you want to know in advance if these appear across multiple pages. 438 | 439 | 440 | Column Text Subheader 441 | --------------------- 442 | 443 | The column text subheader contains a block of text associated with columns, including the column names, labels, and formats. However, this subheader is not sufficient to parse this information. Other subheaders (e.g. the `column name subheader`_), which point to specific elements within this subheader are also needed. 444 | 445 | ======= ====== ====== =============================================== 446 | offset length conf. description 447 | ======= ====== ====== =============================================== 448 | 0 4|8 high int, signature -3 (xFDFFFFFF|xFDFFFFFFFFFFFFFF) 449 | 4|8 2 medium int, size of text block (QL - 16|20) 450 | 6|10 2 low *????????????* 451 | 8|12 2 low *????????????* 452 | 10|14 2 low *????????????* 453 | 12|16 2 low *????????????* 454 | 14|18 2 low *????????????* 455 | 16|20 varies medium ascii, compression & Creator PROC step name that generated data 456 | varies %QL high ascii, combined column names, labels, formats 457 | ======= ====== ====== =============================================== 458 | 459 | This subheader sometimes appears more than once; each is a separate array. If so, the "column name index" field in `column name pointers`_ selects a particular text array - 0 for the first array, 1 for the second, etc. Similarly, "column format index" and "column label index" fields also select a text array. Offsets to strings within the text array are multiples of 4, so the column names and labels section of the array often contains many nulls for padding. 
460 | 461 | The variables LCS and LCP from the `Row Size subheader`_ refer to a text field at the start of the text array (at offset 16|20) in the first Column Text subheader (before the column name strings). This text field also contains compression information. The following logic decodes this initial field: 462 | 463 | 1. If the first 8 bytes of the field are blank, file is not compressed, and set LCS=0. The Creator PROC step name is the LCP bytes starting at offset 16. 464 | 2. If LCS > 0 (still), the file is not compressed, the first LCS bytes are the Creator Software string (padded with nulls). Set LCP=0. Stat/Transfer files use this pattern. 465 | 3. If the first 8 bytes of the field are ``SASYZCRL``, the file is compressed with Run Length Encoding. The Creator PROC step name is the LCP bytes starting at offset 24. 466 | 4. If the first 8 bytes are nonblank and options 2 or 3 above are not used, this probably indicates COMPRESS=BINARY. We need test files to confirm this, though. 467 | 468 | 469 | Column Name Subheader 470 | --------------------- 471 | 472 | Column name subheaders contain a sequence of `column name pointers`_ to the offset of each column name **relative to a** `column text subheader`_. There may be multiple column name subheaders, indexing into multiple column text subheaders. 473 | 474 | ======= ====== ====== ==================================================== 475 | offset length conf. 
description 476 | ======= ====== ====== ==================================================== 477 | 0 4|8 high int, signature -1 (xFFFFFFFF|xFFFFFFFFFFFFFFFF) 478 | 4|8 2 medium int, length of remaining subheader (QL - 16|20) 479 | 6|10 2 low *????????????* 480 | 8|12 2 low *????????????* 481 | 10|14 2 low *????????????* 482 | 12|16 8*CMAX medium `column name pointers`_ (see below), CMAX=(QL-20|28)/8 483 | MCN 8|12 low zeros, 12|16 + 8*CMAX := MCN 484 | ======= ====== ====== ==================================================== 485 | 486 | Each column name subheader holds CMAX column name pointers. When there are multiple column name subheaders, CMAX will be less than NCOL. 487 | 488 | Column Name Pointers 489 | ++++++++++++++++++++ 490 | 491 | ====== ====== ====== ====================================================== 492 | offset length conf. description 493 | ====== ====== ====== ====================================================== 494 | 0 2 high int, column name index to select `Column Text Subheader`_ 495 | 2 2 high int, column name offset w.r.t. end of selected Column Text signature. Always a multiple of 4. 496 | 4 2 high int, column name length 497 | 6 2 low zeros 498 | 8 high Total length of column name pointer 499 | ====== ====== ====== ====================================================== 500 | 501 | 502 | Column Attributes Subheader 503 | --------------------------- 504 | 505 | The column attribute subheader holds information regarding the column offsets within a data row, the column widths, and the column types (either numeric or character). The column attribute subheader sometimes occurs more than once (in test data). In these cases, column attributes are applied in the order they are parsed. 506 | 507 | ======= ========= ====== =================================================== 508 | offset length conf. 
description 509 | ======= ========= ====== =================================================== 510 | 0 4|8 high int, signature -4 (hex xFCFFFFFF|FCFFFFFFFFFFFFFF) 511 | 4|8 2 medium int, length of remaining subheader 512 | 6|10 2 low *????????????* 513 | 8|12 2 low *????????????* 514 | 10|14 2 low *????????????* 515 | 12|16 LCAV*CMAX high `column attribute vectors`_ (see below), CMAX=(QL-20|28)/LCAV, LCAV=12|16 516 | MCA 8|12 low MCA = 12|16 + LCAV*CMAX 517 | ======= ========= ====== =================================================== 518 | 519 | Column Attribute Vectors 520 | ++++++++++++++++++++++++ 521 | 522 | ============== ====== ====== =============================================== 523 | offset length conf. description 524 | ============== ====== ====== =============================================== 525 | 0 4|8 high int, column offset in data row (in bytes) 526 | 4|8 4 high int, column width 527 | 8|12 2 low name length flag 528 | 10|14 1 high int, column type (1 = numeric, 2 = character) 529 | 11|15 1 low *????????????* 530 | 12|16 high Total length of column attribute vector, LCAV 531 | ============== ====== ====== =============================================== 532 | 533 | Observed values of name length flag in the source files: 534 | 535 | ================ ================================================================= 536 | name length flag description 537 | ================ ================================================================= 538 | 4 name length <= 8 539 | 1024 usually means name length <= 8 , but sometimes the length is 9-12 540 | 2048 name length > 8 541 | 2560 name length > 8 542 | ================ ================================================================= 543 | 544 | 545 | Column Format and Label Subheader 546 | --------------------------------- 547 | 548 | The column format and label subheader contains pointers to a column format and label **relative to a** `column text subheader`_. 
Since the column label subheader only contains information regarding a single column, there are typically as many of these subheaders as columns. The structure of column format pointers was contributed by Clint Cummins. 549 | 550 | ======= ======= ====== =============================================== 551 | offset length conf. description 552 | ======= ======= ====== =============================================== 553 | 0 4|8 high int, signature -1026 (hex FEFB & 2 or 6 FFs) 554 | 4|8 30|38 low *????????????* 555 | 34|46 2 high int, column format index to select `Column Text Subheader`_ 556 | 36|48 2 high int, column format offset w.r.t. end of selected Column Text signature. A multiple of 4. 557 | 38|50 2 high int, column format length 558 | 40|52 2 high int, column label index to select `Column Text Subheader`_ 559 | 42|54 2 high int, column label offset w.r.t. end of selected Column Text signature. A multiple of 4. 560 | 44|56 2 high int, column label length 561 | 46|58 6 low *????????????* 562 | 52|64 medium Total length of subheader, QL 563 | ======= ======= ====== =============================================== 564 | 565 | Column List Subheader 566 | --------------------- 567 | 568 | The purpose of this subheader is not clear. But the structure is partly identified. Information related to this subheader was contributed by Clint Cummins. eyecarex (created by Stat/Transfer) does not have this subheader. 569 | 570 | ======= ====== ====== =============================================== 571 | offset length conf. 
description 572 | ======= ====== ====== =============================================== 573 | 0 4|8 high int, signature -2 (hex FE & 3 or 7 FFs) 574 | 4|8 2 low int, value close to offset in subheader pointer 575 | 6|10 6 low *????????????* 576 | 12|16 4|8 medium int, length of remaining subheader 577 | 16|24 2 low int, usually equals NCOL 578 | 18|26 2 medium int, length of column list := CL, usually CL > NCOL 579 | 20|28 2 low int, usually 1 580 | 22|30 2 low int, usually equals NCOL 581 | 24|32 2 low int, usually 3 equal values 582 | 26|34 2 low int, usually 3 equal values 583 | 28|36 2 low int, usually 3 equal values 584 | 30|38 2*CL medium `column list values`_ (see below) 585 | MCL 8 low usually zeros, 30|38 + 2*CL := MCL 586 | ======= ====== ====== =============================================== 587 | 588 | Column List Values 589 | ++++++++++++++++++ 590 | 591 | These values are 2 byte integers, with (CL-NCOL) zero values. Each nonzero value is unique, between -NCOL and NCOL. The significance of signedness and ordering is unknown. The values do not correspond to a sorting order of columns. 592 | 593 | Compressed Binary Data Subheader 594 | -------------------------------- 595 | 596 | When a SAS7BDAT file is created by SAS with the option COMPRESS=CHAR or COMPRESS=YES, each row of data is compressed independently with a Run Length Encoding (RLE) structure. This yields a variable length compressed row. Each such row is stored in a single subheader in sequential order, indexed by the `subheader pointers`_. A RLE compressed data row is identified by COMP=4 in the subheader pointer, and does not have a subheader signature. If a particular row had highly variable data and yielded no compression, it is still stored in a subheader, but uncompressed with COMP=0 instead of COMP=4. The test file ``compress_yes.sas7bdat`` has such highly variable (random) data and all its rows are in this COMP=0 form of subheaders. 
It takes up more space than the uncompressed version ``compress_no.sas7bdat``, due to the extra length of the subheader pointers. The final subheader on a page is usually COMP=1, which indicates a truncated row to be ignored; the complete data row appears on the next page. 597 | 598 | The SAS option COMPRESS=BINARY apparently uses an RDC (Ross Data Compression) structure instead of RLE. We need more test files to investigate this structure, and only document RLE at present. 599 | 600 | Run Length Encoding 601 | +++++++++++++++++++ 602 | 603 | In RLE, the compressed row data is a series of control bytes, each optionally followed by data bytes. The control byte specifies how the data bytes are interpreted, or is self-contained. The control byte has 2 parts - the upper 4 bits are the Command, and the lower 4 bits are the Length. Each is a uint in the range 0-15. For example, control byte 82 (hex) is Command 8 and Length 2, and control byte F4 (hex) is Command 15 (F hex) and Length 4. We have identified the functions of the 11 different Command values which are observed in the test files. The RLE structure was contributed by Clint Cummins. 604 | 605 | ======= ====== ============= ============================ 606 | Command Length Name Function 607 | ======= ====== ============= ============================ 608 | 0 0 Copy64 using the first byte as a uint length L (0-255), Copy the next N=64+L bytes from the input to the output (copies 64 to 319 bytes) 609 | 1 ? ? *????????????* (not observed in test files) 610 | 2 ? ? *????????????* (not observed in test files) 611 | 3 ? ? *????????????* (not observed in test files) 612 | 4 ? ? *????????????* (not observed in test files) 613 | 5 ? ?
*????????????* (not observed in test files) 614 | 6 0 InsertBlank17 using the first byte as a uint length L, Insert N=17+L blanks (decimal 32, hex 20) in the output (inserts 17 to 273 blanks) 615 | 7 0 InsertZero17 using the first byte as a uint length L, Insert N=17+L zero bytes in the output 616 | 8 L Copy1 using the Length bits as a uint length L (0-15), Copy the next N=1+L bytes from the input to the output (copies 1 to 16 bytes) 617 | 9 L Copy17 Copy the next N=17+L bytes from the input to the output (copies 17 to 32 bytes) 618 | 10 (A) L Copy33 Copy the next N=33+L bytes from the input to the output (copies 33 to 48 bytes) 619 | 11 (B) L Copy49 Copy the next N=49+L bytes from the input to the output (copies 49 to 64 bytes) 620 | 12 (C) L InsertByte3 Insert N=3+L copies of the next byte in the output (inserts 3 to 18 bytes) 621 | 13 (D) L Insert@2 Insert N=2+L @ (decimal 64, hex 40) bytes in the output (inserts 2 to 17 @ bytes) 622 | 14 (E) L InsertBlank2 Insert N=2+L blanks in the output 623 | 15 (F) L InsertZero2 Insert N=2+L zero bytes in the output 624 | ======= ====== ============= ============================ 625 | 626 | The most common Commands in ``obs_all_perf_1.sas7bdat`` are F and 8 (alternating). This file is entirely 8 byte doubles, so the F commands often handle consecutive zero bytes in zero value doubles. 627 | 628 | RLE Example 1 629 | +++++++++++++ 630 | 631 | Compressed data row: 632 | 633 | ``87 A B C D E F G H F2 8A 1 2 3 4 5 6 7 8 9 A B D0 A1 a b c d e f g ... z`` 634 | 635 | ``CB -8-data-bytes-- CB CB --11-data-bytes------ CB CB --34-data-bytes--`` 636 | 637 | ``Copy1 InsertZero2 Ins Copy33 next 34 bytes`` 638 | 639 | ``Next 8 bytes 4 00h bytes 2 40h`` 640 | 641 | There are 5 Control Bytes (CB) in the above sequence. 642 | 643 | 1. 87: Copy1 next 8 bytes 644 | 2. F2: InsertZero2 4 00h bytes 645 | 3. 8A: Copy1 next 11 bytes 646 | 4. D0: Insert@2 2 40h bytes 647 | 5. 
A1: Copy33 next 34 bytes 648 | 649 | Output uncompressed row: 650 | 651 | ``A B C D E F G H 00 00 00 00 1 2 3 4 5 6 7 8 9 A B 40 40 a b c ... z`` 652 | 653 | RLE Example 2 654 | +++++++++++++ 655 | 656 | Compressed data row: 657 | 658 | ``87 A B C D E F G H C1 99 A5 a b c ... z`` 659 | 660 | ``CB -8-data-bytes-- CB ar CB -last-bytes`` 661 | 662 | ``Copy1 8 InsBy Copy33 38 bytes`` 663 | 664 | Control Bytes in Example 2: 665 | 666 | 1. 87: Copy1 next 8 bytes 667 | 2. C1,99: InsertByte3 4 99h bytes 668 | 3. A5: Copy33 next 38 bytes 669 | 670 | Output uncompressed row: 671 | 672 | ``A B C D E F G H 99 99 99 99 a b c ... z`` 673 | 674 | Once a data row is uncompressed, use the `SAS7BDAT Packed Binary Data`_ description below to read the variables. 675 | 676 | 677 | 678 | SAS7BDAT Packed Binary Data 679 | =========================== 680 | 681 | SAS7BDAT packed binary data are uncompressed, and appear after any subheaders on the page; see the `Page Offset Table`_. These data are stored by rows, where the size of a row (in bytes) is defined by the `row size subheader`_. When multiple rows occur on a single page, they are immediately adjacent. When a database contains many rows, it is typical that the collection of rows (i.e. their data) is evenly distributed to a number of 'data' pages. However, in test files, no single row's data is broken across two or more pages. A single data row is parsed by interpreting the binary data according to the collection of column attributes contained in the `column attributes subheader`_. Binary data can be interpreted in two ways, as ASCII characters, or as floating point numbers. The column width attribute specifies the number of bytes associated with a column. For character data, this interpretation is straightforward. For numeric data, interpretation of the column width is more complex. 682 | 683 | The common binary representation of floating point numbers has three parts: the sign (``s``), exponent (``e``), and mantissa (``m``).
The corresponding floating point number is ``s * m * b ^ e``, where ``b`` is the base (2 for binary, 10 for decimal). Under the IEEE 754 floating point standard, the sign, exponent, and mantissa are encoded by 1, 11, and 52 bits respectively, totaling 8 bytes. In SAS7BDAT files, numeric quantities can be 3, 4, 5, 6, 7, or 8 bytes in length. For numeric quantities of less than 8 bytes, the remaining bytes are truncated from the least significant part of the mantissa. Hence, the minimum and maximum numeric values are identical for all byte lengths, but shorter numeric values have reduced precision. 684 | 685 | Reduction in precision is characterized by the largest integer such that it and all smaller integers have an exact representation, denoted ``M``. At best, all integers greater than ``M`` are approximated to the nearest multiple of ``b``. The table of `numeric binary formats`_ below lists ``M`` values and describes how bits are distributed among the six possible column widths in SAS7BDAT files. 686 | 687 | Numeric Binary Formats 688 | ---------------------- 689 | 690 | ===== ===== ==== ======== ======== ================ 691 | size bytes sign exponent mantissa ``M`` 692 | ===== ===== ==== ======== ======== ================ 693 | 24bit 3 1 11 12 8192 694 | 32bit 4 1 11 20 2097152 695 | 40bit 5 1 11 28 536870912 696 | 48bit 6 1 11 36 137438953472 697 | 56bit 7 1 11 44 35184372088832 698 | 64bit 8 1 11 52 9007199254740992 699 | ===== ===== ==== ======== ======== ================ 700 | 701 | Dates, Currency, and Formatting 702 | ------------------------------- 703 | 704 | Column formatting information is encoded within the `Column Text Subheader`_ and `Column Format and Label Subheader`_. Columns with formatting information have special meaning and interpretation. For example, numeric values may represent dates, encoded as the number of seconds since midnight, January 1, 1960. The format string for fields encoded this way is "DATETIME".
Using R, these values may be converted using the as.POSIXct or as.POSIXlt functions with argument ``origin="1960-01-01"``. The most common date format strings correspond to numeric fields, and are interpreted as follows: 705 | 706 | ======== ======================================= ============ 707 | Format Interpretation R Function 708 | ======== ======================================= ============ 709 | DATE Number of days since January 1, 1960 chron::chron 710 | TIME Number of seconds since midnight as.POSIXct 711 | DATETIME Number of seconds since January 1, 1960 as.POSIXct 712 | ======== ======================================= ============ 713 | 714 | There are many additional format strings for numeric and character fields. 715 | 716 | Platform Differences 717 | ==================== 718 | 719 | The test files referenced in ``data/sas7bdat.sources.RData`` were examined over a period of time. Files with non-Microsoft Windows markings were only observed late into the writing of this document. Consequently (but not intentionally), the SAS7BDAT description above was first deduced for SAS datasets generated on the most commonly observed platform: Microsoft Windows. The extensions to SAS7BDAT files for **u64** and non-Intel formats were contributed a little later by Clint Cummins. 720 | 721 | In particular, the files ``natlerr1944.sas7bdat`` and ``natlerr2006.sas7bdat`` appear to be generated on the 'SunOS' platform (**u64**, non-Intel). ``txzips.sas7bdat`` was created on a Linux 64-bit SAS server (**u64**, Intel). ``eyecarex.sas7bdat`` is non-Intel, possibly 32-bit PowerPC. 722 | 723 | The files ``cfrance2.sas7bdat``, ``cfrance.sas7bdat``, ``coutline.sas7bdat``, ``gfrance2.sas7bdat``, ``gfrance.sas7bdat``, ``goutline.sas7bdat``, ``xfrance2.sas7bdat``, ``xfrance.sas7bdat``, ``xoutline.sas7bdat`` appear to be generated on a 32-bit 'Linux' Intel system. They have the same format as Windows files, except for the (ignorable) OS strings in the first header.
724 | 725 | Text may appear in non-ASCII compatible, partially ASCII compatible, or multi-byte encodings. In particular, Kasper Sorenson discovered some text that appears to be encoded using the Windows-1252 'code page'. 726 | 727 | **Key Test Files** 728 | 729 | ================================= ====================================== 730 | filename format features 731 | ================================= ====================================== 732 | ``acadindx.sas7bdat`` non-u64, Intel (most files are like this one) 733 | ``br.sas7bdat`` truncated doubles (widths 3,4,6; compare with br2 widths all 8) 734 | ``eyecarex.sas7bdat`` non-u64, non-Intel, written by Stat/Transfer 735 | ``txzips.sas7bdat`` u64, Intel 736 | ``natlterr1994.sas7bdat`` u64, non-Intel 737 | ``hltheds2006.sas7bdat`` 2 Column Attributes subheaders 738 | ``moshim.sas7bdat`` 3 Column Attributes subheaders 739 | ``flightdelays.sas7bdat`` 2 Column Text subheaders 740 | ``ymcls_p2_long_040506.sas7bdat`` 5 Column Text subheaders, first Column Attributes subheader is on page 6 741 | ``flightschedule.sas7bdat`` 2+ Column Text subheaders 742 | ``internationalflight.sas7bdat`` 2+ Column Text subheaders 743 | ``marchflights.sas7bdat`` 2+ Column Text subheaders 744 | ``mechanicslevel1.sas7bdat`` 2+ Column Text subheaders 745 | ``compress_yes.sas7bdat`` COMPRESS=CHAR, one PGTYPE=-28672, no RLE compression (COMP=0) 746 | ``obs_all_perf_1.sas7bdat`` COMPRESS=CHAR, many PGTYPE=16384, much RLE compression (COMP=4) 747 | ================================= ====================================== 748 | 749 | 750 | Compression Data 751 | ================ 752 | 753 | The table below presents the results of compression tests on a collection of 142 SAS7BDAT data files (sources in ``data/``). 
The 'type' field represents the type of compression, 'ctime' is the compression time (in seconds), 'dtime' is the decompression time, and the 'compression ratio' field holds the cumulative disk usage (in megabytes) before and after compression. Although the ``xz`` algorithm requires significantly more time to compress these data, the decompression time is on par with gzip. 754 | 755 | ============= ====== ====== ========================= 756 | type ctime dtime compression ratio 757 | ============= ====== ====== ========================= 758 | gzip -9 76.7s 2.6s 541M / 30.3M = 17.9 759 | bzip2 -9 92.7s 11.2s 541M / 19.0M = 28.5 760 | xz -9 434.2s 2.7s 541M / 12.8M = 42.3 761 | ============= ====== ====== ========================= 762 | 763 | 764 | Software Prototype 765 | ================== 766 | 767 | The prototype program for reading SAS7BDAT formatted files is implemented entirely in R (see file ``R/sas7bdat.R``). Files not recognized as having been generated under a Microsoft Windows platform are rejected (for now). Implementation of the ``read.sas7bdat`` function should be considered a 'reference implementation', and not one designed with performance in mind. 768 | 769 | There are certain advantages and disadvantages to developing a prototype of this nature in R. 770 | 771 | Advantages: 772 | 773 | 1. R is an interpreted language with a built-in debugger. Hence, experimental routines may be implemented and debugged quickly and interactively, without the need for an external compiler or debugger tools (e.g. gcc, gdb). 774 | 2. R programs are portable across a variety of computing platforms. This is especially important in the present context, because manipulating files stored on disk is a platform-specific task. Platform-specific operations are abstracted from the R user. 775 | 776 | Disadvantages: 777 | 778 | 1. Manipulating binary (raw) data in R is a relatively new capability.
The best tools and practices for binary data operations are not as developed as those for other data types. 779 | 2. Interpreted code is often much less efficient than compiled code. This is not a major disadvantage for prototype implementations because human code development is far less efficient than the R interpreter. Gains made in efficient code development using an interpreted language far outweigh the benefit of compiled languages. 780 | 781 | Another software implementation was made by Clint Cummins, in the TSP econometrics package (mainly as an independent platform for exploring the format). 782 | 783 | ToDo 784 | ==== 785 | 786 | - obtain test files which use COMPRESS=BINARY, and develop identification and uncompression procedures 787 | - look for data which will reliably distinguish between structural subheaders (which have one of the known signatures) and uncompressed row data, which may have row data in the signature position that matches one of the known signatures. Both use COMP=0. Are NPSHD and NSHPL sufficient to do this? 788 | - obtain test files with more than 2.1 billion (and more than 4.2 billion) data rows, i.e. where 8 byte integer TRC in **u64** is apparently needed. Do the non-u64 files handle this, with additional fields beyond the 4 byte TRC used for segmentation? Is TRC a (signed) int or (unsigned) uint? 789 | - identify any SAS7BDAT encryption flag (this is not the same as 'cracking', or breaking encryption); we just identify if a file is encrypted and not readable without a key 790 | - experiment further with 'amendment page' concept 791 | - consider header bytes -by- SAS_host 792 | - check that only one page of type "mix" is observed. If so insert "In all test cases (``data/sources.csv``), there are exactly zero or one pages of type 'mix'." under the `Page Offset Table`_ header. [May not be needed, because the BC and SC fields in each Page Offset Table make the `MRC`_ field in the initial header unnecessary.]
793 | - identify all missing value representations: missing numeric values appear to be represented as '0000000000D1FFFF' (nan) for numeric 'double' quantities. 794 | - identify purpose of various unknown header quantities 795 | - determine purpose of Column List subheader 796 | - determine purpose and pattern of 'page sequence signature' fields. Are they useful? 797 | - identify how non-ASCII encoding is specified 798 | - implement R options to read just header (and subheader) information without data, and an option to read just some data fields, and not all fields. [The TSP implementation already does this, and can also read a subset of the data rows.] 799 | --------------------------------------------------------------------------------