├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── DESCRIPTION ├── Makefile ├── NAMESPACE ├── R ├── colours.R ├── read.R ├── read_comments.R ├── read_rels.R ├── read_style.R ├── read_workbook.R ├── read_worksheet.R ├── readxl.R ├── rexcel-package.r ├── shared_strings.R └── utils.R ├── README.md ├── appveyor.yml ├── inst └── sheets │ ├── Ekaterinburg_IP_9-RESAVED.xlsx │ ├── Ekaterinburg_IP_9.xlsx │ ├── README.md │ ├── defined-names.xlsx │ ├── exp.xlsx │ ├── gabe.xlsx │ ├── gs-test-formula-formatting.xlsx │ ├── mini-gap.xlsx │ └── only_numbers.xlsx ├── internal ├── ekaterinburg.R ├── img │ ├── Kinsey-Male.jpg │ ├── Kinsey-Spreadsheet.jpg │ ├── doctorow_2016-May-11.jpg │ ├── enron-example.png │ ├── excel-reactivity.gif │ └── excel-reactivity.mov └── mini-gap │ ├── [Content_Types].xml │ ├── _rels │ ├── .rels │ └── rels.xml │ └── xl │ ├── _rels │ └── workbook.xml.rels │ ├── drawings │ ├── worksheetdrawing1.xml │ ├── worksheetdrawing2.xml │ ├── worksheetdrawing3.xml │ ├── worksheetdrawing4.xml │ └── worksheetdrawing5.xml │ ├── sharedStrings.xml │ ├── styles.xml │ ├── workbook.xml │ └── worksheets │ ├── _rels │ ├── sheet1.xml.rels │ ├── sheet2.xml.rels │ ├── sheet3.xml.rels │ ├── sheet4.xml.rels │ └── sheet5.xml.rels │ ├── sheet1.xml │ ├── sheet2.xml │ ├── sheet3.xml │ ├── sheet4.xml │ └── sheet5.xml ├── man ├── rexcel.Rd ├── rexcel_read.Rd ├── rexcel_read_workbook.Rd └── rexcel_readxl.Rd ├── rexcel.Rproj └── tests ├── testthat.R └── testthat ├── helper-rexcel.R ├── test-colours.R ├── test-gs.R ├── test-readxl.R ├── test-rexcel-read-workbook.R ├── test-view.R └── test-xlsx-utils.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^Makefile$ 4 | ^README.Rmd$ 5 | ^.travis.yml$ 6 | ^appveyor.yml$ 7 | ^internal$ 8 | inst/sheets/README.md 9 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | example.xlsx 5 | tests/testthat/readxl 6 | tests/testthat/*.xlsx 7 | inst/doc 8 | ~$*.xlsx 9 | tests/testthat/reference/readxl 10 | internal/2016-06-09_skype.md 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: r 2 | cache: packages 3 | warnings_are_errors: true 4 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rexcel 2 | Title: R/Excel Linen Driver 3 | Version: 0.0.1 4 | Author: Rich FitzJohn and Jenny Bryan 5 | Maintainer: Rich FitzJohn 6 | Description: rexcel now is part of a complete sentence. 7 | License: GPL-3 8 | LazyData: true 9 | URL: https://github.com/rsheets/rexcel 10 | BugReports: https://github.com/rsheets/rexcel/issues 11 | Imports: 12 | cellranger (>= 1.1.0), 13 | linen (>= 0.0.4), 14 | progress, 15 | tibble (>= 1.1), 16 | xml2 (>= 1.0.0) 17 | Suggests: 18 | testthat, 19 | googlesheets (>= 0.2.0), 20 | rprojroot, 21 | readxl 22 | RoxygenNote: 5.0.1.9000 23 | Remotes: 24 | hadley/xml2, 25 | rsheets/linen 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | PACKAGE := $(shell grep '^Package:' DESCRIPTION | sed -E 's/^Package:[[:space:]]+//') 2 | RSCRIPT = Rscript --no-init-file 3 | 4 | all: install 5 | 6 | test: 7 | STORR_SKIP_DOWNLOADS=true make test_all 8 | 9 | test_all: 10 | ${RSCRIPT} -e 'library(methods); devtools::test()' 11 | 12 | roxygen: 13 | @mkdir -p man 14 | ${RSCRIPT} -e "library(methods); devtools::document()" 15 | 16 | install: 17 | R CMD INSTALL . 18 | 19 | build: 20 | R CMD build . 21 | 22 | check: 23 | _R_CHECK_CRAN_INCOMING_=FALSE make check_all 24 | 25 | check_all: build 26 | R CMD check --as-cran --no-manual `ls -1tr ${PACKAGE}*gz | tail -n1` 27 | @rm -f `ls -1tr ${PACKAGE}*gz | tail -n1` 28 | @rm -rf ${PACKAGE}.Rcheck 29 | 30 | autodoc: 31 | ${RSCRIPT} autodoc.R process 32 | 33 | staticdocs: 34 | @mkdir -p inst/staticdocs 35 | ${RSCRIPT} -e "library(methods); staticdocs::build_site()" 36 | rm -f vignettes/*.html 37 | @rmdir inst/staticdocs 38 | website: staticdocs 39 | ./update_web.sh 40 | 41 | # No real targets! 42 | .PHONY: all test document install vignettes 43 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(rexcel_read) 4 | export(rexcel_read_workbook) 5 | export(rexcel_readxl) 6 | -------------------------------------------------------------------------------- /R/colours.R: -------------------------------------------------------------------------------- 1 | col_apply_tint <- function(col, tint) { 2 | if (length(tint) == 1L && length(col) > 1L) { 3 | tint <- rep(tint, length(col)) 4 | } 5 | i <- tint < 0 6 | hsl <- rgb2hsl(grDevices::col2rgb(col)) 7 | if (any(i)) { 8 | hsl[3L, i] <- hsl[3L, i] * (1 + tint) 9 | } 10 | i <- !i 11 | if (any(i)) { 12 | hsl[3L, i] <- hsl[3L, i] * (1 - tint) + tint 13 | } 14 | rgb2col(hsl2rgb(hsl)) 15 | } 16 | 17 | ## NOTE: the spec is unfortunately a little vague about the 18 | ## interpretation of the alpha channel; in the example colours 19 | ## (p. 1763) they use 00 to indicate opacity but empirically (and 20 | ## conventionally) FF is used. 21 | argb2rgb <- function(x, opaque="FF") { 22 | a <- substr(x, 1L, 2L) 23 | rgb <- paste0("#", substr(x, 3L, 8L)) 24 | if (a == opaque) rgb else paste0(rgb, a) 25 | } 26 | 27 | check_col_matrix <- function(m) { 28 | if (!is.matrix(m)) { 29 | if (length(m) == 3L) { 30 | m <- matrix(m, 3L, 1L) 31 | } else if (length(m) == 4L) { 32 | m <- matrix(m, 3L, 1L) 33 | } else { 34 | stop("Invalid input for m") 35 | } 36 | } 37 | if (!any(nrow(m) == c(3, 4))) { 38 | stop("Invalid input for m") 39 | } 40 | m 41 | } 42 | 43 | rgb2hsl <- function(m) { 44 | m <- check_col_matrix(m) 45 | if (length(m) == 0L) { 46 | nms <- c("h", "s", "l", if (nrow(m) == 4L) "a") 47 | ret <- matrix(numeric(), length(nms), 0) 48 | rownames(ret) <- nms 49 | return(ret) 50 | } 51 | m <- m / 255 52 | r <- apply(m[1:3, , drop=FALSE], 2, range) 53 | c_min <- r[1L, ] 54 | c_max <- r[2L, ] 55 | delta <- c_max - c_min 56 | 57 | l <- (c_max + c_min) / 2 58 | s <- delta / (1 - abs(2 * l - 1)) 59 | 60 | r <- m[1L, ] 61 | g <- m[2L, ] 62 | b <- m[3L, ] 63 | 64 | i <- apply(m[1:3, , drop=FALSE], 2, which.max) 65 | h <- numeric(length(s)) 66 | j <- i == 1L 67 | h[j] <- (g[j] - b[j]) / delta[j] %% 6 68 | j <- i == 2L 69 | h[j] <- (b[j] - r[j]) / delta[j] + 2 70 | j <- i == 3L 71 | h[j] <- (r[j] - g[j]) / delta[j] + 4 72 | 73 | h <- h / 6 74 | 75 | i <- delta == 0 76 | h[i] <- s[i] <- 0 77 | 78 | i <- h < 0 79 | h[i] <- h[i] %% 1 80 | 81 | if (nrow(m) == 4L) { 82 | rbind(h, s, l, a=m[4L, ]) 83 | } else { 84 | rbind(h, s, l) 85 | } 86 | } 87 | 88 | hsl2rgb <- function(m) { 89 | m <- check_col_matrix(m) 90 | 91 | h <- m[1L, ] * 360 / 60 92 | s <- m[2L, ] 93 | l <- m[3L, ] 94 | 95 | C <- (1 - abs(2 * l - 1)) * s 96 | X <- C * (1 - abs(h %% 2 - 1)) 97 | H <- floor(h) 98 | 99 | cx <- rbind(C, X) 100 | rgb <- array(0, dim(m)) 101 | rgb[1:2, H == 0] <- cx[, H == 0] 102 | rgb[1:2, H == 1] <- cx[2:1, H == 1] 103 | rgb[2:3, H == 2] <- cx[, H == 2] 104 | rgb[2:3, H == 3] <- cx[2:1, H == 3] 105 | rgb[c(1, 3), H == 4] <- cx[2:1, H == 4] 106 | rgb[c(1, 3), H == 5] <- cx[, H == 5] 107 | 108 | mm <- l - C / 2 109 | ret <- (rgb + rep(mm, each=3)) * 255 110 | 111 | rownames(ret) <- c("red", "green", "blue") 112 | if (nrow(m) == 4L) { 113 | ret <- rbind(ret, alpha=m[4L, ] * 255) 114 | } 115 | ret 116 | } 117 | 118 | rgb2col <- function(m) { 119 | grDevices::rgb(m[1, ], m[2, ], m[3, ], 120 | alpha = if (nrow(m) == 4L) m[4, ], maxColorValue = 255) 121 | } 122 | -------------------------------------------------------------------------------- /R/read.R: -------------------------------------------------------------------------------- 1 | ##' This function does not get the data into a usable form but at 2 | ##' least loads it up into R so we can poke about with it. The 3 | ##' resulting loaded data can distinguish between formulae and data, 4 | ##' numbers and text. Merged cells are detected. Style information 5 | ##' is included, though nothing is done with it yet. A summary of the 6 | ##' data is printed if you print the resulting object. 7 | ##' 8 | ##' @title Read an xlsx file that probably contains nontabular data 9 | ##' 10 | ##' @param path Path to the xlsx file to load. xls files are not supported. 11 | ##' 12 | ##' @param sheet Sheet number (not name at this point). Googlesheets 13 | ##' exported sheets are likely not to do the right thing. 14 | ##' 15 | ##' @return An \code{xlsx} object that can be printed. Future methods 16 | ##' might do something sensible. The structure is subject to 17 | ##' complete change so is not documented here. 18 | ##' @export 19 | rexcel_read <- function(path, sheet=1L) { 20 | rexcel_read_workbook(path, sheet, FALSE)$sheets[[1L]] 21 | } 22 | 23 | ##' Read an entire workbook 24 | ##' 25 | ##' @title Read an Excel workbook 26 | ##' 27 | ##' @param path Path to the xlsx file to load. xls files are not supported. 28 | ##' 29 | ##' @param sheets Character or integer vector of sheets to read, or 30 | ##' \code{NULL} to read all sheets (the default) 31 | ##' 32 | ##' @param progress Display a progress bar? 33 | ##' @export 34 | rexcel_read_workbook <- function(path, sheets=NULL, progress=TRUE) { 35 | 36 | if (!is_xlsx(path)) { 37 | stop("`path` does not appear to point to valid xlsx:\n", path, 38 | call. = FALSE) 39 | } 40 | 41 | dat <- xlsx_read_workbook(path) 42 | 43 | if (is.null(sheets)) { 44 | sheets <- xlsx_sheet_names(dat) 45 | } else if (is.numeric(sheets)) { 46 | sheets <- xlsx_sheet_names(dat)[sheets] 47 | } 48 | 49 | p <- progress(paste0(basename(path), " [:bar] :current / :total"), 50 | length(sheets), show=progress) 51 | p(0) 52 | 53 | ## Some of this will move into the worksheet and save some of the 54 | ## ugly options passinh here. 55 | strings <- xlsx_read_shared_strings(path) 56 | date_offset <- xlsx_date_offset(path) 57 | 58 | style_xlsx <- xlsx_read_style(path) 59 | lookup <- tibble::tibble( 60 | font = style_xlsx$cell_xfs$font_id, 61 | fill = style_xlsx$cell_xfs$fill_id, 62 | border = style_xlsx$cell_xfs$border_id, 63 | num_fmt = style_xlsx$cell_xfs$num_fmt_id) 64 | 65 | ## This becomes read_number_formats? 66 | if (nrow(style_xlsx$num_fmts) > 0L) { 67 | n <- max(style_xlsx$num_fmts$num_format_id) 68 | fmt <- rep(NA_character_, n) 69 | fmt[seq_along(xlsx_format_codes())] <- xlsx_format_codes() 70 | fmt[style_xlsx$num_fmts$num_format_id] <- style_xlsx$num_fmts$format_code 71 | custom_date <- style_xlsx$num_fmts$num_format_id[ 72 | grepl("[dmyhs]", style_xlsx$num_fmts$format_code)] 73 | } else { 74 | fmt <- xlsx_format_codes() 75 | custom_date <- integer(0) 76 | } 77 | is_date_time <- xlsx_is_date_time(seq_along(fmt), custom_date) 78 | num_fmt <- tibble::tibble(num_fmt = fmt, is_date_time = is_date_time) 79 | 80 | style <- linen::linen_style(lookup, font=style_xlsx$fonts, 81 | fill=style_xlsx$fills, 82 | border=style_xlsx$borders, 83 | num_fmt=num_fmt) 84 | 85 | workbook <- linen::workbook(sheets, style, dat$defined_names) 86 | for (s in sheets) { 87 | p(1) 88 | rexcel_read_worksheet(path, s, workbook, dat, strings, style, date_offset) 89 | } 90 | 91 | workbook 92 | } 93 | 94 | ## The name here is a bit of a gong show, as is the general logic. I 95 | ## hope this will refine a bit over the next little bit. 96 | rexcel_read_worksheet <- function(path, sheet, workbook, 97 | workbook_dat, strings, style, date_offset) { 98 | if (is.numeric(sheet)) { 99 | sheet_name <- workbook$names[[sheet]] 100 | } else if (is.character(sheet)) { 101 | sheet_name <- sheet 102 | } else { 103 | stop("Invalid input for sheet") 104 | } 105 | 106 | target <- xlsx_internal_sheet_name(sheet, workbook_dat) 107 | rels <- xlsx_read_rels(path, target) 108 | 109 | xml <- xlsx_read_sheet(path, sheet, workbook_dat) 110 | ns <- xml2::xml_ns(xml) 111 | 112 | merged <- xlsx_read_merged(xml, ns) 113 | view <- xlsx_ct_worksheet_views(xml, ns) 114 | cols <- xlsx_ct_cols(xml, ns) # NOTE: not used yet 115 | dat <- xlsx_parse_cells(xml, ns, strings, style, date_offset) 116 | rows <- dat$rows 117 | cells <- linen::cells(dat$cells$ref, dat$cells$style, dat$cells$type, 118 | dat$cells$value, dat$cells$formula) 119 | 120 | comments <- NULL 121 | if (!is.null(rels)) { 122 | path_comments <- rels$target_abs[rels$type == "comments"] 123 | if (length(path_comments) == 1L) { 124 | comments <- xlsx_read_comments(path, path_comments) 125 | } else if (length(path_comments) > 1L) { 126 | stop("CHECK THIS") # TODO: assertion. 127 | } 128 | } 129 | 130 | linen::worksheet(sheet_name, cols, rows, cells, merged, view, comments, 131 | workbook) 132 | } 133 | 134 | xlsx_list_files <- function(path) { 135 | ret <- tibble::as_tibble(utils::unzip(path, list = TRUE)) 136 | names(ret) <- tolower(names(ret)) 137 | ret[order(ret$name), ] 138 | } 139 | 140 | xlsx_read_sheet <- function(path, sheet, workbook_dat) { 141 | xml <- xlsx_read_file(path, xlsx_internal_sheet_name(sheet, workbook_dat)) 142 | stopifnot(xml2::xml_name(xml) == "worksheet") 143 | xml 144 | } 145 | 146 | xlsx_read_file <- function(path, file) { 147 | tmp <- tempfile() 148 | dir.create(tmp) 149 | ## Oh boy more terrible default behaviour. 150 | filename <- tryCatch(utils::unzip(path, file, exdir=tmp), 151 | warning=function(e) stop(e)) 152 | on.exit(unlink(tmp, recursive=TRUE)) 153 | xml2::read_xml(filename) 154 | } 155 | 156 | xlsx_read_file_if_exists <- function(path, file, missing=NULL) { 157 | ## TODO: Appropriate error handling here is difficult; we should 158 | ## check that `path` exists, but by the time that this is called we 159 | ## know that already. 160 | tmp <- tempfile() 161 | dir.create(tmp) 162 | filename <- tryCatch(utils::unzip(path, file, exdir=tmp), 163 | warning=function(e) NULL, 164 | error=function(e) NULL) 165 | if (is.null(filename)) { 166 | missing 167 | } else { 168 | on.exit(unlink(tmp, recursive=TRUE)) 169 | xml2::read_xml(filename) 170 | } 171 | } 172 | 173 | ## sheetData: https://msdn.microsoft.com/EN-US/library/office/documentformat.openxml.spreadsheet.sheetdata.aspx 174 | ## 175 | ## Nothing looks interesting in sheetData, and all elements must be 176 | ## 'row'. 177 | ## 178 | ## row: https://msdn.microsoft.com/EN-US/library/office/documentformat.openxml.spreadsheet.row.aspx 179 | ## The only interesting attribute here is "hidden", but that 180 | ## includes being collapsed by outline, so probably not that 181 | ## interesting. 182 | ## 183 | ## cell: https://msdn.microsoft.com/EN-US/library/office/documentformat.openxml.spreadsheet.cell.aspx 184 | ## 185 | ## Might contain 186 | ## : formula 187 | ## rich test inline 188 | ## value 189 | ## Interesting attributes: 190 | ## r: an A1 style reference to the locatiopn of this cell 191 | ## s: the index of this cell's style (if colours are used as a guide) 192 | ## t: type "an enumeration representing the cell's data type", the 193 | ## only reference to which I can find is 194 | ## http://mailman.vse.cz/pipermail/sc34wg4/attachments/20100428/3fc0a446/attachment-0001.pdf 195 | ## - b: boolean 196 | ## - d: date (ISO 8601) 197 | ## - e: error 198 | ## - inlineStr: inline string in rich text format, with 199 | ## contents in the 'is' element, not the 'v' element. 200 | ## - n: number 201 | ## - s: shared string 202 | ## - str: formula string 203 | ## 204 | ## However, many numbers seem not to have a "t" attribute which is 205 | ## charming. 206 | ## 207 | ## NOTE: handling of formulae is potentially tricky as they can have an attribute "shared" which 208 | ## 209 | ## Blank cells have no children at all. 210 | ## 211 | ## See readxl/src/XlsxCell.h: XlsxCell::type() 212 | xlsx_parse_cells <- function(xml, ns, strings, style_data, date_offset) { 213 | sheet_data <- xlsx_read_sheet_data(xml, ns, strings) 214 | cells <- sheet_data$cells 215 | rows <- sheet_data$rows 216 | 217 | is_date_time <- linen::style_lookup(style_data, idt = "num_fmt/is_date_time", 218 | idx = cells$style)$idt 219 | is_date_time[is.na(is_date_time)] <- FALSE 220 | 221 | ## TODO: could/should this not be done when 'cells' is first loaded? 222 | type <- character(nrow(cells)) 223 | type[!is.na(cells$type) & cells$type == "b"] <- "bool" 224 | type[!is.na(cells$type) & cells$type == "s" | cells$type == "str"] <- "text" 225 | i <- is.na(cells$type) | cells$type == "n" 226 | type[i] <- ifelse(is_date_time[i], "date", "number") 227 | type[lengths(cells$value) == 0L] <- "blank" 228 | cells$type <- type 229 | 230 | i <- type == "date" 231 | cells$value[i] <- 232 | as.list(as.POSIXct(unlist(cells$value[i]) * 86400, "UTC", date_offset)) 233 | 234 | list(cells=cells, rows=rows) 235 | } 236 | 237 | xlsx_sheet_names <- function(dat) { 238 | if (is.character(dat)) { 239 | dat <- xlsx_read_workbook(dat) 240 | } 241 | sheets <- dat$sheets 242 | sheets$name[sheets$type == "worksheet" & sheets$state != "veryHidden"] 243 | } 244 | 245 | ## Return the filename within the bundle 246 | xlsx_internal_sheet_name <- function(sheet, dat) { 247 | if (length(sheet) != 1L) { 248 | stop("'sheet' must be a scalar") 249 | } 250 | if (is.na(sheet)) { 251 | stop("'sheet' must be non-missing") 252 | } 253 | 254 | sheets <- dat$sheets 255 | sheets <- sheets[sheets$type == "worksheet" & sheets$state != "veryHidden", ] 256 | 257 | if (is.character(sheet)) { 258 | target <- sheets$target_abs[match(sheet, sheets$name)] 259 | } else if (is.numeric(sheet)) { 260 | target <- sheets$target_abs[[sheet]] 261 | } else { 262 | stop("invalid input") 263 | } 264 | target 265 | } 266 | 267 | ## NOTE: Date handling will change a bit once I get the string parsing 268 | ## stuff entirely worked out. 269 | xlsx_date_offset <- function(path) { 270 | ## See readxl/src/utils.h: dateOffset 271 | ## See readxl/src/XlsxWorkbook.h: is1904 272 | xml <- xlsx_read_file(path, "xl/workbook.xml") 273 | ns <- xml2::xml_ns(xml) 274 | xpath <- sprintf("/%s/%s/@date1904", 275 | xlsx_name("workbook", ns), xlsx_name("workbookPr", ns)) 276 | date1904 <- xml2::xml_find_first(xml, xpath, xml2::xml_ns(xml)) 277 | if (inherits(date1904, "xml_missing")) { 278 | date_is_1904 <- FALSE 279 | } else { 280 | ## TODO: in theory we should do whatever atoi would allow here 281 | ## (that's what Hadley uses in the C++) but I have a sheet that 282 | ## contains this as "false". So I'm trying this way for now. 283 | value <- xml2::xml_text(date1904) 284 | date_is_1904 <- value == "1" || value == "true" 285 | } 286 | if (date_is_1904) "1904-01-01" else "1899-12-30" 287 | } 288 | 289 | xlsx_is_date_time <- function(id, custom) { 290 | ## See readxl's src/CellType.h: isDateTime() 291 | id %in% c(c(14:22, 27:36, 45:47, 50:58, 71:81), custom) 292 | } 293 | -------------------------------------------------------------------------------- /R/read_comments.R: -------------------------------------------------------------------------------- 1 | ## [x] 18.7.1 author (Author) -- xlsx_ct_authors 2 | ## [x] 18.7.2 authors (Authors) -- (in xlsx_ct_author) 3 | ## [x] 18.7.3 comment (Comment) -- xlsx_ct_comment 4 | ## [x] 18.7.4 commentList (List of Comments) -- xlsx_ct_comment_list 5 | ## [ ] 18.7.5 commentPr (Comment Properties) 6 | ## [x] 18.7.6 comments (Comments) -- xlsx_ct_comments 7 | ## [x] 18.7.7 text (Comment Text) -- xlsx_ct_rst 8 | xlsx_read_comments <- function(path, file) { 9 | xml <- xlsx_read_file(path, file) 10 | xlsx_ct_comments(xml, xml2::xml_ns(xml)) 11 | } 12 | 13 | xlsx_ct_comments <- function(xml, ns) { 14 | authors <- xlsx_ct_authors(xml, ns) 15 | xlsx_ct_comment_list(xml, ns, authors) 16 | } 17 | 18 | xlsx_ct_authors <- function(xml, ns) { 19 | authors <- 20 | xml2::xml_children(xml2::xml_find_first(xml, xlsx_name("authors", ns), ns)) 21 | vcapply(authors, xml2::xml_text) 22 | } 23 | 24 | xlsx_ct_comment_list <- function(xml, ns, authors) { 25 | process_container(xml, xlsx_name("commentList", ns), ns, 26 | xlsx_ct_comment, authors) 27 | } 28 | 29 | xlsx_ct_comment <- function(x, ns, authors) { 30 | at <- as.list(xml2::xml_attrs(x)) 31 | text <- xlsx_ct_rst(xml2::xml_find_first(x, xlsx_name("text", ns), ns), ns) 32 | tibble::tibble( 33 | ref = attr_character(at$ref), 34 | author = authors[attr_integer(at$authorId) + 1L], 35 | shape_id = attr_integer(at$shapeId), 36 | text = text) 37 | } 38 | -------------------------------------------------------------------------------- /R/read_rels.R: -------------------------------------------------------------------------------- 1 | 2 | xlsx_read_rels <- function(path, file) { 3 | xml <- xlsx_read_file_if_exists(path, xlsx_path_rels(file)) 4 | if (is.null(xml)) { 5 | NULL 6 | } else { 7 | ## TODO: These are allowed to be external references I think; in 8 | ## which case the abs path here is not correct. 9 | ## 10 | ## TODO: In the case where these are absolute references, the 11 | ## relative references are incorrect, but doing this properly 12 | ## requires some path arithmetic. 13 | ## 14 | ## NOTE: while this is an "absolute" path, we lack the initial slash... 15 | ret <- rbind_df(lapply(xml2::xml_children(xml), xlsx_parse_relationship)) 16 | 17 | target_abs <- ret$target 18 | i <- grepl("^/", target_abs) 19 | if (any(i)) { 20 | target_abs[i] <- sub("^/", "", target_abs[i]) 21 | } 22 | j <- !i 23 | if (any(j)) { 24 | target_abs[j] <- path_join(dirname(file), target_abs[j]) 25 | } 26 | ret$target_abs <- target_abs 27 | ret 28 | } 29 | } 30 | 31 | xlsx_parse_relationship <- function(x) { 32 | at <- as.list(xml2::xml_attrs(x)) 33 | tibble::tibble( 34 | id = attr_character(at$Id), 35 | type = basename(at$Type), 36 | target = at$Target) 37 | } 38 | 39 | ## Part 2, 9.3.3, p. 24 40 | ## > A special naming convention is used for the Relationships 41 | ## > part. First, the Relationships part for a part in a given folder 42 | ## > in the name hierarchy is stored in a sub-folder called 43 | ## > “_rels”. Second, the name of the Relationships part is formed by 44 | ## > appending “.rels” to the name of the original part. Package 45 | ## > relationships are found in the package relationships part named 46 | ## > “/_rels/.rels”. 47 | xlsx_path_rels <- function(filename) { 48 | file.path(dirname(filename), "_rels", 49 | paste0(basename(filename), ".rels")) 50 | } 51 | -------------------------------------------------------------------------------- /R/read_style.R: -------------------------------------------------------------------------------- 1 | ## Basically everything here follows from "Styles": section 18.8 (p. 1744). 2 | ## 3 | ## I have been fairly thorough at pulling things in, but it's not 4 | ## really complete. There are enumeration types that I have not 5 | ## turned into factors (arguments go either way, really). There are a 6 | ## few things that are going to be very hard to deal with, too. 7 | ## 8 | ## [x] 18.8.1 alignment (Alignment) -- xlsx_ct_alignment 9 | ## [x] 18.8.2 b (Bold) -- xlsx_ct_boolean_property 10 | ## [x] 18.8.3 bgColor (Background Color) -- xlsx_ct_color 11 | ## [x] 18.8.4 border (Border) -- xlsx_ct_border 12 | ## [x] 18.8.5 borders (Borders) -- xlsx_ct_borders 13 | ## [x] 18.8.6 bottom (Bottom Border) -- xlsx_ct_border_pr 14 | ## [x] 18.8.7 cellStyle (Cell Style) -- xlsx_ct_cell_style 15 | ## [x] 18.8.8 cellStyles (Cell Styles) -- xlsx_ct_cell_styles 16 | ## [x] 18.8.9 cellStyleXfs (Formatting Records) -- xlsx_ct_cell_style_xfs 17 | ## [x] 18.8.10 cellXfs (Cell Formats) -- xlsx_ct_cell_xfs 18 | ## [ ] 18.8.11 colors (Colors) 19 | ## [x] 18.8.12 condense (Condense) -- xlsx_ct_boolean_property 20 | ## [-] 18.8.13 diagonal (Diagonal) -- (xlsx_ct_border_pr) 21 | ## [ ] 18.8.14 dxf (Formatting) 22 | ## [ ] 18.8.15 dxfs (Formats) 23 | ## [x] 18.8.16 end (Trailing Edge Border) -- xlsx_ct_border_pr 24 | ## [x] 18.8.17 extend (Extend) -- xlsx_ct_boolean_property 25 | ## [x] 18.8.18 family (Font Family) -- xlsx_st_font_family 26 | ## [x] 18.8.19 fgColor (Foreground Color) -- xlsx_ct_color 27 | ## [x] 18.8.20 fill (Fill) -- xlsx_ct_fill 28 | ## [x] 18.8.21 fills (Fills) -- xlsx_ct_fills 29 | ## [x] 18.8.22 font (Font) -- xlsx_ct_font 30 | ## [x] 18.8.23 fonts (Fonts) -- xlsx_ct_fonts 31 | ## [x] 18.8.24 gradientFill (Gradient) -- xlsx_ct_gradient_fill 32 | ## [ ] 18.8.25 horizontal (Horizontal Inner Borders) 33 | ## [x] 18.8.26 i (Italic) -- xlsx_ct_boolean_property 34 | ## [x] 18.8.27 indexedColors (Color Indexes) -- xlsx_ct_indexed_colors 35 | ## [ ] 18.8.28 mruColors (MRU Colors) 36 | ## [x] 18.8.29 name (Font Name) -- (plain text handling in xlsx_ct_font) 37 | ## [x] 18.8.30 numFmt (Number Format) -- xlsx_ct_num_fmt 38 | ## [x] 18.8.31 numFmts (Number Formats) -- xlsx_ct_num_fmts 39 | ## [x] 18.8.32 patternFill (Pattern) -- xlsx_ct_pattern_fill 40 | ## [ ] 18.8.33 protection (Protection Properties) 41 | ## [x] 18.8.34 rgbColor (RGB Color) -- xlsx_ct_rgbcolor 42 | ## [x] 18.8.35 scheme (Scheme) -- (plain text handling in xlsx_ct_font) 43 | ## [x] 18.8.36 shadow (Shadow) -- xlsx_ct_boolean_property 44 | ## [x] 18.8.37 start (Leading Edge Border) -- xlsx_ct_border_pr 45 | ## [ ] 18.8.38 stop (Gradient Stop) 46 | ## [x] 18.8.39 styleSheet (Style Sheet) -- xlsx_read_style 47 | ## [ ] 18.8.40 tableStyle (Table Style) 48 | ## [ ] 18.8.41 tableStyleElement (Table Style) 49 | ## [ ] 18.8.42 tableStyles (Table Styles) 50 | ## [x] 18.8.43 top (Top Border) -- xlsx_ct_border_pr 51 | ## [ ] 18.8.44 vertical (Vertical Inner Border) 52 | ## [x] 18.8.45 xf (Format) -- xlsx_ct_xf 53 | ## 54 | ## Most elements at some point things get called a CT_ (for 55 | ## "Complex Type"), e.g., CT_Color; at that point the processing thing 56 | ## is called xlsx_ct_boolean_property or similar. Note that this 57 | ## drives off the *type*, not off the element name. These are often, 58 | ## but not always, the same. 59 | ## 60 | ## There are no strong argument conventions either; if anything can 61 | ## contain a colour the both theme and index are passed through, as 62 | ## these contain information to convert colour types into an RGB 63 | ## triplet. If any Xpath query is used then the namespace is passed 64 | ## along as ns. 65 | ## 66 | ## From the point of view of the rest of the package, the only entry 67 | ## point to use is xlsx_read_style which will return a list of a great 68 | ## many data.frames (the format here will get cleaned up soon). 69 | ## 70 | ## Some of the needed functions here come from the shared string table. 71 | ## 72 | ## 73 | xlsx_read_style <- function(path) { 74 | xml <- xlsx_read_file(path, "xl/styles.xml") 75 | ns <- xml2::xml_ns(xml) 76 | 77 | theme <- xlsx_read_theme(path) 78 | index <- xlsx_ct_indexed_colors(xml, ns) 79 | 80 | fonts <- xlsx_ct_fonts(xml, ns, theme, index) 81 | fills <- xlsx_ct_fills(xml, ns, theme, index) 82 | borders <- xlsx_ct_borders(xml, ns, theme, index) 83 | 84 | cell_style_xfs <- xlsx_ct_cell_style_xfs(xml, ns) 85 | cell_xfs <- xlsx_ct_cell_xfs(xml, ns) 86 | cell_styles <- xlsx_ct_cell_styles(xml, ns) 87 | num_fmts <- xlsx_ct_num_fmts(xml, ns) 88 | 89 | list(fonts=fonts, 90 | fills=fills, 91 | borders=borders, 92 | cell_style_xfs=cell_style_xfs, 93 | cell_xfs=cell_xfs, 94 | cell_styles=cell_styles, 95 | num_fmts=num_fmts) 96 | } 97 | 98 | ## NOTE: this only reads the the colour information from the theme as 99 | ## nothing else looks that exciting in there, really. 100 | xlsx_read_theme <- function(path) { 101 | ## TODO: Strictly, the theme information should come from the 102 | ## workbook.rels.xml file by looking to see which file has the 103 | ## appropriate officeDocument/2006/relationships/theme entry, but this 104 | ## should be fine for now. 105 | ## 106 | ## NOTE: MSDN suggests that this will always be theme1.xml for Excel 107 | ## and only n>1 for PowerPoint. 108 | xml <- xlsx_read_file_if_exists(path, "xl/theme/theme1.xml") 109 | if (is.null(xml)) { 110 | return(NULL) 111 | } 112 | ns <- xml2::xml_ns(xml) 113 | tmp <- xml2::xml_find_first(xml, "/a:theme/a:themeElements/a:clrScheme", ns) 114 | 115 | ## Empirical ordering, based on one random website. I have not 116 | ## found the support for this in the actual spec yet and have seen a 117 | ## few variants on the ordering listed there incl dk1/lt1/dk2/lt2/accent... 118 | 119 | nms <- c("lt1", "dk1", "lt2", "dk2", 120 | paste0("accent", 1:6), 121 | "hlink", "folHlink") 122 | f <- function(x, xml, ns) { 123 | tmp <- xml2::xml_find_first(xml, paste0(".//a:", x), ns) 124 | nd <- xml2::xml_children(tmp)[[1L]] 125 | at <- switch(xml2::xml_name(nd), sysClr="lastClr", srgbClr="val") 126 | paste0("#", xml2::xml_attr(nd, at, ns)) 127 | } 128 | pal <- vcapply(nms, f, xml, ns) 129 | 130 | list(palette=pal) 131 | } 132 | 133 | ## 18.8.23 fonts 134 | xlsx_ct_fonts <- function(xml, ns, theme, index) { 135 | process_container(xml, xlsx_name("fonts", ns), ns, xlsx_ct_font, 136 | theme, index) 137 | } 138 | 139 | ## 18.8.22 font 140 | ## 141 | ## The link to the actual definition is broken, but p. 3930, l 3797 142 | ## looks good. Beware of the similar but different CT_Font probably 143 | ## for Word's XML. 144 | ## 145 | ## Possible daughter elements (all optional but at most one of each present) 146 | ## 147 | ## name (CT_FontName) 148 | ## charset (CT_IntProperty) 149 | ## family (CT_FontFamily) 150 | ## b, i, strike, outline, shadow, condense, extend (CT_BooleanProperty) 151 | ## color (CT_Color) 152 | ## sz (CT_FontSize) 153 | ## u (CT_UnderlineProperty) 154 | ## vertAlign (CT_VerticalAlignFontProperty) - subscript / superscript 155 | ## scheme (CT_FontScheme) 156 | ## 157 | ## Looks like horizontal alignment comes through with the xf element 158 | ## in cellxfs, but I think I ignore that at the moment. Seems like an 159 | ## odd place tbh. 160 | ## 161 | ## Despite most elements being CT_*, most of this is just that if the 162 | ## element is present a "val" attribute is required. 163 | ## 164 | ## Note that some of the elements here are defined in the "Shared 165 | ## Strings" section of the spec. Others I have not tracked down yet. 166 | xlsx_ct_font <- function(x, ns, theme, index) { 167 | name <- xml2::xml_text( 168 | xml2::xml_find_first(x, xlsx_name("name/@val", ns), ns)) 169 | ## ignoring charset 170 | family <- xlsx_st_font_family( 171 | xml2::xml_find_first(x, xlsx_name("family", ns), ns)) 172 | 173 | b <- xlsx_ct_boolean_property( 174 | xml2::xml_find_first(x, xlsx_name("b", ns), ns)) 175 | i <- xlsx_ct_boolean_property( 176 | xml2::xml_find_first(x, xlsx_name("i", ns), ns)) 177 | strike <- xlsx_ct_boolean_property( 178 | xml2::xml_find_first(x, xlsx_name("strike", ns), ns)) 179 | outline <- xlsx_ct_boolean_property( 180 | xml2::xml_find_first(x, xlsx_name("outline", ns), ns)) 181 | shadow <- xlsx_ct_boolean_property( 182 | xml2::xml_find_first(x, xlsx_name("shadow", ns), ns)) 183 | condense <- xlsx_ct_boolean_property( 184 | xml2::xml_find_first(x, xlsx_name("condense", ns), ns)) 185 | extend <- xlsx_ct_boolean_property( 186 | xml2::xml_find_first(x, xlsx_name("extend", ns), ns)) 187 | 188 | color <- xlsx_ct_color( 189 | xml2::xml_find_first(x, xlsx_name("color", ns), ns), theme, index) 190 | sz <- xlsx_ct_font_size( 191 | xml2::xml_find_first(x, xlsx_name("sz", ns), ns)) 192 | 193 | u <- xlsx_ct_underline_property( 194 | xml2::xml_find_first(x, xlsx_name("u", ns), ns)) 195 | ## This one here is either baseline, superscript or subscript. So 196 | ## probably not terribly useful and fairly confuse-able with 197 | ## _actual_ vertical alignment. 198 | ## vertAlign <- xml2::xml_text( 199 | ## xml2::xml_find_first(x, xlsx_name("vertAlign/@val", ns), ns)) 200 | scheme <- xml2::xml_text( 201 | xml2::xml_find_first(x, xlsx_name("scheme/@val", ns), ns)) 202 | 203 | tibble::tibble(name, family, 204 | b, i, strike, outline, shadow, condense, extend, 205 | color, sz, u, scheme) 206 | } 207 | 208 | ## 18.8.18 family 209 | xlsx_st_font_family <- function(f, missing=NA_character_) { 210 | pos <- c(NA_character_, "Roman", "Swiss", "Modern", "Script", "Decorative", 211 | rep("<>", 9)) 212 | if (inherits(f, "xml_missing")) { 213 | missing 214 | } else { 215 | pos[[as.integer(xml2::xml_attr(f, "val")) + 1L]] 216 | } 217 | } 218 | 219 | ## Used by a bunch of things. The actual definition is on l 3751 of 220 | ## A.2 (p. 3929). Note that the xsd defines that if the element is 221 | ## present but @val is empty it defaults to TRUE. 222 | xlsx_ct_boolean_property <- function(b, missing=FALSE) { 223 | if (inherits(b, "xml_missing")) { 224 | missing 225 | } else { 226 | val <- xml2::xml_attr(b, "val") 227 | if (is.na(val)) TRUE else as.logical(as.integer(val)) 228 | } 229 | } 230 | 231 | ## 18.8.21 fills 232 | xlsx_ct_fills <- function(xml, ns, theme, index) { 233 | process_container(xml, xlsx_name("fills", ns), ns, xlsx_ct_fill, theme, index) 234 | } 235 | 236 | ## 18.8.20 fill 237 | xlsx_ct_fill <- function(x, ns, theme, index) { 238 | ## TODO: In the case where not all of these are "pattern" (i.e., we 239 | ## have a gradient fill) this will not work correctly because we 240 | ## need totally different things here. I think what we'll return 241 | ## there is type=gradient, and then a lookup to a gradient table, so 242 | ## this will expand by one more column with gradient_id perhaps. 243 | 244 | ## The only options here, according to the xsd (A.2, p. 3925, 245 | ## l. 3498) is a single element of patternFill or gradientFill 246 | xk <- xml2::xml_children(x)[[1L]] 247 | if (xml2::xml_name(xk) == "patternFill") { 248 | xlsx_ct_pattern_fill(xk, ns, theme, index) 249 | } else { 250 | xlsx_ct_gradient_fill(xk, ns, theme, index) 251 | } 252 | } 253 | 254 | ## 18.8.32 patternFill 255 | xlsx_ct_pattern_fill <- function(x, ns, theme, index) { 256 | pattern_type <- xml2::xml_attr(x, "patternType") 257 | fg <- xlsx_ct_color(xml2::xml_find_first(x, xlsx_name("fgColor", ns), ns), 258 | theme, index) 259 | bg <- xlsx_ct_color(xml2::xml_find_first(x, xlsx_name("bgColor", ns), ns), 260 | theme, index) 261 | c(type="pattern", pattern_type=pattern_type, fg=fg, bg=bg) 262 | } 263 | 264 | ## 18.8.24 gradientFill 265 | xlsx_ct_gradient_fill <- function(x, ns, theme, index) { 266 | ## zero or more stop elements, plus attributes type, degree, left, 267 | ## right, bottom, all of which are optional. I think that 268 | ## realistically we'll have to dump these into a separate lookup 269 | ## table or something. 270 | ## 271 | ## It will be interesting to see what is used in the main corpus. 272 | ## Even with the terrible things that people do to spreadsheets I'd 273 | ## hope that this is not actually common. 274 | stop("Ignoring gradient fill") 275 | } 276 | 277 | ## 18.8.3 bgColor 278 | ## 18.8.19 fgColor 279 | xlsx_ct_color <- function(x, theme, index) { 280 | if (inherits(x, "xml_missing")) { 281 | NA_character_ 282 | } else { 283 | ## The schema is vague on this point but let's make the assumption 284 | ## that only one of the following is present: 285 | ## auto, indexed, rgb, theme 286 | tmp <- xml2::xml_attrs(x) 287 | types <- c("auto", "indexed", "rgb", "theme") 288 | i <- types %in% names(tmp) 289 | if (!any(i)) { 290 | return(NA_character_) 291 | } 292 | t <- types[i][[1L]] 293 | v <- tmp[[t]] 294 | ## TODO: I can't find any information indicating what "auto" means 295 | ## in this context. The spec says (at least for fgColor in 296 | ## 18.8.19, p. 1757, but similar words are used elsewhere): 297 | ## 298 | ## > auto: A boolean value indicating the color is automatic and 299 | ## > system color dependent. 300 | ## 301 | ## So it probably depends on exactly _where_ the colour is used 302 | ## (e.g. if it tends to be a fg or a bg colour). So I will return 303 | ## "auto" I think, at least for now. Probably I could return 304 | ## "black" but that's going to be quite lossy. This way I can 305 | ## transform into a sensible colour at use. 306 | col <- switch( 307 | t, 308 | auto="auto", 309 | indexed=palette_color(as.integer(v) + 1L, index), 310 | rgb=argb2rgb(v), 311 | theme=palette_color(as.integer(v) + 1L, theme$palette)) 312 | if ("tint" %in% names(tmp)) { 313 | col <- col_apply_tint(col, as.numeric(tmp[["tint"]])) 314 | } 315 | col 316 | } 317 | } 318 | 319 | ## 18.8.5 borders 320 | xlsx_ct_borders <- function(xml, ns, theme, index) { 321 | process_container(xml, xlsx_name("borders", ns), ns, 322 | xlsx_ct_border, theme, index) 323 | } 324 | 325 | ## 18.8.4 border 326 | ## 327 | ## See also 328 | ## * 18.8.5 (p. 1750) 329 | ## * A.2 l. 3460 (p. 3924) 330 | ## 331 | ## Unfortunately, note that the xsd talks about start / end but the 332 | ## *example* has begin / end. And neither of them indicates what on 333 | ## earth these are for (though the text in the example suggests that 334 | ## end is the right border in that context). In the sheets I am 335 | ## looking at I mostly see left / right / top / bottom / diagonal. 336 | xlsx_ct_border <- function(x, ns, theme, index) { 337 | ## NOTE: I am skipping attributes diagonalUp and diagonalDown along 338 | ## with the element diagonal - it's not the only bit of formatting 339 | ## trivia we won't handle, but it's a fairly unusual thing to see, I 340 | ## believe. 341 | outline <- attr_bool(xml2::xml_attr(x, "outline"), FALSE) 342 | 343 | f <- function(path) { 344 | xlsx_ct_border_pr(xml2::xml_find_first(x, path, ns), ns, theme, index) 345 | } 346 | 347 | tmp <- list(list(outline = outline), 348 | start = f(xlsx_name("start", ns)), 349 | end = f(xlsx_name("end", ns)), 350 | left = f(xlsx_name("left", ns)), 351 | right = f(xlsx_name("right", ns)), 352 | top = f(xlsx_name("top", ns)), 353 | bottom = f(xlsx_name("bottom", ns))) 354 | tmp <- unlist(tmp, FALSE) 355 | names(tmp) <- sub(".", "_", names(tmp), fixed=TRUE) 356 | tibble::as_tibble(tmp) 357 | } 358 | 359 | ## style (ST_BorderStyle) can be one of (18.18.3, p. 2428): 360 | ## 361 | ## * dashDot 362 | ## * dashDotDot 363 | ## * dashed 364 | ## * dotted 365 | ## * double 366 | ## * hair 367 | ## * medium 368 | ## * mediumDashDot 369 | ## * mediumDashDotDot 370 | ## * mediumDashed 371 | ## * none 372 | ## * slantDashDot 373 | ## * thick 374 | ## * thin 375 | ## 376 | ## Note that the various combinations do not cross with one another. 377 | ## 378 | ## This handles: 379 | ## * 18.8.6 bottom 380 | ## * 18.8.16 end 381 | ## * 18.8.37 start 382 | ## * 18.8.43 top 383 | ## as well as left and right which aren't given section numbers in the spec. 384 | xlsx_ct_border_pr <- function(x, ns, theme, index) { 385 | present <- !inherits(x, "xml_missing") 386 | if (present) { 387 | style <- xml2::xml_attr(x, "style") 388 | color <- xlsx_ct_color( 389 | xml2::xml_find_first(x, xlsx_name("color", ns), ns), theme, index) 390 | } else { 391 | color <- style <- NA_character_ 392 | } 393 | list(present=present, style=style, color=color) 394 | } 395 | 396 | ## 18.8.9 cellStyleXfs 397 | xlsx_ct_cell_style_xfs <- function(xml, ns) { 398 | process_container(xml, xlsx_name("cellStyleXfs", ns), ns, xlsx_ct_xf, 399 | classes=TRUE) 400 | } 401 | 402 | ## 18.8.10 cellXfs 403 | xlsx_ct_cell_xfs <- function(xml, ns) { 404 | process_container(xml, xlsx_name("cellXfs", ns), ns, xlsx_ct_xf) 405 | } 406 | 407 | ## 18.8.45 xf (format) 408 | xlsx_ct_xf <- function(x, ns) { 409 | at <- xml_attrs_list(x) 410 | 411 | ## Booleans, indicating if things are applied: 412 | apply_border <- attr_bool(at$applyBorder, FALSE) 413 | apply_fill <- attr_bool(at$applyFill, FALSE) 414 | apply_font <- attr_bool(at$applyFont, FALSE) 415 | apply_number_format <- attr_bool(at$applyNumberFormat, FALSE) 416 | 417 | apply_alignment <- attr_bool(at$applyAlignment, FALSE) 418 | 419 | `%&&%` <- function(a, b) { 420 | if (isTRUE(a)) b else NA_integer_ 421 | } 422 | 423 | xf <- tibble::tibble( 424 | border_id = apply_border %&&% attr_integer(at$borderId) + 1L, 425 | fill_id = apply_fill %&&% attr_integer(at$fillId) + 1L, 426 | font_id = apply_font %&&% attr_integer(at$fontId) + 1L, 427 | num_fmt_id = apply_number_format %&&% attr_integer(at$numFmtId) + 1L, 428 | 429 | ## Not really sure about these, but they don't hurt to keep around: 430 | pivot_button = attr_bool(at$pivotButton, FALSE), 431 | quote_prefix = attr_bool(at$quotePrefix, FALSE), 432 | apply_protection = attr_bool(at$applyProtection, FALSE), 433 | 434 | ## This is a reference against cellStyleXfs 435 | xf_id = attr_integer(at$xfId) + 1L) 436 | 437 | if (is.null(x)) { 438 | alignment <- xlsx_ct_alignment(NULL, ns) 439 | } else { 440 | alignment <- xlsx_ct_alignment( 441 | xml2::xml_find_first(x, xlsx_name("alignment", ns), ns)) 442 | if (!isTRUE(apply_alignment)) { 443 | alignment[] <- lapply(alignment, as_na) 444 | } 445 | } 446 | cbind(xf, alignment) 447 | } 448 | 449 | ## 18.8.1 alignment 450 | ## 451 | ## horizontal: center | centerContinuous | distributed | fill | 452 | ## general | justify | right 453 | ## 454 | ## vertical: bottom | center | distributed | justify | top 455 | xlsx_ct_alignment <- function(x, ns) { 456 | at <- xml_attrs_list(x) 457 | tibble::tibble( 458 | horizontal=attr_character(at$horizontal), 459 | vertical=attr_character(at$vertical), 460 | indent=attr_integer(at$indent), 461 | justify_last_line=attr_bool(at$justifyLastLine, FALSE), 462 | reading_order=attr_integer(at$readingOrder), 463 | ## relativeIndent [used only in a dxf element] 464 | shrink_to_fit=attr_bool(at$shrinkToFit, FALSE), 465 | text_rotation=attr_integer(at$text_rotation), 466 | text_wrap=attr_bool(at$textWrap, FALSE)) 467 | } 468 | 469 | ## 18.8.8 cellStyles 470 | xlsx_ct_cell_styles <- function(xml, ns) { 471 | process_container(xml, xlsx_name("cellStyles", ns), ns, xlsx_ct_cell_style, 472 | classes = TRUE) 473 | } 474 | 475 | ## 18.8.7 cellStyle 476 | xlsx_ct_cell_style <- function(x, ns) { 477 | ## NOTE: Getting this right is really hard because the Annex (G.2) 478 | ## lists information about "built-in" styles but these vary with all 479 | ## things like row position, but no actual information about the 480 | ## styles is given in the annex. So it's not really obvious what we 481 | ## can do here. 482 | 483 | ## NOTE: This element can contain "extension list" elements which 484 | ## are reserved for future use. But we can skip that. 485 | 486 | ## NOTE: xfId: Zero-based index referencing an xf record in the 487 | ## cellStyleXfs collection. This is used to determine the formatting 488 | ## defined for this named cell style (this is converted to base1 for 489 | ## use in R). 490 | 491 | at <- xml_attrs_list(x) 492 | tibble::tibble( 493 | builtin_id = attr_integer(at$builtinId) + 1L, 494 | custom_builtin = attr_bool(at$customBuiltin, FALSE), 495 | hidden = attr_bool(at$hidden, FALSE), 496 | i_level = attr_integer(at$iLevel), 497 | name = attr_character(at$name), 498 | xf_id = attr_integer(at$xfId) + 1L) 499 | } 500 | 501 | ## 18.8.31 numFmts 502 | xlsx_ct_num_fmts <- function(xml, ns) { 503 | classes <- c(num_format_id="integer", format_code="character") 504 | process_container(xml, xlsx_name("numFmts", ns), ns, 505 | xlsx_ct_num_fmt, classes = classes) 506 | } 507 | 508 | ## 18.8.30 numFmt 509 | xlsx_ct_num_fmt <- function(x, ns) { 510 | at <- as.list(xml2::xml_attrs(x)) 511 | tibble::tibble( 512 | num_format_id = attr_integer(at$numFmtId) + 1L, 513 | format_code = attr_character(at$formatCode)) 514 | } 515 | 516 | ## Below here is bits that may move around a bit; code for processing 517 | ## things out into values that R can understand, mostly for colours. 518 | ## We need to do the number formatting thing soon too. 519 | 520 | ## These come from the ECMA Open XML definition, p 1763 (18.8.27). 521 | ## The spec describes this as a "legacy indexing scheme for colors 522 | ## that is still required for some records, and for backwards 523 | ## compatibility with legacy formats" but this seems to be far more 524 | ## widespread than that (and from things generated with Microsoft's 525 | ## current software I think). 526 | ## 527 | ## Indecies 64 and 65 (the 65th and 66th elements) should be treated 528 | ## specially as system foreground and background colour respectively, but 529 | xlsx_indexed_cols <- function() { 530 | c("#000000", "#FFFFFF", "#FF0000", "#00FF00", "#0000FF", 531 | "#FFFF00", "#FF00FF", "#00FFFF", "#000000", "#FFFFFF", 532 | "#FF0000", "#00FF00", "#0000FF", "#FFFF00", "#FF00FF", 533 | "#00FFFF", "#800000", "#008000", "#000080", "#808000", 534 | "#800080", "#008080", "#C0C0C0", "#808080", "#9999FF", 535 | "#993366", "#FFFFCC", "#CCFFFF", "#660066", "#FF8080", 536 | "#0066CC", "#CCCCFF", "#000080", "#FF00FF", "#FFFF00", 537 | "#00FFFF", "#800080", "#800000", "#008080", "#0000FF", 538 | "#00CCFF", "#CCFFFF", "#CCFFCC", "#FFFF99", "#99CCFF", 539 | "#FF99CC", "#CC99FF", "#FFCC99", "#3366FF", "#33CCCC", 540 | "#99CC00", "#FFCC00", "#FF9900", "#FF6600", "#666699", 541 | "#969696", "#003366", "#339966", "#003300", "#333300", 542 | "#993300", "#993366", "#333399", "#333333", 543 | ## Special: 544 | "black", "white") 545 | } 546 | 547 | ## See 18.8.30, p. 1767 548 | xlsx_format_codes <- function() { 549 | ## "Ids not specified in the listing, such as 5, 6, 7, and 8, shall 550 | ## follow the number format specified by the formatCode attribute." 551 | c("General", 552 | "0", 553 | "0.00", 554 | "#,##0", 555 | "#,##0.00", 556 | ## missing 4-8 incl 557 | rep(NA, length(4:8)), 558 | "0%", 559 | "0.00%", 560 | "0.00E+00", 561 | "# ?/?", 562 | "# ??/??", 563 | "mm-dd-yy", 564 | "d-mmm-yy", 565 | "d-mmm mmm-yy", 566 | "h:mm AM/PM", 567 | "h:mm:ss AM/PM", 568 | "h:mm", 569 | "h:mm:ss", 570 | "m/d/yy h:mm", 571 | ## missing 23-36 incl 572 | rep(NA, length(23:36)), 573 | "#,##0 ;(#,##0)", 574 | "#,##0 ;[Red](#,##0)", 575 | "#,##0.00;(#,##0.00)", 576 | "#,##0.00;[Red](#,##0.00)", 577 | ## missing 41-44 incl 578 | rep(NA, length(41:44)), 579 | "mm:ss", 580 | "[h]:mm:ss", 581 | "mmss.0", 582 | "##0.0E+0", 583 | "@") 584 | } 585 | 586 | ## See 18.18.55, p. 2462 587 | xlsx_pattern_type <- function() { 588 | c(## Can process these two 589 | "none", # ignores both fgColor and bgColor 590 | "solid", # renders only the fgColor 591 | ## but not these: 592 | "darkDown", 593 | "darkGray", 594 | "darkGrid", 595 | "darkHorizontal", 596 | "darkTrellis", 597 | "darkUp", 598 | "darkVertical", 599 | "gray0625", 600 | "gray125", 601 | "lightDown", 602 | "lightGray", 603 | "lightGrid", 604 | "lightHorizontal", 605 | "lightTrellis", 606 | "lightUp", 607 | "lightVertical", 608 | "mediumGray") 609 | } 610 | 611 | ## 18.8.27 indexedColors (Color Indexes) 612 | xlsx_ct_indexed_colors <- function(xml, ns) { 613 | ## TODO: I should replace this with the full correct path. 614 | xpath <- paste0("//", xlsx_name("indexedColors", ns)) 615 | indexed_colors <- xml2::xml_find_first(xml, xpath, ns) 616 | if (inherits(indexed_colors, "xml_missing")) { 617 | indexed <- xlsx_indexed_cols() 618 | } else { 619 | ## NOTE, it seems here that "00" is used for full opacity, which 620 | ## is charming. 621 | indexed <- vcapply(xml2::xml_children(indexed_colors), 622 | xlsx_ct_rgbcolor, "00") 623 | if (length(indexed) == 64) { 624 | indexed <- c(indexed, "black", "white") 625 | } 626 | } 627 | indexed 628 | } 629 | 630 | ## 18.8.34 rgbColor (RGB Color) 631 | ## 632 | ## NOTE: the spec is unfortunately a little vague about the 633 | ## interpretation of the alpha channel; in the example colours 634 | ## (p. 1763) they use 00 to indicate opacity but empirically (and 635 | ## conventionally) FF is used. 636 | xlsx_ct_rgbcolor <- function(x, opaque="FF") { 637 | argb2rgb(xml2::xml_attr(x, "rgb"), opaque) 638 | } 639 | 640 | palette_color <- function(i, pal, err="black") { 641 | if (i > length(pal)) err else pal[[i]] 642 | } 643 | -------------------------------------------------------------------------------- /R/read_workbook.R: -------------------------------------------------------------------------------- 1 | xlsx_read_workbook <- function(path) { 2 | ## TODO: Consider what do do when rels is NULL; do we throw? 3 | rels <- xlsx_read_rels(path, "xl/workbook.xml") 4 | xml <- xlsx_read_file(path, "xl/workbook.xml") 5 | ns <- xml2::xml_ns(xml) 6 | 7 | defined_names <- xlsx_ct_external_defined_names(xml, ns) 8 | sheets <- xlsx_ct_sheets(xml, ns, rels) 9 | 10 | list(rels=rels, sheets=sheets, defined_names=defined_names) 11 | } 12 | 13 | xlsx_namespace <- function(ns) { 14 | url <- "http://schemas.openxmlformats.org/spreadsheetml/2006/main" 15 | names(ns)[[match(url, ns)]] 16 | } 17 | xlsx_name <- function(name, ns) { 18 | paste0(xlsx_namespace(ns), ":", name) 19 | } 20 | 21 | ## 18.2.20 sheets 22 | xlsx_ct_sheets <- function(xml, ns, rels) { 23 | ## Apparently new xml2 has some facilities for dealing with 24 | ## namespaces which might make this easier. Or break everything in 25 | ## here. Or perhaps a little of both. 26 | dat <- process_container(xml, xlsx_name("sheets", ns), ns, xlsx_ct_sheet) 27 | 28 | if (is.null(rels)) { 29 | stop("FIXME") 30 | } else { 31 | i <- match(dat$ref, rels$id) 32 | dat <- cbind(dat, rels[i, -1L]) 33 | } 34 | 35 | dat 36 | } 37 | 38 | ## 18.2.19 sheet 39 | xlsx_ct_sheet <- function(xml, ns) { 40 | at <- as.list(xml2::xml_attrs(xml)) 41 | tibble::tibble( 42 | name = attr_character(at$name), 43 | sheet_id = attr_integer(at$sheetId), 44 | state = attr_character(at$state, "visible"), 45 | ref = attr_character(at[["id"]])) 46 | } 47 | 48 | ## 18.14.6 definedName 49 | xlsx_ct_external_defined_names <- function(xml, ns) { 50 | process_container(xml, xlsx_name("definedNames", ns), ns, 51 | xlsx_ct_external_defined_name, classes=TRUE) 52 | } 53 | 54 | ## 18.14.5 definedName 55 | xlsx_ct_external_defined_name <- function(xml, ns) { 56 | at <- xml_attrs_list(xml) 57 | tibble::tibble( 58 | name = attr_character(at$name), 59 | refers_to = attr_character(at$refersTo), 60 | sheet_id = attr_integer(at$sheetId)) 61 | } 62 | -------------------------------------------------------------------------------- /R/read_worksheet.R: -------------------------------------------------------------------------------- 1 | ## [ ] 18.3.1.1 anchor (Object Cell Anchor) 2 | ## [ ] 18.3.1.2 autoFilter (AutoFilter Settings) 3 | ## [ ] 18.3.1.3 brk (Break) 4 | ## [x] 18.3.1.4 c (Cell) -- xlsx_ct_cell 5 | ## [ ] 18.3.1.5 cellSmartTag (Cell Smart Tag) 6 | ## [ ] 18.3.1.6 cellSmartTagPr (Smart Tag Properties) 7 | ## [ ] 18.3.1.7 cellSmartTags (Cell Smart Tags) 8 | ## [ ] 18.3.1.8 cellWatch (Cell Watch Item) 9 | ## [ ] 18.3.1.9 cellWatches (Cell Watch Items) 10 | ## [ ] 18.3.1.10 cfRule (Conditional Formatting Rule) 11 | ## [ ] 18.3.1.11 cfvo (Conditional Format Value Object) 12 | ## [ ] 18.3.1.12 chartsheet (Chart Sheet) 13 | ## [x] 18.3.1.13 col (Column Width & Formatting) -- xlsx_ct_cols 14 | ## [ ] 18.3.1.14 colBreaks (Vertical Page Breaks) 15 | ## [ ] 18.3.1.15 color (Data Bar Color) 16 | ## [ ] 18.3.1.16 colorScale (Color Scale) 17 | ## [x] 18.3.1.17 cols (Column Information) -- xlsx_ct_cols 18 | ## [ ] 18.3.1.18 conditionalFormatting (Conditional Formatting) 19 | ## [ ] 18.3.1.19 control (Embedded Control) 20 | ## [ ] 18.3.1.20 controlPr (Embedded Control Properties) 21 | ## [ ] 18.3.1.21 controls (Embedded Controls) 22 | ## [ ] 18.3.1.22 customPr (Custom Property) 23 | ## [ ] 18.3.1.23 customProperties (Custom Properties) 24 | ## [ ] 18.3.1.24 customSheetView (Custom Chart Sheet View) 25 | ## [ ] 18.3.1.25 customSheetView (Custom Sheet View) 26 | ## [ ] 18.3.1.26 customSheetViews (Custom Chart Sheet Views) 27 | ## [ ] 18.3.1.27 customSheetViews (Custom Sheet Views) 28 | ## [ ] 18.3.1.28 dataBar (Data Bar) 29 | ## [ ] 18.3.1.29 dataConsolidate (Data Consolidate) 30 | ## [ ] 18.3.1.30 dataRef (Data Consolidation Reference) 31 | ## [ ] 18.3.1.31 dataRefs (Data Consolidation References) 32 | ## [ ] 18.3.1.32 dataValidation (Data Validation) 33 | ## [ ] 18.3.1.33 dataValidations (Data Validations) 34 | ## [ ] 18.3.1.34 dialogsheet (Dialog Sheet) 35 | ## [ ] 18.3.1.35 dimension (Worksheet Dimensions) 36 | ## [ ] 18.3.1.36 drawing (Drawing) 37 | ## [ ] 18.3.1.37 drawingHF (Drawing Reference in Header Footer) 38 | ## [ ] 18.3.1.38 evenFooter (Even Page Footer) 39 | ## [ ] 18.3.1.39 evenHeader (Even Page Header) 40 | ## [ ] 18.3.1.40 f (Formula) 41 | ## [ ] 18.3.1.41 firstFooter (First Page Footer) 42 | ## [ ] 18.3.1.42 firstHeader (First Page Header) 43 | ## [ ] 18.3.1.43 formula (Formula) 44 | ## [ ] 18.3.1.44 formula1 (Formula 1) 45 | ## [ ] 18.3.1.45 formula2 (Formula 2) 46 | ## [ ] 18.3.1.46 headerFooter (Header Footer Settings) 47 | ## [ ] 18.3.1.47 hyperlink (Hyperlink) 48 | ## [ ] 18.3.1.48 hyperlinks (Hyperlinks) 49 | ## [ ] 18.3.1.49 iconSet (Icon Set) 50 | ## [ ] 18.3.1.50 ignoredError (Ignored Error) 51 | ## [ ] 18.3.1.51 ignoredErrors (Ignored Errors) 52 | ## [ ] 18.3.1.52 inputCells (Input Cells) 53 | ## [ ] 18.3.1.53 is (Rich Text Inline) 54 | ## [x] 18.3.1.54 mergeCell (Merged Cell) -- xlsx_ct_merge_cell 55 | ## [x] 18.3.1.55 mergeCells (Merge Cells) -- xlsx_read_merged 56 | ## [ ] 18.3.1.56 objectPr (Embedded Object Properties) 57 | ## [ ] 18.3.1.57 oddFooter (Odd Page Footer) 58 | ## [ ] 18.3.1.58 oddHeader (Odd Header) 59 | ## [ ] 18.3.1.59 oleObject (Embedded Object) 60 | ## [ ] 18.3.1.60 oleObjects (Embedded Objects) 61 | ## [ ] 18.3.1.61 outlinePr (Outline Properties) 62 | ## [ ] 18.3.1.62 pageMargins (Page Margins) 63 | ## [ ] 18.3.1.63 pageSetup (Page Setup Settings) 64 | ## [ ] 18.3.1.64 pageSetup (Chart Sheet Page Setup) 65 | ## [ ] 18.3.1.65 pageSetUpPr (Page Setup Properties) 66 | ## [x] 18.3.1.66 pane (View Pane) -- xlsx_ct_pane 67 | ## [ ] 18.3.1.67 picture (Background Image) 68 | ## [ ] 18.3.1.68 pivotArea (Pivot Area) 69 | ## [ ] 18.3.1.69 pivotSelection (PivotTable Selection) 70 | ## [ ] 18.3.1.70 printOptions (Print Options) 71 | ## [ ] 18.3.1.71 protectedRange (Protected Range) 72 | ## [ ] 18.3.1.72 protectedRanges (Protected Ranges) 73 | ## [ ] 18.3.1.73 row (Row) 74 | ## [ ] 18.3.1.74 rowBreaks (Horizontal Page Breaks (Row)) 75 | ## [ ] 18.3.1.75 scenario (Scenario) 76 | ## [ ] 18.3.1.76 scenarios (Scenarios) 77 | ## [ ] 18.3.1.77 securityDescriptor (Security Descriptor) 78 | ## [ ] 18.3.1.78 selection (Selection) 79 | ## [ ] 18.3.1.79 sheetCalcPr (Sheet Calculation Properties) 80 | ## [x] 18.3.1.80 sheetData (Sheet Data) -- xlsx_read_sheet_data 81 | ## [ ] 18.3.1.81 sheetFormatPr (Sheet Format Properties) 82 | ## [ ] 18.3.1.82 sheetPr (Sheet Properties) 83 | ## [ ] 18.3.1.83 sheetPr (Chart Sheet Properties) 84 | ## [ ] 18.3.1.84 sheetProtection (Chart Sheet Protection) 85 | ## [ ] 18.3.1.85 sheetProtection (Sheet Protection Options) 86 | ## [-] 18.3.1.86 sheetView (Chart Sheet View) 87 | ## [x] 18.3.1.87 sheetView (Worksheet View) -- xlsx_ct_worksheet_view 88 | ## [x] 18.3.1.88 sheetViews (Sheet Views) -- xlsx_ct_worksheet_views 89 | ## [-] 18.3.1.89 sheetViews (Chart Sheet Views) 90 | ## [ ] 18.3.1.90 smartTags (Smart Tags) 91 | ## [ ] 18.3.1.91 sortCondition (Sort Condition) 92 | ## [ ] 18.3.1.92 sortState (Sort State) 93 | ## [ ] 18.3.1.93 tabColor (Sheet Tab Color) 94 | ## [ ] 18.3.1.94 tablePart (Table Part) 95 | ## [ ] 18.3.1.95 tableParts (Table Parts) 96 | ## [ ] 18.3.1.96 v (Cell Value) 97 | ## [ ] 18.3.1.97 webPublishItem (Web Publishing Item) 98 | ## [ ] 18.3.1.98 webPublishItems (Web Publishing Items) 99 | ## [ ] 18.3.1.99 worksheet (Worksheet) 100 | 101 | ## 18.3.1.17 cols 102 | xlsx_ct_cols <- function(xml, ns) { 103 | classes <- c(best_fit="logical", collapsed="logical", 104 | custom_width="logical", hidden="logical", 105 | min="integer", max="integer", outline_level="integer", 106 | style="integer", width="numeric") 107 | process_container(xml, xlsx_name("cols", ns), ns, xlsx_ct_col, 108 | classes=classes) 109 | } 110 | 111 | ## NOTE: width is a funny measurement that is a hybrid of pixels and 112 | ## character width: 113 | ## 114 | ## > Column width measured as the number of characters of the maximum 115 | ## > digit width of the numbers 0, 1, 2, ..., 9 as rendered in the 116 | ## > normal style's font. There are 4 pixels of margin padding (two on 117 | ## > each side), plus 1 pixel padding for the gridlines. 118 | ## 119 | ## But the actual calculation of how much space there is will vary 120 | ## with things like the border and padding thickness, boldness, etc. 121 | xlsx_ct_col <- function(xml, ns) { 122 | at <- as.list(xml2::xml_attrs(xml)) 123 | tibble::tibble( 124 | best_fit = attr_bool(at$bestFit, FALSE), 125 | collapsed = attr_bool(at$collapsed, FALSE), 126 | custom_width = attr_bool(at$customWidth, FALSE), 127 | hidden = attr_bool(at$hidden, FALSE), 128 | min = attr_integer(at$min), 129 | max = attr_integer(at$max), 130 | outline_level = attr_integer(at$outlineLevel), 131 | ## phonetic = attr_bool(at$phonetic), 132 | style = attr_integer(at$style), # for new cols only 133 | width = attr_numeric(at$width)) 134 | } 135 | 136 | ## 18.3.1.55 mergeCells (Merge Cells) 137 | xlsx_read_merged <- function(xml, ns) { 138 | merged <- xml2::xml_children( 139 | xml2::xml_find_first(xml, xlsx_name("mergeCells", ns), ns)) 140 | lapply(merged, xlsx_ct_merge_cell) 141 | } 142 | 143 | ## 18.3.1.54 mergeCell (Merged Cell) 144 | xlsx_ct_merge_cell <- function(x) { 145 | cellranger::as.cell_limits(xml2::xml_attr(x, "ref")) 146 | } 147 | 148 | ## 18.3.1.80 sheetData (Sheet Data) 149 | xlsx_read_sheet_data <- function(xml, ns, strings) { 150 | rows <- xml2::xml_children( 151 | xml2::xml_find_first(xml, xlsx_name("sheetData", ns), ns)) 152 | dat <- lapply(rows, xlsx_ct_row, ns, strings) 153 | 154 | cells <- rbind_df(unlist(lapply(dat, "[[", "cells"), FALSE), 155 | c(ref="character", style="integer", type="character", 156 | formula="character", value="list")) 157 | rows <- rbind_df(lapply(dat, "[[", "row"), 158 | vcapply(xlsx_ct_row(NULL)$row, storage.mode)) 159 | 160 | list(rows=rows, cells=cells) 161 | } 162 | 163 | ## 18.3.1.73 row (Row) 164 | xlsx_ct_row <- function(xml, ns, strings) { 165 | ## NOTE: the r attribute is optional and the 'r' attribute is 166 | ## optional so it seems possible that we could end up unable to 167 | ## determine where rows and cells are? Unless the the fact that 168 | ## rows are an xsd:sequence comes in to help? 169 | at <- xml_attrs_list(xml) 170 | 171 | row <- list( 172 | r = attr_integer(at$r), 173 | spans = attr_character(at$spans), 174 | s = attr_integer(at[["s"]]), # style 175 | custom_format = attr_bool(at$customFormat, FALSE), 176 | ht = attr_numeric(at$ht), # height -- vs that uses "width" :-/ 177 | hidden = attr_bool(at$hidden, FALSE), 178 | custom_height = attr_bool(at$customHeight), 179 | outline_level = attr_integer(at$outlineLevel), 180 | collapsed = attr_bool(at$collapsed, FALSE), 181 | ## ph = attr_bool(at$ph, FALSE)) 182 | thick_top = attr_bool(at$thickTop, FALSE), 183 | thick_bot = attr_bool(at$thickBot, FALSE)) 184 | 185 | if (is.null(xml)) { 186 | cells <- list() 187 | } else { 188 | cells <- lapply(xml2::xml_find_all(xml, xlsx_name("c", ns), ns), 189 | xlsx_ct_cell, ns, strings) 190 | } 191 | 192 | list(row=row, cells=cells) 193 | } 194 | 195 | ## 18.3.1.4 c (Cell) 196 | ## 197 | ## > While a cell can have a formula element f and a value element v, 198 | ## > when the cell's type t is inlineStr then only the element is is 199 | ## > allowed as a child element. 200 | ## 201 | ## Ignoring attributes: 202 | ## * cm metadata 203 | ## * vm metadat 204 | ## * ph phonetic information 205 | ## 206 | ## Cell types are enumerated in 18.18.11: 207 | ## 208 | ## b: Boolean 209 | ## d: Date (in the ISO 8601 format) 210 | ## e: Error 211 | ## inlineStr: Inline String (i.e., one not in the shared string 212 | ## table. If this cell type is used, then the cell value 213 | ## is in the is element rather than the v element in the 214 | ## cell (c element).) 215 | ## n: Number 216 | ## s: Shared String 217 | ## str: String (a formula string) 218 | xlsx_ct_cell <- function(xml, ns, strings) { 219 | at <- as.list(xml2::xml_attrs(xml)) 220 | 221 | type <- attr_character(at$t) 222 | if (identical(type, "inlineStr")) { # avoid missingness 223 | formula <- NA_character_ 224 | ## value here _must_ be present, so no conditional (vs below) 225 | value <- xlsx_ct_rst(xml2::xml_find_first(xml, xlsx_name("is", ns), ns), ns) 226 | } else { 227 | formula <- xml2::xml_text(xml2::xml_find_first(xml, xlsx_name("f", ns), ns)) 228 | v <- xml2::xml_find_first(xml, xlsx_name("v", ns), ns) 229 | value <- if (inherits(v, "xml_missing")) NULL else xml2::xml_text(v) 230 | 231 | ## String substitutions from the string table: 232 | if (identical(type, "s")) { 233 | value <- strings[as.integer(unlist(value)) + 1L] 234 | } else if (identical(type, "b")) { 235 | ## TODO: This is probably worth doing? 236 | ## value <- as.logical(as.integer(value)) 237 | value <- as.numeric(value) 238 | } else if (!is.null(value) && 239 | (is.na(type) || identical(type, "n") || identical(type, "d"))) { 240 | ## TODO: Consider being more careful with the 'd' class here? 241 | value <- as.numeric(value) 242 | } 243 | } 244 | 245 | list( 246 | ref = attr_character(at$r), 247 | style = attr_integer(at$s) + 1L, 248 | type = type, 249 | formula = formula, 250 | value = value) 251 | } 252 | 253 | ## 18.3.1.88 sheetViews 254 | xlsx_ct_worksheet_views <- function(xml, ns) { 255 | xpath <- sprintf("./%s/%s", 256 | xlsx_name("sheetViews", ns), xlsx_name("sheetView", ns)) 257 | els <- xml2::xml_find_all(xml, xpath, ns) 258 | if (length(els) == 0L) { 259 | NULL 260 | } else if (length(els) == 1L) { 261 | xlsx_ct_worksheet_view(els[[1L]], ns) 262 | } else { 263 | tmp <- lapply(els, xlsx_ct_worksheet_view, ns) 264 | empty <- vlapply(tmp, is.null) 265 | if (all(empty)) { 266 | NULL 267 | } else { 268 | ## TODO: check the Enron corpus; 269 | ## larry_campbell__21047__EOTT tanks tx NM.xlsx 270 | ## for a file where this is the case. It needs opening in Excel 271 | ## to see what this resolves as. Given we largely ignore these, 272 | ## we'll take the first for now. 273 | tmp[[which(!empty)[[1L]]]] 274 | } 275 | } 276 | } 277 | 278 | ## 18.3.1.87 sheetView 279 | xlsx_ct_worksheet_view <- function(xml, ns) { 280 | pane <- xml2::xml_find_first(xml, xlsx_name("pane", ns), ns) 281 | if (inherits(pane, "xml_missing")) NULL else xlsx_ct_pane(pane) 282 | } 283 | 284 | ## 18.3.1.66 pane (View Pane) 285 | xlsx_ct_pane <- function(xml, ns) { 286 | at <- xml_attrs_list(xml) 287 | ## x_split: Horizontal position of the split, in 1/20th of a point; 288 | ## 0 if none. If the pane is frozen, this value indicates the 289 | ## number of columns visible in the top pane. 290 | ## y_split: Vertical position of the split, in 1/20th of a point; 291 | ## 0 if none. If the pane is frozen, this value indicates the 292 | ## number of columns visible in the left pane. 293 | ## state: one of frozen / frozenSplit / split 294 | state <- attr_character(at$state) 295 | list(x_split = attr_numeric(at$xSplit, 0), 296 | y_split = attr_numeric(at$ySplit, 0), 297 | top_left = attr_character(at$topLeftCell), 298 | state = state, 299 | frozen = state != "split") 300 | } 301 | -------------------------------------------------------------------------------- /R/readxl.R: -------------------------------------------------------------------------------- 1 | ##' Read an Excel spreadsheet the same way as readxl, but slower. 2 | ##' Assumes a well behaved table of data. 3 | ##' @title Read an Excel spreadsheet like readxl 4 | ##' @param path Path to the xlsx file 5 | ##' @param sheet Sheet name or an integer 6 | ##' @param col_names TRUE (the default) indicating we should use the 7 | ##' first row as column names, FALSE, indicating we should generate 8 | ##' names (X1, X2, ..., Xn) or a character vector of names to apply. 9 | ##' @param col_types Either NULL (the default) indicating we should 10 | ##' guess the column types or a vector of column types (values must 11 | ##' be "blank", "numeric", "date" or "text"). 12 | ##' @param na Values indicating missing values (if different from 13 | ##' blank). Not yet used. 14 | ##' @param skip Number of rows to skip. 15 | ##' @export 16 | rexcel_readxl <- function(path, sheet=1L, col_names=TRUE, 17 | col_types=NULL, na="", skip=0) { 18 | dat <- rexcel_read(path, sheet) 19 | linen::worksheet_to_table(dat, col_names, col_types, na, skip) 20 | } 21 | -------------------------------------------------------------------------------- /R/rexcel-package.r: -------------------------------------------------------------------------------- 1 | #' rexcel. 2 | #' 3 | #' @name rexcel 4 | #' @docType package 5 | NULL 6 | -------------------------------------------------------------------------------- /R/shared_strings.R: -------------------------------------------------------------------------------- 1 | ## Shared string table: 2 | ## 3 | ## [-] 18.4.1 charset (Character Set) (ignored for now) 4 | ## [x] 18.4.2 outline (Outline) -- xlsx_ct_boolean_property 5 | ## [-] 18.4.3 phoneticPr (Phonetic Properties) 6 | ## [x] 18.4.4 r (Rich Text Run) -- xlsx_ct_rst 7 | ## [-] 18.4.5 rFont (Font) 8 | ## [-] 18.4.6 rPh (Phonetic Run) 9 | ## [-] 18.4.7 rPr (Run Properties) 10 | ## [x] 18.4.8 si (String Item) -- xlsx_ct_rst 11 | ## [x] 18.4.9 sst (Shared String Table) -- xlsx_read_shared_strings 12 | ## [x] 18.4.10 strike (Strike Through) -- xlsx_ct_boolean_property 13 | ## [x] 18.4.11 sz (Font Size) -- xlsx_ct_font_size 14 | ## [x] 18.4.12 t (Text) -- in xlsx_ct_rst 15 | ## [x] 18.4.13 u (Underline) -- xlsx_ct_underline_property 16 | ## [-] 18.4.14 vertAlign (Vertical Alignment) (ignored for now) 17 | 18 | ## If the format is / then we can just take the text values. 19 | ## Otherwise we'll have to parse out the RTF strings separately. 20 | xlsx_read_shared_strings <- function(path) { 21 | xml <- xlsx_read_file_if_exists(path, "xl/sharedStrings.xml") 22 | if (is.null(xml)) { 23 | return(character(0)) 24 | } 25 | vcapply(xml2::xml_children(xml), xlsx_ct_rst, xml2::xml_ns(xml)) 26 | } 27 | 28 | ## 18.4.8 si 29 | ## 30 | ## This is the core function that reads a string item (si). The spec 31 | ## is a bit vague on this, but it seems most likely that the element 32 | ## can contain either a 't' or a bunch of 'r' elements, but not both. 33 | ## 34 | ## NOTE: Ignoring rPh and phoneticPr which might be part of this 35 | ## element. Terribly anglocentric :( 36 | xlsx_ct_rst <- function(x, ns) { 37 | t <- xml2::xml_find_first(x, xlsx_name("t", ns), ns) 38 | r <- xml2::xml_find_all(x, xlsx_name("r", ns), ns) 39 | if (length(r) == 0L) { 40 | ## Treat as plain text. 41 | ## 18.4.12 t -- ST_Xstring 42 | ## 43 | ## The only complication here is that we *might* contain the flag: 44 | ## xml:space which is a W3C defined thing indicating if whitespace 45 | ## is relevant. 46 | str <- xml2::xml_text(t) 47 | } else { 48 | ## NOTE: we totally ignore sub-string formatting. 49 | str <- if (inherits(t, "xml_missing")) "" else xml2::xml_text(t) 50 | if (length(r) > 0L) { 51 | str <- paste(c(str, xml2::xml_text( 52 | xml2::xml_find_all(r, xlsx_name("t", ns), ns))), 53 | collapse="") 54 | } 55 | } 56 | 57 | ## NOTE: I am still getting slightly different line endings to 58 | ## readxl because I need to convert \n -> \r\n to match 59 | ## 60 | ## Unescape the strings (ST_Xstring) See 22.9.2.19 [p3786] 61 | re <- "_x([[:xdigit:]]{4})_" 62 | i <- regexpr(re, str, perl = TRUE) 63 | len <- nchar(str) 64 | while (i > 0) { 65 | repl <- intToUtf8(as.integer(paste0("0x", substr(str, i + 2, i + 5)))) 66 | str <- sub(re, repl, str) 67 | ## This bit of faffery stops an escaped '_' character being 68 | ## counted as an unescaped '_' character, and is tested. This 69 | ## would be heaps easier in languages with char-by-char string 70 | ## handling 71 | j <- regexpr(re, substr(str, i + 1, len)) 72 | i <- if (j > 0) i + j else j 73 | } 74 | 75 | str 76 | } 77 | 78 | ## 18.4.11 sz (Font Size) 79 | xlsx_ct_font_size <- function(sz) { 80 | as.numeric(xml2::xml_attr(sz, "val")) 81 | } 82 | 83 | ## 18.4.13 u (Underline) 84 | xlsx_ct_underline_property <- function(u, missing="none") { 85 | if (inherits(u, "xml_missing")) { 86 | missing 87 | } else { 88 | val <- xml2::xml_attr(u, "val") 89 | if (is.na(val)) "single" else val 90 | } 91 | } 92 | -------------------------------------------------------------------------------- /R/utils.R: -------------------------------------------------------------------------------- 1 | vlapply <- function(X, FUN, ...) { 2 | vapply(X, FUN, logical(1), ...) 3 | } 4 | viapply <- function(X, FUN, ...) { 5 | vapply(X, FUN, integer(1), ...) 6 | } 7 | vnapply <- function(X, FUN, ...) { 8 | vapply(X, FUN, numeric(1), ...) 9 | } 10 | vcapply <- function(X, FUN, ...) { 11 | vapply(X, FUN, character(1), ...) 12 | } 13 | 14 | attr_bool <- function(x, missing=NA) { 15 | if (is.null(x)) missing else as.logical(as.integer(x)) 16 | } 17 | 18 | attr_integer <- function(x, missing=NA_integer_) { 19 | if (is.null(x)) missing else as.integer(x) 20 | } 21 | 22 | attr_numeric <- function(x, missing=NA_real_) { 23 | if (is.null(x)) missing else as.numeric(x) 24 | } 25 | 26 | attr_character <- function(x, missing=NA_character_) { 27 | if (is.null(x)) missing else x 28 | } 29 | 30 | `%||%` <- function(a, b) { 31 | if (is.null(a)) b else a 32 | } 33 | 34 | process_container <- function(xml, xpath, ns, fun, ..., classes=NULL) { 35 | els <- xml2::xml_children(xml2::xml_find_first(xml, xpath, ns)) 36 | if (isTRUE(classes)) { 37 | if (length(els) == 0L) { 38 | classes <- vcapply(fun(NULL, ns, ...), storage.mode) 39 | } else { 40 | classes <- NULL 41 | } 42 | } 43 | rbind_df(lapply(els, fun, ns, ...), classes) 44 | } 45 | 46 | ## The function below is a faster version of 47 | ## 48 | ## tibble::as_tibble(do.call("rbind", x, quote=TRUE)) 49 | ## 50 | ## But it avoids constructing a very hard to validate, slow to run 51 | ## function (on the order of a second), but it's not terrible nice to 52 | ## look at or understand. 53 | rbind_df <- function(x, classes=NULL) { 54 | if (length(x) == 0L) { 55 | return(empty_tibble(classes)) 56 | } 57 | nms <- names(x[[1L]]) 58 | xx <- unlist(x, FALSE) 59 | dim(xx) <- c(length(nms), length(x)) 60 | if (is.null(classes)) { 61 | preserve <- logical(length(nms)) 62 | } else { 63 | preserve <- classes == "list" 64 | } 65 | ul <- function(i, x) { 66 | if (preserve[[i]]) x else unlist(x) 67 | } 68 | tmp <- stats::setNames(lapply(seq_along(nms), function(i) ul(i, xx[i, ])), nms) 69 | tibble::as_tibble(tmp) 70 | } 71 | 72 | empty_tibble <- function(classes = NULL) { 73 | if (is.null(classes)) { 74 | ## NOTE: Once things settle down, this can be dropped. 75 | stop("deal with me") 76 | tibble::tibble() 77 | } else { 78 | tibble::as_tibble(lapply(classes, vector)) 79 | } 80 | } 81 | 82 | progress <- function(fmt, total, ..., show=TRUE) { 83 | if (show && total > 0L) { 84 | pb <- progress::progress_bar$new(fmt, total=total) 85 | function(len=1) { 86 | invisible(pb$tick(len)) 87 | } 88 | } else { 89 | function(...) {} 90 | } 91 | } 92 | 93 | path_join <- function(a, b) { 94 | na <- length(a) 95 | nb <- length(b) 96 | if (na == 1L && nb != 1L) { 97 | a <- rep_len(a, nb) 98 | } else if (nb == 1L && na != 1L) { 99 | b <- rep_len(b, na) 100 | } else if (na != nb && na != 1L && nb != 1L) { 101 | stop("Can't recycle vectors together") 102 | } 103 | 104 | i <- regexpr("(\\.\\./)+", b) 105 | len <- attr(i, "match.length", exact=TRUE) 106 | j <- len > 0L 107 | if (any(j)) { 108 | b[j] <- substr(b[j], len[j] + 1L, nchar(b[j])) 109 | len[j] <- len[j] / 3 110 | 111 | tmp <- strsplit(a[j], "/", fixed=TRUE) 112 | for (k in seq_along(tmp)) { 113 | ii <- length(tmp[[k]]) - len[j][[k]] 114 | if (ii < 0L) { 115 | ii <- 0L 116 | ## TODO: this turns up once in the Enron corpus 117 | ## warning("Cannot resolve internal reference; above workbook?") 118 | } 119 | tmp[[k]] <- paste(tmp[[k]][seq_len(ii)], collapse="/") 120 | } 121 | a[j] <- unlist(tmp) 122 | } 123 | paste(a, b, sep="/") 124 | } 125 | 126 | ## TODO: replace as.list(xml2::xml_attrs(...)) with this where NULL 127 | ## values are OK. 128 | xml_attrs_list <- function(x) { 129 | if (is.null(x)) { 130 | structure(list(), names=character()) 131 | } else { 132 | as.list(xml2::xml_attrs(x)) 133 | } 134 | } 135 | 136 | as_na <- function(x) { 137 | ret <- NA 138 | storage.mode(ret) <- storage.mode(x) 139 | ret 140 | } 141 | 142 | is_xlsx <- function(path) { 143 | if (!file.exists(path)) { 144 | stop("\n", path, "\ndoes not exist") 145 | } 146 | ## TO DO: verify it's a zip archive? only way I know is unix `file` command 147 | ## http://officeopenxml.com/anatomyofOOXML-xlsx.php 148 | ## https://msdn.microsoft.com/en-us/library/office/gg278316.aspx#MinWBScenario 149 | files <- xlsx_list_files(path) 150 | has_content_types <- "[Content_Types].xml" %in% files$name 151 | has_rels <- "_rels/.rels" %in% files$name 152 | has_workbook_xml <- "xl/workbook.xml" %in% files$name 153 | has_sheet <- any(grepl("xl/worksheets/sheet[0-9]*.xml", files$name)) 154 | has_content_types && has_rels && has_workbook_xml && has_sheet 155 | } 156 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # rexcel 2 | [![Build Status](https://travis-ci.org/rsheets/rexcel.svg?branch=master)](https://travis-ci.org/rsheets/rexcel) 3 | 4 | **Warning: This project is an experiment; do not use for anything other than amusement/frustration purposes.** 5 | 6 | ## Design 7 | 8 | This package implements a **very slow**, but thorough, Excel (xlsx) reader. If you have a rectangular region of cells to read you will be better off with another Excel reading package such as: [`readxl`](http://cran.r-project.org/package=readxl), 9 | [`openxlsx`](http://cran.r-project.org/package=openxlsx), 10 | [`XLConnect`](http://cran.r-project.org/package=XLConnect), 11 | [`xlsx`](http://cran.r-project.org/package=xlsx), 12 | [`gdata`](http://cran.r-project.org/package=gdata), [`RODBC`](http://cran.r-project.org/package=RODBC), or possibly even 13 | [`excel.link`](http://cran.r-project.org/package=excel.link), [`WriteXLS`](http://cran.r-project.org/package=WriteXLS), [`table1xlsx`](http://cran.r-project.org/package=table1xlsx), [`tablaxlsx`](http://cran.r-project.org/package=tablaxlsx) (not clear how current these last 4 are). Mango Solutions has a nice review article, [R: the Excel Connection](http://www.mango-solutions.com/wp/2015/05/r-the-excel-connection/), in which they compare several of the above packages, with a special emphasis on those that can both read and write Excel files (XLConnect, xlsx, openxlsx, excel.link). 14 | 15 | Compared with the above packages, `rexcel` tries to read all the data from an Excel sheet using [`linen`](https://github.com/rsheets/linen) as an intermediate representation in R. The eventual goal is to provide a common receptacle for detailed spreadsheet information from both Excel and Google Sheets. Rather than trying to create a single data.frame in one shot, it allows access to data, formulae and formatting information. Excel type information is preserved, especially for heterogeneous columns. It has no non-R dependencies (e.g. on Perl or Java) and should run on any platform regardless of whether Excel is installed. 16 | 17 | ## Installation 18 | 19 | Requires the development version of `xml2`, as the newer version changes the behaviour of matching functions in fairly large ways. For terminal printing we use the most recent copy of `crayon` which includes more accurate rendering. And we use `linen` as the R spreadsheet representation. 20 | 21 | ```r 22 | devtools::install_github("hadley/xml2") 23 | devtools::install_github("gaborcsardi/crayon") 24 | devtools::install_github("rsheets/linen") 25 | devtools::install_github("rsheets/rexcel") 26 | ``` 27 | 28 | ## Formatting preserved 29 | 30 | * [x] Cell fill colour 31 | * [x] Cell patterns 32 | * [ ] Cell _gradients_ 33 | * [x] Text colour 34 | * [x] Text bold, italic, underline, strikethrough, outline, shadow, condense, extend 35 | * [x] Text font 36 | * [x] Text size 37 | * [x] Text alignment (horizontal, vertical) 38 | * [x] Column/row visibility 39 | * [x] Column/row width/height 40 | * [ ] Styles applied at the column level (though the spec seems vague about if that's a real thing - compare p. 1600 and 1596) 41 | * [ ] Conditional formatting (the rule and the outcome) 42 | * [x] Borders (position and colour) 43 | * [ ] Numeric/date formatting, possibly also formatted text? 44 | * [ ] Table styles (e.g. for pivot tables) 45 | 46 | Of particular concern: 47 | 48 | > A cell can have both direct formatting (e.g., bold) and a cell style (e.g., Explanatory) applied to it. Therefore, both the cell style xf records and cell xf records shall be read to understand the full set of formatting applied to a cell. 49 | 50 | (18.8.10, p.1754) 51 | 52 | Also 53 | 54 | > When the color palette is modified, the indexedColors collection is written. When a custom color has been selected, the mruColors collection is written. 55 | 56 | So we should find a sheet that includes this and see what this looks like empirically. 57 | 58 | ## Other worthwhile things to get 59 | 60 | * [x] named ranges 61 | * [x] comments (author, ref, rich text, visibility) 62 | * [ ] graphs 63 | * [ ] other drawings 64 | * [ ] pivot tables 65 | * [ ] frozen rows / split panes 66 | * [ ] calculation chain 67 | * [ ] header/footer 68 | 69 | ## The Excel XML Spec 70 | 71 | Page and section numbers refer to the "ECMA Office Open XML Part 1 - Fundamentals and Markup Language Reference" document; a massive and fairly unweidly 5026 PDF! The spreadsheet material is mostly in section 18 (pages 1518 - 2508). 72 | 73 | ## Writing Excel files 74 | 75 | Writing workbooks is not currently supported. Before implementing it, we want to see how much information we can preserve while reading. Once we know that, we can start working towards seeing how much can be written and how lossy a read/write cycle will be (it is likely to be *very* lossy as there is an enormous number of things that might be stored in a worksheet. 76 | -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | # DO NOT CHANGE the "init" and "install" sections below 2 | 3 | # Download script file from GitHub 4 | init: 5 | ps: | 6 | $ErrorActionPreference = "Stop" 7 | Invoke-WebRequest http://raw.github.com/krlmlr/r-appveyor/master/scripts/appveyor-tool.ps1 -OutFile "..\appveyor-tool.ps1" 8 | Import-Module '..\appveyor-tool.ps1' 9 | 10 | install: 11 | ps: Bootstrap 12 | 13 | # Adapt as necessary starting from here 14 | 15 | build_script: 16 | - travis-tool.sh install_deps 17 | 18 | test_script: 19 | - travis-tool.sh run_tests 20 | 21 | on_failure: 22 | - travis-tool.sh dump_logs 23 | 24 | artifacts: 25 | - path: '*.Rcheck\**\*.log' 26 | name: Logs 27 | 28 | - path: '*.Rcheck\**\*.out' 29 | name: Logs 30 | 31 | - path: '*.Rcheck\**\*.fail' 32 | name: Logs 33 | 34 | - path: '*.Rcheck\**\*.Rout' 35 | name: Logs 36 | 37 | - path: '\*_*.tar.gz' 38 | name: Bits 39 | 40 | - path: '\*_*.zip' 41 | name: Bits 42 | -------------------------------------------------------------------------------- /inst/sheets/Ekaterinburg_IP_9-RESAVED.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/Ekaterinburg_IP_9-RESAVED.xlsx -------------------------------------------------------------------------------- /inst/sheets/Ekaterinburg_IP_9.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/Ekaterinburg_IP_9.xlsx -------------------------------------------------------------------------------- /inst/sheets/README.md: -------------------------------------------------------------------------------- 1 | Notes on where these sheets came from. 2 | 3 | exp.xlsx: GitHub . OP [wrathematics](https://github.com/wrathematics) says data "was generated by and exported from JMP". Commenter [nacnudus](https://github.com/nacnudus) notes that "The cells don't have the `r` attribute that gives the cell address." 4 | 5 | Ekaterinburg_IP_9.xlsx: GitHub . OP said: 6 | 7 | > readxl version 0.1.0 8 | > There is a problem with reading some xlsx files (not the ones generated by Google Docs - this is some other xlsx generating BI) please try the following: 9 | 10 | download.file("https://raw.githubusercontent.com/e-kotov/tmp/master/Ekaterinburg_IP_9.xlsx", 11 | method = "curl", dest = "Ekaterinburg_IP_9.xlsx") 12 | x <- read_excel("Ekaterinburg_IP_9.xlsx", skip = 5) 13 | 14 | > It returns " Error: Couldn't find 'xl/worksheets/sheet1.xml'". As soon as the file in question is opened in Excel and re-saved, it is loaded with no problem. 15 | 16 | Jenny used the above to download the original file (included here w/ OP's permission), then opened in Excel and did Save As .... to create 17 | `inst/sheets/Ekaterinburg_IP_9-RESAVED.xlsx` in order to compare the underlying XML. 18 | -------------------------------------------------------------------------------- /inst/sheets/defined-names.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/defined-names.xlsx -------------------------------------------------------------------------------- /inst/sheets/exp.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/exp.xlsx -------------------------------------------------------------------------------- /inst/sheets/gabe.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/gabe.xlsx -------------------------------------------------------------------------------- /inst/sheets/gs-test-formula-formatting.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/gs-test-formula-formatting.xlsx -------------------------------------------------------------------------------- /inst/sheets/mini-gap.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/mini-gap.xlsx -------------------------------------------------------------------------------- /inst/sheets/only_numbers.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/inst/sheets/only_numbers.xlsx -------------------------------------------------------------------------------- /internal/ekaterinburg.R: -------------------------------------------------------------------------------- 1 | ## origin story for Ekaterinburg_IP_9.xlsx, found in inst/sheets 2 | 3 | ## https://github.com/hadley/readxl/issues/80 4 | ## OP says: 5 | ## readxl version 0.1.0 6 | ## There is a problem with reading some xlsx files (not the ones generated by 7 | ## Google Docs - this is some other xlsx generating BI) please try the 8 | ## following: 9 | ## download.file("https://raw.githubusercontent.com/e-kotov/tmp/master/Ekaterinburg_IP_9.xlsx", 10 | ## method = "curl", dest = "Ekaterinburg_IP_9.xlsx") 11 | ## x <- read_excel("Ekaterinburg_IP_9.xlsx", skip = 5) 12 | ## It returns " Error: Couldn't find 'xl/worksheets/sheet1.xml'". 13 | ## 14 | ## As soon as the file in question is opened in Excel and re-saved, it is loaded 15 | ## with no problem. 16 | 17 | download.file("https://raw.githubusercontent.com/e-kotov/tmp/master/Ekaterinburg_IP_9.xlsx", 18 | method = "curl", dest = "Ekaterinburg_IP_9.xlsx") 19 | ## then moved to inst/sheets 20 | 21 | ## opened xlsx in Excel and did Save As .... to create 22 | ## inst/sheets/Ekaterinburg_IP_9-RESAVED.xlsx 23 | ## in order to compare the underlying XML 24 | -------------------------------------------------------------------------------- /internal/img/Kinsey-Male.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/Kinsey-Male.jpg -------------------------------------------------------------------------------- /internal/img/Kinsey-Spreadsheet.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/Kinsey-Spreadsheet.jpg -------------------------------------------------------------------------------- /internal/img/doctorow_2016-May-11.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/doctorow_2016-May-11.jpg -------------------------------------------------------------------------------- /internal/img/enron-example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/enron-example.png -------------------------------------------------------------------------------- /internal/img/excel-reactivity.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/excel-reactivity.gif -------------------------------------------------------------------------------- /internal/img/excel-reactivity.mov: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/rsheets/rexcel/e8dd5d30388b252c91d1cf4baa014a119280e57f/internal/img/excel-reactivity.mov -------------------------------------------------------------------------------- /internal/mini-gap/[Content_Types].xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/_rels/.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/_rels/rels.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/_rels/workbook.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/drawings/worksheetdrawing1.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/drawings/worksheetdrawing2.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/drawings/worksheetdrawing3.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/drawings/worksheetdrawing4.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/drawings/worksheetdrawing5.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/sharedStrings.xml: -------------------------------------------------------------------------------- 1 | countrycontinentyearlifeExppopgdpPercapAlgeriaAfricaAngolaAlbaniaEuropeBeninAustriaArgentinaAmericasBelgiumAustraliaOceaniaBoliviaBosnia and HerzegovinaNew ZealandBulgariaBrazilCanadaAfghanistanAsiaBahrainChileBangladeshBotswanaCambodiaChinaBurkina Faso -------------------------------------------------------------------------------- /internal/mini-gap/xl/styles.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/workbook.xml: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/_rels/sheet1.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/_rels/sheet2.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/_rels/sheet3.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/_rels/sheet4.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/_rels/sheet5.xml.rels: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/sheet1.xml: -------------------------------------------------------------------------------- 1 | 0123459101952.055.231282697.01601.056112101952.066.86927772.06137.076515101952.068.08730405.08343.105119101952.053.822791000.0973.533221101952.059.67274900.02444.2866 -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/sheet2.xml: -------------------------------------------------------------------------------- 1 | 01234516171952.069.128691212.010039.620171952.069.391994794.010556.5816171957.070.339712569.010949.6520171957.070.262229407.012247.416171962.070.931.0794968E712217.23 -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/sheet3.xml: -------------------------------------------------------------------------------- 1 | 01234513141952.062.4851.7876956E75911.31518141952.040.4142883315.02677.32622141952.050.9175.660256E72108.94423141952.068.751.4785584E711367.16127141952.054.7456377619.03939.979 -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/sheet4.xml: -------------------------------------------------------------------------------- 1 | 012345671952.043.0779279525.02449.0082871952.030.0154232095.03520.61031171952.038.2231738315.01062.75222971952.047.622442308.0851.24113271952.031.9754469979.0543.2552 -------------------------------------------------------------------------------- /internal/mini-gap/xl/worksheets/sheet5.xml: -------------------------------------------------------------------------------- 1 | 01234524251952.028.8018425333.0779.445326251952.050.939120447.09867.084828251952.037.4844.6886859E7684.244230251952.039.4174693836.0368.469331251952.044.05.56263527E8400.4486 -------------------------------------------------------------------------------- /man/rexcel.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/rexcel-package.r 3 | \docType{package} 4 | \name{rexcel} 5 | \alias{rexcel} 6 | \alias{rexcel-package} 7 | \title{rexcel.} 8 | \description{ 9 | rexcel. 10 | } 11 | 12 | -------------------------------------------------------------------------------- /man/rexcel_read.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{rexcel_read} 4 | \alias{rexcel_read} 5 | \title{Read an xlsx file that probably contains nontabular data} 6 | \usage{ 7 | rexcel_read(path, sheet = 1L) 8 | } 9 | \arguments{ 10 | \item{path}{Path to the xlsx file to load. xls files are not supported.} 11 | 12 | \item{sheet}{Sheet number (not name at this point). Googlesheets 13 | exported sheets are likely not to do the right thing.} 14 | } 15 | \value{ 16 | An \code{xlsx} object that can be printed. Future methods 17 | might do something sensible. The structure is subject to 18 | complete change so is not documented here. 19 | } 20 | \description{ 21 | This function does not get the data into a usable form but at 22 | least loads it up into R so we can poke about with it. The 23 | resulting loaded data can distinguish between formulae and data, 24 | numbers and text. Merged cells are detected. Style information 25 | is included, though nothing is done with it yet. A summary of the 26 | data is printed if you print the resulting object. 27 | } 28 | 29 | -------------------------------------------------------------------------------- /man/rexcel_read_workbook.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read.R 3 | \name{rexcel_read_workbook} 4 | \alias{rexcel_read_workbook} 5 | \title{Read an Excel workbook} 6 | \usage{ 7 | rexcel_read_workbook(path, sheets = NULL, progress = TRUE) 8 | } 9 | \arguments{ 10 | \item{path}{Path to the xlsx file to load. xls files are not supported.} 11 | 12 | \item{sheets}{Character or integer vector of sheets to read, or 13 | \code{NULL} to read all sheets (the default)} 14 | 15 | \item{progress}{Display a progress bar?} 16 | } 17 | \description{ 18 | Read an entire workbook 19 | } 20 | 21 | -------------------------------------------------------------------------------- /man/rexcel_readxl.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readxl.R 3 | \name{rexcel_readxl} 4 | \alias{rexcel_readxl} 5 | \title{Read an Excel spreadsheet like readxl} 6 | \usage{ 7 | rexcel_readxl(path, sheet = 1L, col_names = TRUE, col_types = NULL, 8 | na = "", skip = 0) 9 | } 10 | \arguments{ 11 | \item{path}{Path to the xlsx file} 12 | 13 | \item{sheet}{Sheet name or an integer} 14 | 15 | \item{col_names}{TRUE (the default) indicating we should use the 16 | first row as column names, FALSE, indicating we should generate 17 | names (X1, X2, ..., Xn) or a character vector of names to apply.} 18 | 19 | \item{col_types}{Either NULL (the default) indicating we should 20 | guess the column types or a vector of column types (values must 21 | be "blank", "numeric", "date" or "text").} 22 | 23 | \item{na}{Values indicating missing values (if different from 24 | blank). Not yet used.} 25 | 26 | \item{skip}{Number of rows to skip.} 27 | } 28 | \description{ 29 | Read an Excel spreadsheet the same way as readxl, but slower. 30 | Assumes a well behaved table of data. 31 | } 32 | 33 | -------------------------------------------------------------------------------- /rexcel.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace,vignette 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rexcel) 3 | 4 | test_check("rexcel") 5 | -------------------------------------------------------------------------------- /tests/testthat/helper-rexcel.R: -------------------------------------------------------------------------------- 1 | here <- rprojroot::find_package_root_file 2 | 3 | TEST_DIR <- here("tests", "testthat") 4 | TEST_REF_DIR <- file.path(TEST_DIR, "reference") 5 | if (!dir.exists(TEST_REF_DIR)) dir.create(TEST_REF_DIR) 6 | 7 | ## Get readxl source in order to get tests and therefore test sheets. 8 | get_readxl <- function(path = TEST_REF_DIR) { 9 | readxl_path <- file.path(path, "readxl") 10 | if (dir.exists(readxl_path)) { 11 | return(readxl_path) 12 | } 13 | if (!has_internet()) { 14 | return(tempfile()) 15 | } 16 | url <- "https://cran.rstudio.com/src/contrib/readxl_0.1.1.tar.gz" 17 | dest <- tempfile() 18 | tryCatch(download.file(url, dest), 19 | error = function(e) skip(e$message)) 20 | on.exit(file.remove(dest)) 21 | untar(dest, exdir = path) 22 | readxl_path 23 | } 24 | 25 | ## This is not terrifically portable (windows does not support it, but 26 | ## it gives a more graceful behaviour on Linux / OSX when there is no 27 | ## internet) 28 | has_internet <- function() { 29 | !is.null(suppressWarnings(nsl("www.google.com"))) 30 | } 31 | -------------------------------------------------------------------------------- /tests/testthat/test-colours.R: -------------------------------------------------------------------------------- 1 | context("colours") 2 | 3 | test_that("rgb2hsl", { 4 | ## These are random colours, ground-truthed with 5 | ## http://www.rapidtables.com/convert/color/rgb-to-hsl.htm 6 | ## Because that website returns things as [degrees, percent, 7 | ## percent] to 3sf I have a little wrapper here. 8 | f <- function(x) { 9 | round(x * c(360, 100, 100), c(0, 1, 1)) 10 | } 11 | 12 | expect_identical(f(rgb2hsl(c(106, 90, 205))), rbind(h=248, s=53.5, l=57.8)) 13 | 14 | cols <- c("gray22", "gray92", "indianred4", "slateblue", 15 | "dodgerblue4", "sienna2", "steelblue", "lightskyblue", 16 | "lightcyan4", "burlywood4") 17 | ans <- f(rgb2hsl(grDevices::col2rgb(cols))) 18 | cmp <- rbind(h=c(0, 0, 0, 248, 210, 19, 207, 203, 180, 33), 19 | s=c(0, 0, 41.1, 53.5, 79.4, 83.5, 44, 92, 6.8, 24.1), 20 | l=c(22, 92.2, 38.6, 57.8, 30.4, 59.6, 49, 75.5, 51.2, 43.9)) 21 | expect_identical(ans, cmp) 22 | 23 | ## Alpha handling: 24 | m <- rbind(grDevices::col2rgb(cols), alpha=runif(length(cols), 0, 255)) 25 | ans2 <- rgb2hsl(m) 26 | expect_identical(ans2[1:3, ], rgb2hsl(grDevices::col2rgb(cols))) 27 | expect_equal(ans2[4, ], m[4, ] / 255) 28 | 29 | expect_equal(dim(rgb2hsl(matrix(numeric(), 3, 0))), c(3, 0)) 30 | expect_equal(dim(rgb2hsl(matrix(numeric(), 4, 0))), c(4, 0)) 31 | 32 | ## And the reverse 33 | m <- grDevices::col2rgb(cols) 34 | expect_equal(hsl2rgb(rgb2hsl(m)), m) 35 | 36 | ## Do them _all_ 37 | mm <- grDevices::col2rgb(colors()) 38 | expect_equal(hsl2rgb(rgb2hsl(mm)), mm) 39 | }) 40 | -------------------------------------------------------------------------------- /tests/testthat/test-gs.R: -------------------------------------------------------------------------------- 1 | context("googlesheets") 2 | 3 | test_that("can read google sheets in right order", { 4 | skip_if_not_installed("googlesheets") 5 | filename <- system.file("mini-gap", "mini-gap.xlsx", package = "googlesheets") 6 | 7 | # expect_equal(xlsx_internal_sheet_name(filename, 1L), 8 | # "xl/worksheets/sheet4.xml") 9 | 10 | ## WHAT'S WRONG? 11 | ## the interface of this function seems to have changed since this test was 12 | ## written 13 | 14 | ## TO DO 15 | ## replace with something that looks at the table returned in linen object 16 | ## that maps worksheet number to target files 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test-readxl.R: -------------------------------------------------------------------------------- 1 | ## Test reading against readxl 2 | context("readxl") 3 | 4 | files <- dir(file.path(get_readxl(), "tests", "testthat"), 5 | pattern = "\\.xlsx$", full.names = TRUE) 6 | 7 | ## Rich says: I don't see the inline string version opening correctly in 8 | ## numbers so this might be a little beyond our needs to open here. 9 | files <- files[grep("^(inlineStr)", basename(files), invert = TRUE)] 10 | files <- setNames(files, basename(files)) 11 | 12 | for (f in files) { 13 | test_that(basename(f), { 14 | ## Rich says: the as.data.frame here works around something deeply weird 15 | ## with all.equal and tbl_dfs 16 | readxl <- as.data.frame(readxl::read_excel(f), stringsAsFactors = FALSE) 17 | us <- as.data.frame(rexcel_readxl(f)) 18 | if (basename(f) == "new_line_errors.xlsx") { 19 | ## NOTE: I think that xml2 is replacing \r\n -> \n which causes 20 | ## the confusion here. I'm pretty happy about this though as 21 | ## \r\n is not very R-ish. 22 | readxl$column_name[[1]] <- 23 | gsub("\r\r", "\r", readxl$column_name[[1]], fixed = TRUE) 24 | } 25 | expect_equal(us, readxl, label = paste("our import of", basename(f))) 26 | }) 27 | } 28 | -------------------------------------------------------------------------------- /tests/testthat/test-rexcel-read-workbook.R: -------------------------------------------------------------------------------- 1 | ## Can we read some universe of sheets without error? 2 | ## Later we could store reference objects, but just seems frustrating now. 3 | context("read workbooks") 4 | 5 | sheets <- dir(system.file("sheets", package = "rexcel"), 6 | pattern = "\\.xlsx$", full.names = TRUE) 7 | sheets <- setNames(sheets, basename(sheets)) 8 | 9 | ## Jenny: I propose we skip Ekaterinburg until we have a decent, general 10 | ## solution to the non-standard namespacing problem 11 | ## Jenny: I'm skipping both because even resaved one is large. In due course, 12 | ## we should create a scaled down version for testing. 13 | ## Jenny: also skipping exp.xlsx for now 14 | sheets <- 15 | sheets[grep("^Ekaterinburg_IP_9|^exp", names(sheets), invert = TRUE)] 16 | 17 | for (sh in sheets) { 18 | test_that(basename(sh), { 19 | expect_silent(rexcel_read_workbook(sh, progress = FALSE)) 20 | }) 21 | } 22 | 23 | test_that("read one sheet - by name", { 24 | filename <- sheets[["mini-gap.xlsx"]] 25 | d <- rexcel_read_workbook(filename, progress = FALSE) 26 | for (s in d$names) { 27 | expect_equal(d$sheets[[s]]$values(), rexcel_read(filename, s)$values()) 28 | } 29 | }) 30 | 31 | test_that("read one sheet - by index", { 32 | filename <- sheets[["mini-gap.xlsx"]] 33 | d <- rexcel_read_workbook(filename, progress = FALSE) 34 | for (s in seq_along(d$names)) { 35 | expect_equal(d$sheets[[s]]$values(), rexcel_read(filename, s)$values()) 36 | } 37 | }) 38 | -------------------------------------------------------------------------------- /tests/testthat/test-view.R: -------------------------------------------------------------------------------- 1 | context("view") 2 | 3 | ## NOTE: These are really tests of *linen* 4 | 5 | test_that("basic view", { 6 | filename <- here("inst", "sheets", "mini-gap.xlsx") 7 | d <- rexcel::rexcel_read(filename) 8 | tbl <- d$table() 9 | expect_equal(nrow(tbl), 5) 10 | expect_equal(names(tbl)[[1]], "country") 11 | 12 | xr <- cellranger::cell_limits(c(4, 1), c(d$dim[[1]], d$dim[[2]])) 13 | v <- linen::worksheet_view(d, xr) 14 | expect_error(v$table(), "header information not convertable to col_names") 15 | 16 | v2 <- linen::worksheet_view(d, xr, header = letters[seq_len(d$dim[[1]])]) 17 | tbl_v2 <- v2$table() 18 | expect_equal(nrow(tbl_v2), 3) 19 | expect_equal(names(tbl_v2), letters[1:6]) 20 | 21 | tmp <- unname(tbl[3:5, ]) 22 | rownames(tmp) <- NULL 23 | expect_equal(unname(tbl_v2), tmp) 24 | }) 25 | -------------------------------------------------------------------------------- /tests/testthat/test-xlsx-utils.R: -------------------------------------------------------------------------------- 1 | context("xlsx utilities") 2 | 3 | iris_path <- file.path(get_readxl(), "tests", "testthat", "iris-excel.xlsx") 4 | 5 | test_that("xlsx is detected as such and vice versa", { 6 | 7 | expect_true(is_xlsx(iris_path)) 8 | 9 | expect_error(is_xlsx("nonexistent_path"), "does not exist") 10 | expect_error(is_xlsx(system.file(package = "linen")), "cannot be opened") 11 | zf <- tempfile() 12 | utils::zip(zf, 13 | files = dir(system.file(package = "linen"), full.names = TRUE), 14 | extras = "--quiet") 15 | expect_false(is_xlsx(paste0(zf, ".zip"))) 16 | }) 17 | 18 | test_that("xlsx files are listed", { 19 | iris_files <- xlsx_list_files(iris_path)$name 20 | ref <- c("_rels/.rels", "[Content_Types].xml", "docProps/app.xml", 21 | "docProps/core.xml", "xl/_rels/workbook.xml.rels", 22 | "xl/sharedStrings.xml", "xl/styles.xml", "xl/theme/theme1.xml", 23 | "xl/workbook.xml", "xl/worksheets/sheet1.xml") 24 | expect_identical(intersect(iris_files, ref), iris_files) 25 | }) 26 | 27 | --------------------------------------------------------------------------------