├── _pkgdown.yml ├── cleanup ├── tests ├── testthat.R └── testthat │ ├── test_write.R │ └── test_read.R ├── inst ├── extdata │ ├── cars.zsav │ ├── hotel.sav │ ├── iris.sav │ ├── v13.sav │ ├── v14.sav │ ├── repairs.sav │ ├── datetimes.sav │ ├── electric.por │ ├── electric.sav │ ├── physiology.sav │ ├── testdata.sav │ ├── hotel-encrypted.sav │ └── datetimes.sps └── WORDLIST ├── tools ├── version.c └── winlibs.R ├── src ├── Makevars.in ├── tests │ └── soname.h ├── Makevars.win ├── readsav.h ├── fast_factor.cpp ├── write_sav_compress.h ├── write_data.h ├── read_sav_uncompress.h ├── boost_split.cpp ├── read_sav_unknown_n.h ├── read_sav_known_n.h ├── swap_endian.h ├── read_sav_encrypted.cpp ├── read_sav_uncompress.cpp ├── write_sav_compress.cpp ├── RcppExports.cpp ├── read_sav_encrypted.h ├── writepor.cpp ├── read_sav_unknown_n.cpp ├── write_data.cpp ├── writesav.cpp ├── spss.h └── read_sav_known_n.cpp ├── .lintr ├── codecov.yml ├── .gitignore ├── .Rbuildignore ├── NAMESPACE ├── readspss.Rproj ├── man ├── read.spss.Rd ├── write.sav.Rd ├── write.por.Rd ├── read.por.Rd └── read.sav.Rd ├── .github └── workflows │ ├── lint.yaml │ ├── pkgdown.yaml │ ├── R-CMD-check.yaml │ └── test-coverage.yaml ├── R ├── readspss.R ├── tools.R ├── RcppExports.R ├── writepor.R ├── writesav.R ├── readpor.R └── readsav.R ├── DESCRIPTION ├── README.md ├── README.Rmd ├── configure └── vignettes └── readspss.rmd /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: ~ 2 | 3 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | rm -f src/Makevars configure.log autobrew 3 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(readspss) 3 | 4 | test_check("readspss") 5 | -------------------------------------------------------------------------------- /inst/extdata/cars.zsav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/cars.zsav -------------------------------------------------------------------------------- /inst/extdata/hotel.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/hotel.sav -------------------------------------------------------------------------------- /inst/extdata/iris.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/iris.sav -------------------------------------------------------------------------------- /inst/extdata/v13.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/v13.sav -------------------------------------------------------------------------------- /inst/extdata/v14.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/v14.sav -------------------------------------------------------------------------------- /inst/extdata/repairs.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/repairs.sav -------------------------------------------------------------------------------- /inst/extdata/datetimes.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/datetimes.sav -------------------------------------------------------------------------------- /inst/extdata/electric.por: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/electric.por -------------------------------------------------------------------------------- /inst/extdata/electric.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/electric.sav -------------------------------------------------------------------------------- /inst/extdata/physiology.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/physiology.sav -------------------------------------------------------------------------------- /inst/extdata/testdata.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/testdata.sav -------------------------------------------------------------------------------- /inst/extdata/hotel-encrypted.sav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/JanMarvin/readspss/HEAD/inst/extdata/hotel-encrypted.sav -------------------------------------------------------------------------------- /tools/version.c: -------------------------------------------------------------------------------- 1 | #include 2 | #if OPENSSL_VERSION_NUMBER < 0x10000000L 3 | #error OpenSSL version too old 4 | #endif 5 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_CPPFLAGS = @cflags@ 2 | PKG_LIBS = -lz @libs@ 3 | 4 | all: clean 5 | 6 | $(SHLIB): $(OBJECTS) 7 | 8 | clean: 9 | rm -f $(OBJECTS) $(SHLIB) 10 | -------------------------------------------------------------------------------- /.lintr: -------------------------------------------------------------------------------- 1 | linters: linters_with_defaults( 2 | line_length_linter = NULL, 3 | cyclocomp_linter = NULL, 4 | commented_code_linter = NULL, 5 | object_name_linter = NULL, 6 | indentation_linter = NULL 7 | ) 8 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | codecov: 2 | token: a5a64df7-c822-417c-a81e-2bf443836687 3 | 4 | comment: no # do not comment PR with the result 5 | 6 | coverage: 7 | status: 8 | project: false 9 | patch: false 10 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | build/* 5 | readspss.Rcheck/* 6 | src/*.o 7 | src/*.so 8 | src/*.dll 9 | src/Makevars 10 | windows/* 11 | docs 12 | .deps/ 13 | autobrew 14 | cleanup 15 | configure.log 16 | -------------------------------------------------------------------------------- /src/tests/soname.h: -------------------------------------------------------------------------------- 1 | #include "openssl/opensslv.h" 2 | 3 | #define XSTR(x) STR(x) 4 | #define STR(x) #x 5 | #ifdef SHLIB_VERSION_NUMBER 6 | echo XSTR(SHLIB_VERSION_NUMBER) 7 | #else 8 | echo XSTR(OPENSSL_SHLIB_VERSION) 9 | #endif 10 | -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^\.travis\.yml$ 4 | ^appveyor\.yml$ 5 | ^src/Makevars$ 6 | ^build/$ 7 | ^\.github$ 8 | ^_pkgdown\.yml$ 9 | ^docs$ 10 | ^pkgdown$ 11 | ^codecov\.yml$ 12 | .lintr 13 | ^configure.log$ 14 | ^README.Rmd$ 15 | ^windows$ 16 | ^src/Makevars$ 17 | .*\.so 18 | .*\.dll 19 | .*\.o 20 | .*\.a 21 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(read.por) 4 | export(read.sav) 5 | export(read.spss) 6 | export(write.por) 7 | export(write.sav) 8 | import(Rcpp) 9 | importFrom(stats,na.omit) 10 | importFrom(tools,file_ext) 11 | importFrom(utils,download.file) 12 | importFrom(utils,localeToCharset) 13 | useDynLib(readspss, .registration=TRUE) 14 | -------------------------------------------------------------------------------- /inst/WORDLIST: -------------------------------------------------------------------------------- 1 | CMD 2 | Codecov 3 | PSPP 4 | Rcpp 5 | TDA 6 | autoenc 7 | cflag 8 | charcode 9 | codebase 10 | datalabel 11 | dataview 12 | datestamp 13 | de 14 | decrypt 15 | disppar 16 | doenc 17 | dta 18 | extraproduct 19 | filelabel 20 | fromEncoding 21 | github 22 | haslabel 23 | hexcode 24 | lmissings 25 | longlabel 26 | longmissing 27 | longstring 28 | longstrings 29 | memisc 30 | ownEnc 31 | por 32 | pspp 33 | recoded 34 | sav 35 | spss 36 | swapit 37 | tda 38 | un 39 | unicode 40 | varmatrix 41 | vtype 42 | www 43 | zsav 44 | -------------------------------------------------------------------------------- /readspss.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | ProjectId: ee071c55-283a-4646-997f-e2262c923500 3 | 4 | RestoreWorkspace: Default 5 | SaveWorkspace: Default 6 | AlwaysSaveHistory: Default 7 | 8 | EnableCodeIndexing: Yes 9 | UseSpacesForTab: Yes 10 | NumSpacesForTab: 2 11 | Encoding: UTF-8 12 | 13 | RnwWeave: Sweave 14 | LaTeX: pdfLaTeX 15 | 16 | AutoAppendNewline: Yes 17 | StripTrailingWhitespace: Yes 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageCleanBeforeInstall: No 22 | PackageInstallArgs: --no-multiarch --with-keep.source 23 | PackageRoxygenize: rd,collate,namespace 24 | -------------------------------------------------------------------------------- /man/read.spss.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readspss.R 3 | \name{read.spss} 4 | \alias{read.spss} 5 | \title{read.spss} 6 | \usage{ 7 | read.spss(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{file to import} 11 | 12 | \item{...}{additional arguments passed to read.sav or read.por please see the 13 | documentation for these functions.} 14 | } 15 | \description{ 16 | Function to read a SPSS (z)sav or por file into a data.frame(). This is just 17 | a wrapper around read.sav and read.por for convenience. 18 | } 19 | \seealso{ 20 | \link{read.sav} and \link{read.por} 21 | } 22 | -------------------------------------------------------------------------------- /src/Makevars.win: -------------------------------------------------------------------------------- 1 | PKG_CONFIG ?= $(BINPREF)pkg-config 2 | OPENSSL_LIBS := $(shell $(PKG_CONFIG) --libs openssl) 3 | 4 | ifneq ($(OPENSSL_LIBS),) 5 | $(info using OpenSSL from Rtools) 6 | OPENSSL_CFLAGS := $(shell $(PKG_CONFIG) --cflags openssl) 7 | else 8 | RWINLIB = ../windows/libssl 9 | TARGET = lib$(subst gcc,,$(COMPILED_BY))$(R_ARCH) 10 | OPENSSL_CFLAGS = -I$(RWINLIB)/include -DOPENSSL_SUPPRESS_DEPRECATED 11 | OPENSSL_LIBS = -L$(RWINLIB)/$(TARGET) -L$(RWINLIB)/lib -lssl -lcrypto -lws2_32 -lcrypt32 12 | endif 13 | 14 | PKG_CPPFLAGS = $(OPENSSL_CFLAGS) -DOPENSSL_SUPPRESS_DEPRECATED 15 | PKG_LIBS = $(OPENSSL_LIBS) -lz 16 | 17 | #all: clean 18 | all: $(SHLIB) 19 | 20 | $(OBJECTS): $(RWINLIB) 21 | 22 | $(SHLIB): $(OBJECTS) 23 | 24 | $(RWINLIB): 25 | "${R_HOME}/bin${R_ARCH_BIN}/Rscript.exe" "../tools/winlibs.R" 26 | 27 | clean: 28 | rm -f $(SHLIB) $(OBJECTS) 29 | 30 | .PHONY: all clean 31 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: lint 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | lint: 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | steps: 19 | - uses: actions/checkout@v4 20 | 21 | - uses: r-lib/actions/setup-r@v2 22 | with: 23 | use-public-rspm: true 24 | 25 | - uses: r-lib/actions/setup-r-dependencies@v2 26 | with: 27 | extra-packages: any::lintr, local::. 28 | needs: lint 29 | 30 | - name: Lint 31 | run: lintr::lint_package() 32 | shell: Rscript {0} 33 | env: 34 | LINTR_ERROR_ON_LINT: true 35 | -------------------------------------------------------------------------------- /src/readsav.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef READSAV_H 19 | #define READSAV_H 20 | 21 | Rcpp::List readsav(const char * filePath, const bool debug, std::string encStr, 22 | std::string const ownEnc); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /tools/winlibs.R: -------------------------------------------------------------------------------- 1 | if(!file.exists("../windows/libssl/include/openssl/pem.h")){ 2 | unlink("../windows", recursive = TRUE) 3 | url <- if(grepl("aarch", R.version$platform)){ 4 | "https://github.com/r-windows/bundles/releases/download/openssl-3.4.0/openssl-3.4.0-clang-aarch64.tar.xz" 5 | } else if(grepl("clang", Sys.getenv('R_COMPILED_BY'))){ 6 | "https://github.com/r-windows/bundles/releases/download/openssl-3.4.0/openssl-3.4.0-clang-x86_64.tar.xz" 7 | } else if(getRversion() >= "4.2") { 8 | "https://github.com/r-windows/bundles/releases/download/openssl-3.4.0/openssl-3.4.0-ucrt-x86_64.tar.xz" 9 | } else { 10 | "https://github.com/rwinlib/openssl/archive/v3.1.1.tar.gz" 11 | } 12 | download.file(url, basename(url), quiet = TRUE) 13 | dir.create("../windows", showWarnings = FALSE) 14 | untar(basename(url), exdir = "../windows", tar = 'internal') 15 | unlink(basename(url)) 16 | setwd("../windows") 17 | file.rename(list.files(), 'libssl') 18 | } 19 | -------------------------------------------------------------------------------- /src/fast_factor.cpp: -------------------------------------------------------------------------------- 1 | #include 2 | 3 | 4 | template 5 | Rcpp::IntegerVector fast_factor_template( const Rcpp::Vector& x, 6 | const Rcpp::Vector& y) { 7 | Rcpp::IntegerVector out = match(x, y); 8 | 9 | out.attr("levels") = y.attr("names"); 10 | out.attr("class") = "factor"; 11 | return out; 12 | } 13 | 14 | //' creates a factor inspired by an idea of Kevin Ushey in the Rcpp gallery 15 | //' http://gallery.rcpp.org/articles/fast-factor-generation/ 16 | //' 17 | //' @param x vector 18 | //' @param y charactervector with labels 19 | //' @import Rcpp 20 | //' @keywords internal 21 | //' @noRd 22 | // [[Rcpp::export]] 23 | SEXP fast_factor( SEXP x , SEXP y) { 24 | switch( TYPEOF(x) ) { 25 | case INTSXP: return fast_factor_template(x, y); 26 | case REALSXP: return fast_factor_template(x, y); 27 | case STRSXP: return fast_factor_template(x, y); 28 | } 29 | return R_NilValue; 30 | } 31 | -------------------------------------------------------------------------------- /src/write_sav_compress.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef WRITE_SAV_UNCOMPRESS_H 19 | #define WRITE_SAV_UNCOMPRESS_H 20 | 21 | void write_sav_compress (std::fstream& sav, const std::string tempstr, 22 | const bool swapit, bool debug); 23 | 24 | #endif 25 | -------------------------------------------------------------------------------- /src/write_data.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef WRITE_DATA_H 19 | #define WRITE_DATA_H 20 | 21 | void write_data(Rcpp::DataFrame dat, int32_t cflag, 22 | int64_t n, int32_t kk, info_t *info, 23 | std::fstream& sav, bool swapit); 24 | 25 | #endif 26 | -------------------------------------------------------------------------------- /src/read_sav_uncompress.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef READ_SAV_UNCOMPRESS_H 19 | #define READ_SAV_UNCOMPRESS_H 20 | 21 | std::string read_sav_uncompress (std::fstream& sav, 22 | const bool swapit, const uint8_t cflag, 23 | bool debug); 24 | 25 | 26 | #endif 27 | -------------------------------------------------------------------------------- /R/readspss.R: -------------------------------------------------------------------------------- 1 | #' read.spss 2 | #' 3 | #' Function to read a SPSS (z)sav or por file into a data.frame(). This is just 4 | #' a wrapper around read.sav and read.por for convenience. 5 | #' @param x file to import 6 | #' @param ... additional arguments passed to read.sav or read.por please see the 7 | #' documentation for these functions. 8 | #' @useDynLib readspss, .registration=TRUE 9 | #' @importFrom tools file_ext 10 | #' @importFrom stats na.omit 11 | #' @importFrom utils download.file localeToCharset 12 | #' @seealso [read.sav] and [read.por] 13 | #' @export 14 | read.spss <- function(x, ...) { 15 | 16 | if (!file.exists(x)) 17 | return(message("File not found.")) 18 | 19 | file <- file_ext(basename(x)) 20 | 21 | if (tolower(file) == "sav" || tolower(file) == "zsav") { 22 | res <- read.sav(x, ...) 23 | } else if (tolower(file) == "por") { 24 | res <- read.por(x, ...) 25 | } else { 26 | message(c("Sorry file extension is either not detected or not known sav.", 27 | "You could try read.sav or read.por with override option.")) 28 | } 29 | 30 | res 31 | 32 | } 33 | -------------------------------------------------------------------------------- /src/boost_split.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include "spss.h" 19 | 20 | //' split character vector at "=" 21 | //' 22 | //' @param val_s CharacterVector 23 | //' @import Rcpp 24 | //' @keywords internal 25 | //' @noRd 26 | // [[Rcpp::export]] 27 | Rcpp::CharacterVector boost_split(std::string val_s) { 28 | 29 | std::vector vec_r; 30 | 31 | vec_r = split(val_s, "=", true); 32 | 33 | return(Rcpp::wrap(vec_r)); 34 | } 35 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: readspss 2 | Type: Package 3 | Title: Importing and Exporting SPSS Files 4 | Version: 0.19 5 | Authors@R: c( 6 | person("Jan Marvin", "Garbuszus", 7 | email = "jan.garbuszus@ruhr-uni-bochum.de", role = c("aut", "cre")), 8 | person("Ben", "Pfaff", role = c("aut", "cph"), 9 | comment = "author of encryption part"), 10 | person("Jeroen", "Ooms", role ="ctr"), 11 | person("readstata13", role="cph"), 12 | person("R Core Team", role="cph"), 13 | person("Ulrich", "Poetter", role ="ctb"), 14 | person("Goetz", "Rohwer", role="cph") 15 | ) 16 | Maintainer: Jan Marvin Garbuszus 17 | Description: Package to read and write the SPSS file formats. 18 | URL: https://github.com/JanMarvin/readspss 19 | BugReports: https://github.com/JanMarvin/readspss/issues 20 | License: GPL-2 | file LICENSE 21 | LazyData: TRUE 22 | Language: en-US 23 | Imports: 24 | Rcpp (>= 0.11.2) 25 | Suggests: 26 | datasets, 27 | foreign, 28 | knitr, 29 | rmarkdown, 30 | roxygen2, 31 | testthat 32 | LinkingTo: Rcpp 33 | ByteCompile: yes 34 | SystemRequirements: OpenSSL >= 1.0.2 35 | VignetteBuilder: knitr 36 | Encoding: UTF-8 37 | Roxygen: list(markdown = TRUE) 38 | RoxygenNote: 7.3.3 39 | -------------------------------------------------------------------------------- /src/read_sav_unknown_n.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef READ_SAV_UNKNOWN_N_H 19 | #define READ_SAV_UNKNOWN_N_H 20 | 21 | 22 | int64_t read_sav_unknown_n (std::fstream& sav, 23 | const bool swapit, const int32_t cflag, 24 | const bool debug, 25 | const int32_t kv, 26 | Rcpp::IntegerVector vtyp, 27 | Rcpp::NumericVector res, 28 | std::vector vartype); 29 | 30 | #endif 31 | -------------------------------------------------------------------------------- /src/read_sav_known_n.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #ifndef READ_SAV_KNOWN_N_H 19 | #define READ_SAV_KNOWN_N_H 20 | 21 | Rcpp::List read_sav_known_n (Rcpp::List& df, std::fstream& sav, 22 | const bool swapit, const uint8_t cflag, 23 | const bool debug, 24 | const int64_t n, const int32_t kv, 25 | Rcpp::IntegerVector vtyp, 26 | Rcpp::NumericVector res, 27 | std::vector vartype, 28 | const double lowest, 29 | const double highest, 30 | const int bias); 31 | 32 | #endif 33 | -------------------------------------------------------------------------------- /inst/extdata/datetimes.sps: -------------------------------------------------------------------------------- 1 | 2 | data list list / 3 | d1 (DATE9) 4 | d2 (DATE11) 5 | a1 (ADATE8) 6 | a2 (ADATE10) 7 | e1 (EDATE8) 8 | e2 (EDATE10) 9 | j1 (JDATE5) 10 | j2 (JDATE7) 11 | s1 (SDATE8) 12 | s2 (SDATE10) 13 | q1 (QYR6) 14 | q2 (QYR8) 15 | m1 (MOYR6) 16 | m2 (MOYR8) 17 | w1 (WKYR8) 18 | w2 (WKYR10) 19 | dt1 (DATETIME17) 20 | dt2 (DATETIME20) 21 | dt3 (DATETIME23.2) 22 | y1 (YMDHMS16) 23 | y2 (YMDHMS19) 24 | y3 (YMDHMS19) /* 19.2 . 25 | w3 (WKDAY3) 26 | w4 (WKDAY9) 27 | m3 (MONTH3) 28 | m4 (MONTH9) 29 | mt1 (MTIME5) 30 | mt2 (MTIME8.2) 31 | t1 (TIME5) 32 | t2 (TIME8) 33 | t3 (TIME11.2) 34 | dt4 (DTIME9) 35 | dt5 (DTIME12) 36 | dt6 (DTIME15.2) 37 | . 38 | 39 | begin data. 40 | "31-JAN-13", "31-JAN-2013", "01/31/13", "01/31/2013", "31.01.13", "31.01.2013", "13031", "2013031", "13/01/31", "2013/01/31", "1 Q 13", "1 Q 2013", "JAN 13", "JAN 2013", "5 WK 13", "5 WK 2013", "31-JAN-2013 01:02", "31-JAN-2013 01:02:33", "31-JAN-2013 01:02:33.72", "2013-01-31 1:02", "2013-01-31 1:02:33", "2013-01-31 1:02:33.72", "THU", "THURSDAY", "JAN", "JANUARY", "1754:36", "1754:36.58", "29:14", "29:14:36", "29:14:36.58", "1 05:14", "1 05:14:36", "1 05:14:36.58" 41 | end data. 42 | 43 | save outfile = "/tmp/datetimes.sav" . -------------------------------------------------------------------------------- /src/swap_endian.h: -------------------------------------------------------------------------------- 1 | #ifndef SWAP_ENDIAN 2 | #define SWAP_ENDIAN 3 | 4 | /*#include */ 5 | #include 6 | 7 | #define GCC_VERSION (__GNUC__ * 10000 \ 8 | + __GNUC_MINOR__ * 100 \ 9 | + __GNUC_PATCHLEVEL__) 10 | 11 | /* Test for GCC < 4.8.0 */ 12 | #if GCC_VERSION < 40800 & !__clang__ 13 | static inline unsigned short __builtin_bswap16(unsigned short a) 14 | { 15 | return (a<<8)|(a>>8); 16 | } 17 | #endif 18 | 19 | template 20 | T swap_endian(T t) { 21 | if (typeid(T) == typeid(int16_t)) 22 | return __builtin_bswap16(t); 23 | if (typeid(T) == typeid(uint16_t)) 24 | return __builtin_bswap16(t); 25 | 26 | if (typeid(T) == typeid(int32_t)) 27 | return __builtin_bswap32(t); 28 | if (typeid(T) == typeid(uint32_t)) 29 | return __builtin_bswap32(t); 30 | 31 | if (typeid(T) == typeid(int64_t)) 32 | return __builtin_bswap64(t); 33 | if (typeid(T) == typeid(uint64_t)) 34 | return __builtin_bswap64(t); 35 | 36 | union v { 37 | double d; 38 | float f; 39 | uint32_t i32; 40 | uint64_t i64; 41 | } val; 42 | 43 | if (typeid(T) == typeid(float)){ 44 | val.f = t; 45 | val.i32 = __builtin_bswap32(val.i32); 46 | return val.f; 47 | } 48 | 49 | if (typeid(T) == typeid(double)){ 50 | val.d = t; 51 | val.i64 = __builtin_bswap64(val.i64); 52 | return val.d; 53 | } 54 | 55 | else 56 | return t; 57 | } 58 | 59 | #endif 60 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | permissions: read-all 15 | 16 | jobs: 17 | pkgdown: 18 | runs-on: ubuntu-latest 19 | # Only restrict concurrency for non-PR jobs 20 | concurrency: 21 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 22 | env: 23 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 24 | permissions: 25 | contents: write 26 | steps: 27 | - uses: actions/checkout@v4 28 | 29 | - uses: r-lib/actions/setup-pandoc@v2 30 | 31 | - uses: r-lib/actions/setup-r@v2 32 | with: 33 | use-public-rspm: true 34 | 35 | - uses: r-lib/actions/setup-r-dependencies@v2 36 | with: 37 | extra-packages: any::pkgdown, local::. 38 | needs: website 39 | 40 | - name: Build site 41 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 42 | shell: Rscript {0} 43 | 44 | - name: Deploy to GitHub pages 🚀 45 | if: github.event_name != 'pull_request' 46 | uses: JamesIves/github-pages-deploy-action@v4.5.0 47 | with: 48 | clean: false 49 | branch: gh-pages 50 | folder: docs 51 | -------------------------------------------------------------------------------- /R/tools.R: -------------------------------------------------------------------------------- 1 | # Wrapper Around iconv Calls for Code Readability 2 | # 3 | # @param x element to be converted 4 | # @param encoding encoding to be used. 5 | # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} 6 | # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} 7 | read.encoding <- function(x, fromEncoding, encoding) { 8 | 9 | # avoid iconv errors 10 | if (!is.na(fromEncoding) && is.na(encoding)) 11 | encoding <- fromEncoding 12 | 13 | iconv(x, 14 | from = fromEncoding, 15 | to = encoding, 16 | sub = "byte") 17 | } 18 | 19 | save.encoding <- function(x, encoding) { 20 | iconv(x, 21 | to = encoding, 22 | sub = "byte") 23 | } 24 | # Construct File Path 25 | # 26 | # @param path path to dta file 27 | # @author Jan Marvin Garbuszus \email{jan.garbuszus@@ruhr-uni-bochum.de} 28 | # @author Sebastian Jeworutzki \email{sebastian.jeworutzki@@ruhr-uni-bochum.de} 29 | get.filepath <- function(path = "") { 30 | if (substring(path, 1, 1) == "~") { 31 | filepath <- path.expand(path) 32 | } else { 33 | filepath <- path 34 | } 35 | if (!file.exists(filepath)) { 36 | return("File does not exist.") 37 | } 38 | 39 | return(filepath) 40 | } 41 | 42 | #' Check if numeric vector can be expressed as integer vector 43 | #' 44 | #' Compression can reduce numeric vectors as integers if the vector does only 45 | #' contain integer type data. Same goes for logical values. 46 | #' 47 | #' @param x vector of data frame 48 | #' @keywords internal 49 | #' @noRd 50 | saveToExport <- function(x) { 51 | isTRUE(all.equal(x, as.integer(x))) | 52 | isTRUE(is.factor(x)) | 53 | isTRUE(is.logical(x)) 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | R-CMD-check: 15 | runs-on: ${{ matrix.config.os }} 16 | 17 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 18 | 19 | strategy: 20 | fail-fast: false 21 | matrix: 22 | config: 23 | - {os: macos-latest, r: 'release'} 24 | - {os: windows-latest, r: 'release'} 25 | - {os: windows-latest, r: '4.1'} 26 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 27 | - {os: ubuntu-latest, r: 'release'} 28 | - {os: ubuntu-latest, r: 'oldrel-1'} 29 | 30 | env: 31 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 32 | R_KEEP_PKG_SOURCE: yes 33 | 34 | steps: 35 | - uses: actions/checkout@v4 36 | 37 | - uses: r-lib/actions/setup-pandoc@v2 38 | 39 | - uses: r-lib/actions/setup-r@v2 40 | with: 41 | r-version: ${{ matrix.config.r }} 42 | http-user-agent: ${{ matrix.config.http-user-agent }} 43 | use-public-rspm: true 44 | 45 | - uses: r-lib/actions/setup-r-dependencies@v2 46 | with: 47 | extra-packages: any::rcmdcheck 48 | needs: check 49 | 50 | - uses: r-lib/actions/check-r-package@v2 51 | with: 52 | upload-snapshots: true 53 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 54 | -------------------------------------------------------------------------------- /man/write.sav.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/writesav.R 3 | \name{write.sav} 4 | \alias{write.sav} 5 | \title{write.sav} 6 | \usage{ 7 | write.sav( 8 | dat, 9 | filepath, 10 | label, 11 | add.rownames = FALSE, 12 | compress = FALSE, 13 | convert.dates = TRUE, 14 | tz = "GMT", 15 | debug = FALSE, 16 | is_zsav = FALSE, 17 | disppar 18 | ) 19 | } 20 | \arguments{ 21 | \item{dat}{\emph{data.frame} a data.frame to store as SPSS file.} 22 | 23 | \item{filepath}{\emph{string} full path where and how this file should be 24 | stored} 25 | 26 | \item{label}{\emph{character} if any provided this must be a vector of 27 | labels. It must be of size \code{ncol(dat)}} 28 | 29 | \item{add.rownames}{\emph{logical} If \code{TRUE}, a new variable rownames 30 | will be added to the sav-file.} 31 | 32 | \item{compress}{\emph{logical} should compression be used. If TRUE some 33 | integers will be stored more efficiently. Everything will be stored in 34 | chunks of 8 chars. Reduces memory size of sav-file.} 35 | 36 | \item{convert.dates}{\emph{logical} should dates be converted to SPSS format.} 37 | 38 | \item{tz}{\emph{character} The name of the timezone convert.dates will use.} 39 | 40 | \item{debug}{\emph{logical} print debug information.} 41 | 42 | \item{is_zsav}{\emph{logical} explicitly create a zsav file. If the file 43 | ending zsav is used, this is selected as default.} 44 | 45 | \item{disppar}{optional display parameter matrix. Needs documentation.} 46 | } 47 | \value{ 48 | \code{write.sav} returns nothing 49 | } 50 | \description{ 51 | Function to write an SPSS sav or zsav file from a data.frame(). 52 | } 53 | \details{ 54 | Writing of strings longer than 255 chars is not provided. 55 | } 56 | -------------------------------------------------------------------------------- /man/write.por.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/writepor.R 3 | \name{write.por} 4 | \alias{write.por} 5 | \title{write.por} 6 | \usage{ 7 | write.por( 8 | dat, 9 | filepath, 10 | label, 11 | add.rownames = FALSE, 12 | convert.factors = TRUE, 13 | toEncoding = "CP1252", 14 | convert.dates = TRUE, 15 | tz = "GMT" 16 | ) 17 | } 18 | \arguments{ 19 | \item{dat}{\emph{data.frame} a data.frame to export as por-file.} 20 | 21 | \item{filepath}{\emph{string} full path where and how this file should be 22 | stored} 23 | 24 | \item{label}{\emph{character} vector of labels. must be of size \code{ncol(dat)}} 25 | 26 | \item{add.rownames}{\emph{logical} If \code{TRUE}, a new variable rownames 27 | will be added to the por-file.} 28 | 29 | \item{convert.factors}{\emph{logical} If \code{TRUE}, factors will be converted to 30 | SPSS variables with labels. 31 | SPSS expects strings to be encoded as Windows-1252, so all levels will be 32 | recoded. Character which can not be mapped in Windows-1252 will be saved as 33 | hexcode.} 34 | 35 | \item{toEncoding}{\emph{character} encoding used for the por file. SPSS itself 36 | claims to have problems with unicode and por files, so "CP1252" is the 37 | default.} 38 | 39 | \item{convert.dates}{\emph{logical} should dates be converted to SPSS format} 40 | 41 | \item{tz}{\emph{character} The name of the timezone convert.dates will use.} 42 | } 43 | \value{ 44 | \code{write.por} returns nothing 45 | } 46 | \description{ 47 | Function to write an SPSS por file. Returns an por file that read.por can 48 | read as well as SPSS can. Other packages as foreign, memisc and haven might 49 | fail (fail reading or return wrong values). 50 | } 51 | \details{ 52 | Strings longer than 255 chars are not provided. 53 | File will be stored using "CP1252" encoding. 54 | } 55 | -------------------------------------------------------------------------------- /.github/workflows/test-coverage.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: test-coverage 10 | 11 | permissions: read-all 12 | 13 | jobs: 14 | test-coverage: 15 | runs-on: ubuntu-latest 16 | env: 17 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 18 | 19 | steps: 20 | - uses: actions/checkout@v4 21 | 22 | - uses: r-lib/actions/setup-r@v2 23 | with: 24 | use-public-rspm: true 25 | 26 | - uses: r-lib/actions/setup-r-dependencies@v2 27 | with: 28 | extra-packages: any::covr, any::xml2 29 | needs: coverage 30 | 31 | - name: Test coverage 32 | run: | 33 | cov <- covr::package_coverage( 34 | quiet = FALSE, 35 | clean = FALSE, 36 | install_path = file.path(normalizePath(Sys.getenv("RUNNER_TEMP"), winslash = "/"), "package") 37 | ) 38 | covr::to_cobertura(cov) 39 | shell: Rscript {0} 40 | 41 | - uses: codecov/codecov-action@v4 42 | with: 43 | fail_ci_if_error: ${{ github.event_name != 'pull_request' && true || false }} 44 | file: ./cobertura.xml 45 | plugin: noop 46 | disable_search: true 47 | token: ${{ secrets.CODECOV_TOKEN }} 48 | 49 | - name: Show testthat output 50 | if: always() 51 | run: | 52 | ## -------------------------------------------------------------------- 53 | find '${{ runner.temp }}/package' -name 'testthat.Rout*' -exec cat '{}' \; || true 54 | shell: bash 55 | 56 | - name: Upload test results 57 | if: failure() 58 | uses: actions/upload-artifact@v4 59 | with: 60 | name: coverage-test-failures 61 | path: ${{ runner.temp }}/package 62 | -------------------------------------------------------------------------------- /man/read.por.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readpor.R 3 | \name{read.por} 4 | \alias{read.por} 5 | \title{read.por} 6 | \usage{ 7 | read.por( 8 | file, 9 | convert.factors = TRUE, 10 | generate.factors = TRUE, 11 | encoding = TRUE, 12 | fromEncoding = NULL, 13 | use.missings = TRUE, 14 | debug = FALSE, 15 | override = FALSE, 16 | convert.dates = TRUE, 17 | add.rownames = FALSE 18 | ) 19 | } 20 | \arguments{ 21 | \item{file}{\emph{string} a por-file to import. can be a file on a computer 22 | or an url. in this case the file will be downloaded and read before it is 23 | used.} 24 | 25 | \item{convert.factors}{\emph{logical} if true numeric or character variables 26 | will be converted into a factor in R.} 27 | 28 | \item{generate.factors}{\emph{logical} function to convert variables with 29 | partial labels into factors. e.g. 1 - low and 5 - high are provided, labels 30 | 2, 3 and 4 will be created. especially useful in combination with 31 | \code{use.missings=TRUE}.} 32 | 33 | \item{encoding}{\emph{logical} shall values be converted? If true, \code{read.por()} 34 | will try the charcode stored inside the por-file. If this value is 2 or not 35 | available, \code{fromEncoding} can be used to change encoding.} 36 | 37 | \item{fromEncoding}{\emph{character} encoding of the imported file. This 38 | information is stored inside the por-file, but is currently unused. Still 39 | this option can be used to define the initial encoding by hand.} 40 | 41 | \item{use.missings}{\emph{logical} should missing values be converted. 42 | Defaults to TRUE.} 43 | 44 | \item{debug}{\emph{logical} provides additional debug information. Most 45 | likely not useful to any user.} 46 | 47 | \item{override}{\emph{logical} The filename provided in \code{file} is 48 | checked for the ending por. If the file ending is different, nothing is read. 49 | This option can be used to override this behavior.} 50 | 51 | \item{convert.dates}{\emph{logical} Should dates be converted on the fly?} 52 | 53 | \item{add.rownames}{\emph{logical} If \code{TRUE}, the first column will be 54 | used as rownames. Variable will be dropped afterwards.} 55 | } 56 | \description{ 57 | Function to read a SPSS por file into a data.frame(). 58 | } 59 | \details{ 60 | SPSS files are widely available, though for R long time only foreign 61 | and memisc provided functions to import por-files. Lately haven joined. 62 | This package is an approach to offer another alternative, to document the 63 | por-format and provide additional options to import the data. 64 | } 65 | \note{ 66 | Information to decrypt the por-format was provided by tda 67 | \href{http://www.stat.ruhr-uni-bochum.de/tda.html}{www.stat.rub.de/tda.html} and 68 | pspp \href{http://www.gnu.org/software/pspp/}{www.gnu.org/software/pspp/} 69 | } 70 | \seealso{ 71 | \code{\link[foreign]{read.spss}}, \code{memisc}. 72 | } 73 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #' split character vector at "=" 5 | #' 6 | #' @param val_s CharacterVector 7 | #' @import Rcpp 8 | #' @keywords internal 9 | #' @noRd 10 | boost_split <- function(val_s) { 11 | .Call(`_readspss_boost_split`, val_s) 12 | } 13 | 14 | #' creates a factor inspired by an idea of Kevin Ushey in the Rcpp gallery 15 | #' http://gallery.rcpp.org/articles/fast-factor-generation/ 16 | #' 17 | #' @param x vector 18 | #' @param y charactervector with labels 19 | #' @import Rcpp 20 | #' @keywords internal 21 | #' @noRd 22 | fast_factor <- function(x, y) { 23 | .Call(`_readspss_fast_factor`, x, y) 24 | } 25 | 26 | #' Read encrypted SPSS file 27 | #' 28 | #' @param filePath The full systempath to the dta file you want to import. 29 | #' @param debug print debug information 30 | #' @param encStr encoding string 31 | #' @param ownEnc encoding provided by localeToCharset 32 | #' @param pass passkey required for encoding 33 | #' @import Rcpp 34 | #' @keywords internal 35 | #' @noRd 36 | readencrypted <- function(filePath, debug, encStr, ownEnc, pass) { 37 | .Call(`_readspss_readencrypted`, filePath, debug, encStr, ownEnc, pass) 38 | } 39 | 40 | #' Reads the binary SPSS file 41 | #' 42 | #' @param filePath The full systempath to the dta file you want to import. 43 | #' @param debug print debug information 44 | #' @param encStr encoding string 45 | #' @param override override bool 46 | #' @import Rcpp 47 | #' @keywords internal 48 | #' @noRd 49 | readpor <- function(filePath, debug, encStr, override) { 50 | .Call(`_readspss_readpor`, filePath, debug, encStr, override) 51 | } 52 | 53 | #' Reads the binary SPSS file 54 | #' 55 | #' @param filePath The full systempath to the dta file you want to import. 56 | #' @param debug print debug information 57 | #' @param encStr encoding string 58 | #' @param ownEnc encoding provided by localeToCharset 59 | #' @import Rcpp 60 | #' @keywords internal 61 | #' @noRd 62 | readsav <- function(filePath, debug, encStr, ownEnc) { 63 | .Call(`_readspss_readsav`, filePath, debug, encStr, ownEnc) 64 | } 65 | 66 | #' writes the binary SPSS file 67 | #' 68 | #' @param filePath The full systempath to the dta file you want to import. 69 | #' @param dat the data frame 70 | #' @import Rcpp 71 | #' @keywords internal 72 | #' @noRd 73 | writepor <- function(filePath, dat) { 74 | invisible(.Call(`_readspss_writepor`, filePath, dat)) 75 | } 76 | 77 | #' writes the binary SPSS file 78 | #' 79 | #' @param filePath The full systempath to the dta file you want to import. 80 | #' @param dat the data frame 81 | #' @param compress the file 82 | #' @param debug print debug information 83 | #' @param is_zsav write zsav 84 | #' @import Rcpp 85 | #' @keywords internal 86 | #' @noRd 87 | writesav <- function(filePath, dat, compress, debug, is_zsav) { 88 | invisible(.Call(`_readspss_writesav`, filePath, dat, compress, debug, is_zsav)) 89 | } 90 | 91 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # readspss 3 | 4 | ![R-CMD-check](https://github.com/JanMarvin/readspss/workflows/R-CMD-check/badge.svg) 5 | [![Codecov test 6 | coverage](https://codecov.io/gh/JanMarvin/readspss/branch/main/graph/badge.svg)](https://app.codecov.io/gh/JanMarvin/readspss?branch=main) 7 | [![r-universe](https://janmarvin.r-universe.dev/badges/readspss)](https://janmarvin.r-universe.dev/readspss) 8 | 9 | R package using Rcpp to parse an SPSS file into a data.frame(). 10 | Currently `read.sav` and `read.por` are the main functions and feature 11 | of this package. Writing of SPSS files is provided by `write.por` and 12 | `write.sav`. Writing is limited to uncompressed por and sav files and to 13 | compressed sav file. 14 | 15 | It works. Its read function is extensively tested on approximately 600+ 16 | sav-files and ~100 por-files. The code is maturing and is frequently 17 | tested. The read functions imports everything into a data frame. 18 | Including long strings and labels. Various features such as importing of 19 | value label or missings are tested and are working as intended. 20 | 21 | The package features reading of 22 | 23 | - sav files, 24 | - zsav files, 25 | - encrypted sav files and 26 | - por files 27 | 28 | and (experimental) writing support of (un)compressed 29 | 30 | - sav files and 31 | - zsav files and 32 | - por files. 33 | 34 | Because of the R code wrapped Rcpp functions the package is pretty fast. 35 | The R code for factor conversion slows things down a bit, changing the 36 | encoding a bit more. 37 | 38 | In comparison to `haven` and `foreign` this package preforms pretty 39 | well. It reads more files than each of its predecessors, some are only 40 | readable using `readspss` and it covers a few more cases of missing 41 | values. 42 | 43 | Focus was not so much on winning every benchmark, but reading all 44 | features of an SPSS file and to be as exactly as possible. So some 45 | benchmarks are won and others are lost. It is entirely up to the task. 46 | Besides the data itself `readspss` ships additional information provided 47 | by the SPSS files like the data label, documentation, date and 48 | timestamp. 49 | 50 | Reading of sav and por files is considered feature complete. 51 | 52 | Writing of (un)compressed sav and por files is implemented and 53 | considered working. Unsupported features are reading and writing of 54 | dates (might no longer be true) and writing of long strings. 55 | 56 | ## Installation 57 | 58 | With `remotes`: 59 | 60 | ``` r 61 | remotes::install_github("JanMarvin/readspss") 62 | ``` 63 | 64 | Or from [r-universe](https://r-universe.dev/) with: 65 | 66 | ``` r 67 | options(repos = c( 68 | janmarvin = 'https://janmarvin.r-universe.dev', 69 | CRAN = 'https://cloud.r-project.org')) 70 | install.packages('readspss') 71 | ``` 72 | 73 | ## Usage 74 | 75 | ``` r 76 | fls <- system.file("extdata", "electric.sav", package = "readspss") 77 | flp <- system.file("extdata", "electric.por", package = "readspss") 78 | 79 | df_s <- read.sav(fls) 80 | df_p <- read.por(flp) 81 | 82 | all.equal(df_s, df_p, check.attributes = FALSE) 83 | #> [1] TRUE 84 | ``` 85 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | ```{r setup, include=FALSE} 6 | knitr::opts_chunk$set( 7 | collapse = TRUE, 8 | comment = "#>", 9 | out.width = "100%" 10 | ) 11 | library(readspss) 12 | ``` 13 | 14 | # readspss 15 | ![R-CMD-check](https://github.com/JanMarvin/readspss/workflows/R-CMD-check/badge.svg) [![Codecov test coverage](https://codecov.io/gh/JanMarvin/readspss/branch/main/graph/badge.svg)](https://app.codecov.io/gh/JanMarvin/readspss?branch=main) [![r-universe](https://janmarvin.r-universe.dev/badges/readspss)](https://janmarvin.r-universe.dev/readspss) 16 | 17 | 18 | R package using Rcpp to parse an SPSS file into a data.frame(). Currently 19 | `read.sav` and `read.por` are the main functions and feature of this package. 20 | Writing of SPSS files is provided by `write.por` and `write.sav`. Writing is 21 | limited to uncompressed por and sav files and to compressed sav file. 22 | 23 | It works. Its read function is extensively tested on approximately 600+ 24 | sav-files and ~100 por-files. The code is maturing and is frequently tested. The 25 | read functions imports everything into a data frame. Including long strings and 26 | labels. Various features such as importing of value label or missings are tested 27 | and are working as intended. 28 | 29 | The package features reading of 30 | 31 | * sav files, 32 | * zsav files, 33 | * encrypted sav files and 34 | * por files 35 | 36 | and (experimental) writing support of (un)compressed 37 | 38 | * sav files and 39 | * zsav files and 40 | * por files. 41 | 42 | Because of the R code wrapped Rcpp functions the package is pretty fast. The 43 | R code for factor conversion slows things down a bit, changing the encoding a 44 | bit more. 45 | 46 | In comparison to `haven` and `foreign` this package preforms pretty well. It 47 | reads more files than each of its predecessors, some are only readable using 48 | `readspss` and it covers a few more cases of missing values. 49 | 50 | Focus was not so much on winning every benchmark, but reading all features of 51 | an SPSS file and to be as exactly as possible. So some benchmarks are 52 | won and others are lost. It is entirely up to the task. Besides the data 53 | itself `readspss` ships additional information provided by the SPSS files like 54 | the data label, documentation, date and timestamp. 55 | 56 | Reading of sav and por files is considered feature complete. 57 | 58 | Writing of (un)compressed sav and por files is implemented and considered 59 | working. Unsupported features are reading and writing of dates (might no 60 | longer be true) and writing of long strings. 61 | 62 | ## Installation 63 | 64 | With `remotes`: 65 | ```R 66 | remotes::install_github("JanMarvin/readspss") 67 | ``` 68 | 69 | Or from [r-universe](https://r-universe.dev/) with: 70 | 71 | ``` r 72 | options(repos = c( 73 | janmarvin = 'https://janmarvin.r-universe.dev', 74 | CRAN = 'https://cloud.r-project.org')) 75 | install.packages('readspss') 76 | ``` 77 | 78 | ## Usage 79 | 80 | ```{r} 81 | fls <- system.file("extdata", "electric.sav", package = "readspss") 82 | flp <- system.file("extdata", "electric.por", package = "readspss") 83 | 84 | df_s <- read.sav(fls) 85 | df_p <- read.por(flp) 86 | 87 | all.equal(df_s, df_p, check.attributes = FALSE) 88 | ``` 89 | -------------------------------------------------------------------------------- /src/read_sav_encrypted.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018-2025 Jan Marvin Garbuszus 3 | * Copyright (c) 2013 Ben Pfaff 4 | * 5 | * This program is free software; you can redistribute it and/or modify it 6 | * under the terms of the GNU General Public License as published by the 7 | * Free Software Foundation; either version 2 of the License, or (at your 8 | * option) any later version. 9 | * 10 | * This program is distributed in the hope that it will be useful, but WITHOUT 11 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 13 | * more details. 14 | * 15 | * You should have received a copy of the GNU General Public License along 16 | * with this program. If not, see . 17 | */ 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include 26 | #include 27 | 28 | 29 | #include "spss.h" 30 | #include "read_sav_encrypted.h" 31 | #include "readsav.h" 32 | 33 | 34 | int encryptfile (const char * filePath, std::string &outpath, std::string pass) 35 | { 36 | uint8_t inblock[36]; 37 | char pw[11]; 38 | AES_KEY aes; 39 | 40 | char *argv = (char *)pass.c_str(); 41 | 42 | // currently unused do not know if swapping is required 43 | // bool swapit = false; 44 | 45 | 46 | std::fstream sav(filePath, std::ios::in | std::ios::binary); 47 | if (sav) { 48 | 49 | std::string fileheader(36, '\0'); 50 | fileheader = readstring(fileheader, sav); 51 | 52 | if (fileheader.find("ENCRYPTEDSAV") == std::string::npos) { 53 | Rcpp::stop("The file header indicates that it is not an SPSS sav file."); 54 | } 55 | 56 | /* Read first ciphertext block and use it to verify the password. Try the 57 | password as plaintext first, then try decoding it. */ 58 | 59 | sav.read((char*)inblock, 16); 60 | 61 | if (!init (argv, inblock, &aes) 62 | && !(decode_password (argv, pw) && init (pw, inblock, &aes))) 63 | { 64 | Rprintf ("wrong password, sorry\n"); 65 | return (1); 66 | } else { 67 | // rewind so that a full sav file is returned 68 | sav.seekg(36, std::ios_base::beg); 69 | } 70 | 71 | // file is written into a temp file 72 | const std::string tempstr = ".readspss_enc_tmp_file"; 73 | std::fstream outfile (tempstr, std::ios::out | std::ios::binary); 74 | 75 | /* Decrypt entire input. */ 76 | while (sav.read ((char*)inblock, 16)) 77 | { 78 | uint8_t outblock[16]; 79 | 80 | AES_ecb_encrypt (inblock, outblock, &aes, AES_DECRYPT); 81 | outfile.write((char *)(&outblock[0]), 16); 82 | } 83 | outfile.close(); 84 | 85 | // this is returned as this file is read 86 | outpath = tempstr; 87 | 88 | return 0; 89 | 90 | } else { 91 | return -1; 92 | } 93 | 94 | } 95 | 96 | 97 | //' Read encrypted SPSS file 98 | //' 99 | //' @param filePath The full systempath to the dta file you want to import. 100 | //' @param debug print debug information 101 | //' @param encStr encoding string 102 | //' @param ownEnc encoding provided by localeToCharset 103 | //' @param pass passkey required for encoding 104 | //' @import Rcpp 105 | //' @keywords internal 106 | //' @noRd 107 | // [[Rcpp::export]] 108 | Rcpp::List readencrypted(const char * filePath, const bool debug, 109 | std::string encStr, 110 | std::string const ownEnc, 111 | std::string const pass) { 112 | 113 | std::string outPath; 114 | Rcpp::List df; 115 | 116 | // encrypt the sav-file 117 | if (encryptfile(filePath, outPath, pass) == 0) { 118 | df = readsav(outPath.c_str(), debug, encStr, ownEnc); 119 | 120 | // remove encrypted sav-file 121 | std::remove(outPath.c_str()); 122 | } else { 123 | Rcpp::stop("stopping"); 124 | } 125 | 126 | return df; 127 | } 128 | -------------------------------------------------------------------------------- /src/read_sav_uncompress.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018-2025 Jan Marvin Garbuszus 3 | * 4 | * zlib header information by Evan Miller 5 | * 6 | * This program is free software; you can redistribute it and/or modify it 7 | * under the terms of the GNU General Public License as published by the 8 | * Free Software Foundation; either version 2 of the License, or (at your 9 | * option) any later version. 10 | * 11 | * This program is distributed in the hope that it will be useful, but WITHOUT 12 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 14 | * more details. 15 | * 16 | * You should have received a copy of the GNU General Public License along 17 | * with this program. If not, see . 18 | */ 19 | 20 | #include 21 | #include 22 | #include 23 | 24 | #include "spss.h" 25 | 26 | std::string read_sav_uncompress (std::fstream& sav, 27 | const bool swapit, const uint8_t cflag, 28 | bool debug) { 29 | uint64_t zhead_ofs = 0, ztail_ofs = 0, ztail_len = 0; 30 | 31 | int64_t bias = 0; 32 | int64_t zero = 0; 33 | int32_t block_size = 0; 34 | int32_t n_blocks = 0; 35 | 36 | int64_t uncompr_ofs = 0; 37 | int64_t compr_ofs = 0; 38 | int32_t uncompr_size = 0; 39 | int32_t compr_size = 0; 40 | 41 | // read zheader 42 | zhead_ofs = readbin(zhead_ofs, sav, swapit); 43 | ztail_ofs = readbin(ztail_ofs, sav, swapit); 44 | ztail_len = readbin(ztail_len, sav, swapit); 45 | 46 | 47 | sav.seekg(ztail_ofs, std::ios_base::beg); 48 | 49 | // read ztrailer 50 | bias = readbin(bias, sav, swapit); 51 | zero = readbin(zero, sav, swapit); 52 | block_size = readbin(block_size, sav, swapit); 53 | n_blocks = readbin(n_blocks, sav, swapit); 54 | 55 | std::vector u_ofs(n_blocks), c_ofs(n_blocks); 56 | std::vector u_size(n_blocks), c_size(n_blocks); 57 | 58 | 59 | for (int i = 0; i < n_blocks; ++i) { 60 | // read uncompr and compr ofset and size 61 | u_ofs[i] = readbin(uncompr_ofs, sav, swapit); 62 | c_ofs[i] = readbin(compr_ofs, sav, swapit); 63 | u_size[i] = readbin(uncompr_size, sav, swapit); 64 | c_size[i] = readbin(compr_size, sav, swapit); 65 | 66 | if(debug) { 67 | Rcpp::Rcout << "uofs " << u_ofs[i] << std::endl; 68 | Rcpp::Rcout << "cofs " << c_ofs[i] << std::endl; 69 | Rcpp::Rcout << "usize " << u_size[i] << std::endl; 70 | Rcpp::Rcout << "csize " << c_size[i] << std::endl; 71 | } 72 | } 73 | 74 | if(debug) { 75 | Rcpp::Rcout << "zhead_ofs " << zhead_ofs << 76 | "\nztail_ofs " << ztail_ofs << 77 | "\nztail_len " << ztail_len << 78 | "\nbias " << bias << 79 | "\nzero " << zero << 80 | "\nblock_size " << block_size << 81 | "\nn_blocks " << n_blocks << 82 | std::endl; 83 | } 84 | 85 | 86 | // write to temporary file 87 | const std::string tempstr = ".readspss_unc_tmp_file"; 88 | std::fstream outfile (tempstr, std::ios::out | std::ios::binary); 89 | if (outfile) { 90 | for (int i = 0; i < n_blocks; ++i) { 91 | // seek to compr ofset 92 | sav.seekg(c_ofs[i], std::ios_base::beg); 93 | 94 | // Bytef is unsigned char * 95 | std::vector compr_block(c_size[i], 0); 96 | std::vector uncompr_block(u_size[i], 0); 97 | 98 | // read the complete compr data part 99 | sav.read((char*)&compr_block[0], c_size[i]); 100 | 101 | int32_t status = 0; 102 | uLong uncompr_block_len = u_size[i]; 103 | uLong compr_block_len = c_size[i]; 104 | 105 | // uncompress should be 0 106 | status = uncompress(&uncompr_block[0], &uncompr_block_len, 107 | &compr_block[0], compr_block_len); 108 | 109 | if (status != 0) Rcpp::stop("uncompress failed."); 110 | 111 | outfile.write((char *)(&uncompr_block[0]), uncompr_block_len); 112 | } 113 | if (debug) Rcpp::Rcout << tempstr.c_str() << std::endl; 114 | outfile.close(); 115 | } else { 116 | Rcpp::stop("outfile could not be opend"); 117 | } 118 | 119 | return tempstr; 120 | } 121 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | # Anticonf (tm) script by Jeroen Ooms (2023) 2 | # This script will query 'pkg-config' for the required cflags and ldflags. 3 | # If pkg-config is unavailable or does not find the library, try setting 4 | # INCLUDE_DIR and LIB_DIR manually via e.g: 5 | # R CMD INSTALL --configure-vars='INCLUDE_DIR=/.../include LIB_DIR=/.../lib' 6 | 7 | # Library settings 8 | PKG_CONFIG_NAME="openssl" 9 | PKG_DEB_NAME="libssl-dev" 10 | PKG_RPM_NAME="openssl-devel" 11 | PKG_CSW_NAME="libssl_dev" 12 | PKG_BREW_NAME="openssl" 13 | PKG_TEST_FILE="tools/version.c" 14 | PKG_LIBS="-lssl -lcrypto" 15 | PKG_CFLAGS="" 16 | 17 | # Build against a specific openssl version 18 | # export PKG_CONFIG_PATH="/usr/local/opt/openssl@1.1/lib/pkgconfig" 19 | 20 | # Use pkg-config if available 21 | pkg-config ${PKG_CONFIG_NAME} ${MINVERSION} 2>/dev/null 22 | if [ $? -eq 0 ]; then 23 | PKGCONFIG_CFLAGS=`pkg-config --cflags ${PKG_CONFIG_NAME}` 24 | PKGCONFIG_LIBS=`pkg-config --libs ${PKG_CONFIG_NAME}` 25 | fi 26 | 27 | # Note that cflags may be empty in case of success 28 | if [ "$INCLUDE_DIR" ] || [ "$LIB_DIR" ]; then 29 | echo "Found INCLUDE_DIR and/or LIB_DIR!" 30 | PKG_CFLAGS="-I$INCLUDE_DIR $PKG_CFLAGS" 31 | PKG_LIBS="-L$LIB_DIR $PKG_LIBS" 32 | elif [ "$PKGCONFIG_CFLAGS" ] || [ "$PKGCONFIG_LIBS" ]; then 33 | echo "Found pkg-config cflags and libs!" 34 | PKG_CFLAGS=${PKGCONFIG_CFLAGS} 35 | PKG_LIBS=${PKGCONFIG_LIBS} 36 | elif [ `uname` = "Darwin" ]; then 37 | test ! "$CI" && brew --version 2>/dev/null 38 | if [ $? -eq 0 ]; then 39 | BREWDIR=`brew --prefix` 40 | PKG_CFLAGS="-I$BREWDIR/opt/openssl/include" 41 | PKG_LIBS="-L$BREWDIR/opt/openssl/lib $PKG_LIBS" 42 | else 43 | curl -sfL "https://autobrew.github.io/scripts/$PKG_BREW_NAME" > autobrew 44 | . ./autobrew 45 | fi 46 | fi 47 | 48 | # Fix broken homebrew openssl@3.3.0 49 | if echo "$PKG_LIBS" | grep -q '/opt/homebrew/Cellar/openssl@3/3.3.0 -lssl -lcrypto'; then 50 | PKG_LIBS=$(echo "$PKG_LIBS" | sed 's|/opt/homebrew/Cellar/openssl@3/3.3.0|/opt/homebrew/Cellar/openssl@3/3.3.0/lib|g') 51 | fi 52 | 53 | # Find compiler 54 | CC=`${R_HOME}/bin/R CMD config CC` 55 | CFLAGS=`${R_HOME}/bin/R CMD config CFLAGS` 56 | CPPFLAGS=`${R_HOME}/bin/R CMD config CPPFLAGS` 57 | 58 | # For debugging 59 | echo "Using PKG_CFLAGS=$PKG_CFLAGS" 60 | 61 | # Test configuration 62 | ${CC} ${CPPFLAGS} ${PKG_CFLAGS} ${CFLAGS} -E ${PKG_TEST_FILE} >/dev/null 2>configure.log 63 | 64 | # Customize the error 65 | if [ $? -ne 0 ]; then 66 | echo "--------------------------- [ANTICONF] --------------------------------" 67 | echo "Configuration failed because $PKG_CONFIG_NAME was not found. Try installing:" 68 | echo " * deb: $PKG_DEB_NAME (Debian, Ubuntu, etc)" 69 | echo " * rpm: $PKG_RPM_NAME (Fedora, CentOS, RHEL)" 70 | echo " * csw: $PKG_CSW_NAME (Solaris)" 71 | echo " * brew: $PKG_BREW_NAME (Mac OSX)" 72 | echo "If $PKG_CONFIG_NAME is already installed, check that 'pkg-config' is in your" 73 | echo "PATH and PKG_CONFIG_PATH contains a $PKG_CONFIG_NAME.pc file. If pkg-config" 74 | echo "is unavailable you can set INCLUDE_DIR and LIB_DIR manually via:" 75 | echo "R CMD INSTALL --configure-vars='INCLUDE_DIR=... LIB_DIR=...'" 76 | echo "-------------------------- [ERROR MESSAGE] ---------------------------" 77 | cat configure.log 78 | echo "--------------------------------------------------------------------" 79 | exit 1 80 | fi 81 | 82 | # Try to link against the correct OpenSSL version 83 | if [ -z "$AUTOBREW" ]; then 84 | SONAME=`${CC} -E ${PKG_CFLAGS} src/tests/soname.h | grep 'echo' | sh | xargs` 85 | if [ "$SONAME" ]; then 86 | if [ `uname` = "Darwin" ]; then 87 | PKG_LIBS_VERSIONED=`echo "${PKG_LIBS}" | sed "s/-lssl/-lssl.${SONAME}/" | sed "s/-lcrypto/-lcrypto.${SONAME}/"` 88 | else 89 | PKG_LIBS_VERSIONED=`echo "${PKG_LIBS}" | sed "s/-lssl/-l:libssl.so.${SONAME}/" | sed "s/-lcrypto/-l:libcrypto.so.${SONAME}/"` 90 | fi 91 | 92 | # Test if versioned linking works 93 | ${CC} ${PKG_CFLAGS} src/tests/main.c ${PKG_LIBS_VERSIONED} -o src/main.exe 2>/dev/null 94 | if [ $? -eq 0 ]; then PKG_LIBS="${PKG_LIBS_VERSIONED}"; fi 95 | 96 | # Suppress opensslv3 warnings for now 97 | if [ "$SONAME" = "3" ]; then 98 | PKG_CFLAGS="$PKG_CFLAGS -DOPENSSL_SUPPRESS_DEPRECATED" 99 | fi 100 | 101 | fi #SONAME 102 | fi #AUTOBREW 103 | 104 | echo "Using PKG_LIBS=$PKG_LIBS" 105 | 106 | # Write to Makevars 107 | sed -e "s|@cflags@|$PKG_CFLAGS|" -e "s|@libs@|$PKG_LIBS|" src/Makevars.in > src/Makevars 108 | 109 | # Success 110 | exit 0 111 | -------------------------------------------------------------------------------- /man/read.sav.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/readsav.R 3 | \name{read.sav} 4 | \alias{read.sav} 5 | \title{read.sav} 6 | \usage{ 7 | read.sav( 8 | file, 9 | convert.factors = TRUE, 10 | generate.factors = TRUE, 11 | encoding = TRUE, 12 | fromEncoding = NULL, 13 | use.missings = TRUE, 14 | debug = FALSE, 15 | override = FALSE, 16 | convert.dates = TRUE, 17 | add.rownames = FALSE, 18 | pass 19 | ) 20 | } 21 | \arguments{ 22 | \item{file}{\emph{string} a sav-file to import. can be a file on a computer 23 | or an url. in this case the file will be downloaded and read before it is 24 | used.} 25 | 26 | \item{convert.factors}{\emph{logical} if true numeric or character variables 27 | will be converted into a factor in R.} 28 | 29 | \item{generate.factors}{\emph{logical} function to convert variables with 30 | partial labels into factors. e.g. 1 - low and 5 - high are provided, labels 31 | 2, 3 and 4 will be created. especially useful in combination with 32 | \code{use.missings=TRUE}.} 33 | 34 | \item{encoding}{\emph{logical} shall values be converted? If true, read.sav 35 | will try the charcode stored inside the sav-file. If this value is 2 or not 36 | available, fromEncoding can be used to change encoding.} 37 | 38 | \item{fromEncoding}{\emph{character.} encoding of the imported file. This 39 | information is stored inside the sav-file, but is currently unused. Still 40 | this option can be used to define the initial encoding by hand.} 41 | 42 | \item{use.missings}{\emph{logical} should missing values be converted. 43 | Defaults to TRUE.} 44 | 45 | \item{debug}{\emph{logical} provides additional debug information. Most 46 | likely not useful to any user.} 47 | 48 | \item{override}{\emph{logical}. The filename provided in \code{file} is 49 | checked for the ending sav. If the file ending is different, nothing is read. 50 | This option can be used to override this behavior.} 51 | 52 | \item{convert.dates}{\emph{logical}. Should dates be converted on the fly?} 53 | 54 | \item{add.rownames}{\emph{logical.} If \code{TRUE}, the first column will be 55 | used as rownames. Variable will be dropped afterwards.} 56 | 57 | \item{pass}{\emph{character}. If encrypted sav should be imported, this is a 58 | maximum of ten character encryption key.} 59 | } 60 | \value{ 61 | \code{readspss} returns a data.frame with additional attributes 62 | \itemize{ 63 | \item \emph{row.names} rownames 64 | \item \emph{names} colnames 65 | \item \emph{datalabel} datalabel 66 | \item \emph{datestamp} datestamp 67 | \item \emph{timestamp} timestamp 68 | \item \emph{filelabel} filelabel 69 | \item \emph{class} data.frame 70 | \item \emph{vtype} SPSS type 0 is usually a numeric/integer 71 | \item \emph{disppar} matrix of display parameters if available 72 | \item \emph{missings} a list containing information about the missing variables. if 73 | \code{use.missings=TRUE} this Information will be used to generate missings. 74 | \item \emph{haslabel} list of variables that contain labels 75 | \item \emph{longstring} character vector of long strings if any in file 76 | \item \emph{longmissing} character vector of missings in longstrings if any 77 | \item \emph{longlabel} character vector of long labels 78 | \item \emph{cflag} 0 if uncompressed, 1 if compressed 79 | \item \emph{endian} 2 or 3 if little endian else 0 80 | \item \emph{compression} compression similar to cflag, somehow stored twice in the 81 | sav file 82 | \item \emph{doc} list containing documentation information if any 83 | \item \emph{charcode} encoding string most likely 2 is CP1252 84 | \item \emph{encoding} sometimes sav-file contain encoding as a extra string 85 | \item \emph{ownEnc} encoding of the R-session 86 | \item \emph{doenc} was the file supposed to be encoded? 87 | \item \emph{autoenc} was encoding applied to the file? 88 | \item \emph{swapit} were the bytes swapped? 89 | \item \emph{totals} character string of totals if any 90 | \item \emph{dataview} xml file how the data should be printed 91 | \item \emph{extraproduct} additional string provided 92 | \item \emph{label} list containing label value information 93 | \item \emph{varmatrix} a matrix with information how the data is stored 94 | \item \emph{var.label} variable labels 95 | \item \emph{lmissings} missings table if any in longstrings 96 | } 97 | } 98 | \description{ 99 | Function to read a SPSS sav file into a data.frame(). 100 | } 101 | \details{ 102 | SPSS files are widely available, though for R long time only foreign 103 | and memisc provided functions to import sav-files. Lately haven joined. 104 | This package is an approach to offer another alternative, to document the 105 | sav-format and provide additional options to import the data. 106 | sav-files are stored most exclusively as numerics only in compression mode 107 | are some integers stored as integers. Still they are returned as numerics. 108 | } 109 | \note{ 110 | Information to decrypt the sav-format was provided by tda 111 | \href{http://www.stat.ruhr-uni-bochum.de/tda.html}{www.stat.rub.de/tda.html} and 112 | pspp \href{http://www.gnu.org/software/pspp/}{www.gnu.org/software/pspp/} 113 | } 114 | \examples{ 115 | fl <- system.file("extdata", "electric.sav", package = "readspss") 116 | dd <- read.sav(fl) 117 | 118 | } 119 | \seealso{ 120 | \code{\link[foreign]{read.spss}}, \code{memisc} and 121 | \code{haven}. 122 | } 123 | -------------------------------------------------------------------------------- /src/write_sav_compress.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2019-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | #include 25 | 26 | #include "spss.h" 27 | 28 | 29 | void write_sav_compress (std::fstream& sav, const std::string tempstr, 30 | const bool swapit, bool debug) { 31 | 32 | // open zsav destination 33 | if (sav.is_open()) 34 | { 35 | 36 | // offset positions 37 | uint64_t zhead_ofs = 0, ztail_ofs = 0, ztail_len = 0; 38 | 39 | 40 | std::fstream tmp (tempstr, std::ios::in | std::ios::binary); 41 | if (!tmp.is_open()) Rcpp::stop("tmp not open"); 42 | 43 | // temporary sav file to be removed afterwards 44 | tmp.seekg(0, std::ios_base::beg); 45 | size_t curpos = tmp.tellg(); 46 | tmp.seekg(0, std::ios_base::end); 47 | size_t savlen = tmp.tellg(); 48 | tmp.seekg(0, std::ios_base::beg); 49 | 50 | int64_t bias = -100, zero = 0; 51 | int32_t block_size = 4190208; // bytes 52 | int32_t n_blocks = ceil((double)savlen/block_size); 53 | 54 | if (debug) 55 | Rcpp::Rcout << savlen << " " << curpos << " " << n_blocks << std::endl; 56 | 57 | int64_t uncompr_ofs = 0, compr_ofs = 0; 58 | int32_t uncompr_size = block_size, compr_size = compressBound(uncompr_size); 59 | 60 | std::vector u_ofs(n_blocks), c_ofs(n_blocks); 61 | std::vector u_size(n_blocks), c_size(n_blocks); 62 | 63 | // zlib header 64 | 65 | // write zheader will be replaced with actual values once they are 66 | // known after writing the entire file. 67 | zhead_ofs = sav.tellg(); // is initial u_ofs 68 | 69 | writebin(zhead_ofs, sav, swapit); 70 | writebin(ztail_ofs, sav, swapit); 71 | writebin(ztail_len, sav, swapit); 72 | 73 | // compress sav 74 | for (int32_t i = 0; i < n_blocks; ++i) { 75 | 76 | // modify chunk size for last chunk 77 | if (i == (n_blocks-1)) { 78 | int64_t len = savlen - (block_size * (n_blocks -1)); 79 | 80 | uncompr_size = len; 81 | compr_size = compressBound(uncompr_size); 82 | } 83 | 84 | uncompr_ofs = zhead_ofs; 85 | if (i > 0) uncompr_ofs = u_ofs[i-1] + u_size[i-1]; 86 | compr_ofs = sav.tellg(); 87 | 88 | int32_t status = 0; 89 | uLong uncompr_block_len = uncompr_size; 90 | uLong compr_block_len = compr_size; 91 | 92 | // Bytef is unsigned char * 93 | std::vector uncompr_block(uncompr_size); 94 | std::vector compr_block(compr_size); 95 | 96 | // read the uncompr data part 97 | tmp.read((char*)(&uncompr_block[0]), uncompr_size); 98 | 99 | // uncompress should be 0 100 | status = compress2(&compr_block[0], &compr_block_len, 101 | &uncompr_block[0], uncompr_block_len, 102 | Z_DEFAULT_COMPRESSION); 103 | 104 | if (status != Z_OK) Rcpp::stop("compression failed."); 105 | 106 | 107 | if (debug) 108 | Rcpp::Rcout << "uncompressed: " << uncompr_block_len << 109 | "\ncompressed: " << compr_block_len << std::endl; 110 | 111 | sav.write((char *)(&compr_block[0]), compr_block_len); 112 | 113 | // export 114 | u_ofs[i] = uncompr_ofs; 115 | c_ofs[i] = compr_ofs; 116 | u_size[i] = uncompr_block_len; 117 | c_size[i] = compr_block_len; 118 | } 119 | 120 | // zlib trailer 121 | 122 | ztail_ofs = sav.tellg(); 123 | 124 | writebin(bias, sav, swapit); 125 | writebin(zero, sav, swapit); 126 | writebin(block_size, sav, swapit); 127 | writebin(n_blocks, sav, swapit); 128 | 129 | // write uncompr and compr ofset and size 130 | for (int32_t i = 0; i < n_blocks; ++i) { 131 | writebin(u_ofs[i], sav, swapit); 132 | writebin(c_ofs[i], sav, swapit); 133 | writebin(u_size[i], sav, swapit); 134 | writebin(c_size[i], sav, swapit); 135 | } 136 | 137 | /* get ztail_len */ 138 | uint64_t ztail_end = sav.tellg(); 139 | ztail_len = ztail_end - ztail_ofs; 140 | 141 | /* write ztail_ofs and ztail_len */ 142 | sav.seekg(zhead_ofs, sav.beg); 143 | writebin(zhead_ofs, sav, swapit); 144 | writebin(ztail_ofs, sav, swapit); 145 | writebin(ztail_len, sav, swapit); 146 | 147 | 148 | if(debug) { 149 | Rcpp::Rcout << "zhead_ofs " << zhead_ofs << "\n" << 150 | "ztail_ofs " << ztail_ofs << "\n" << 151 | "ztail_len " << ztail_len << "\n" << std::endl; 152 | } 153 | 154 | tmp.close(); 155 | 156 | } else { 157 | Rcpp::stop("sav file is unexpectedly closed"); 158 | } 159 | 160 | } 161 | -------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | #ifdef RCPP_USE_GLOBAL_ROSTREAM 9 | Rcpp::Rostream& Rcpp::Rcout = Rcpp::Rcpp_cout_get(); 10 | Rcpp::Rostream& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get(); 11 | #endif 12 | 13 | // boost_split 14 | Rcpp::CharacterVector boost_split(std::string val_s); 15 | RcppExport SEXP _readspss_boost_split(SEXP val_sSEXP) { 16 | BEGIN_RCPP 17 | Rcpp::RObject rcpp_result_gen; 18 | Rcpp::RNGScope rcpp_rngScope_gen; 19 | Rcpp::traits::input_parameter< std::string >::type val_s(val_sSEXP); 20 | rcpp_result_gen = Rcpp::wrap(boost_split(val_s)); 21 | return rcpp_result_gen; 22 | END_RCPP 23 | } 24 | // fast_factor 25 | SEXP fast_factor(SEXP x, SEXP y); 26 | RcppExport SEXP _readspss_fast_factor(SEXP xSEXP, SEXP ySEXP) { 27 | BEGIN_RCPP 28 | Rcpp::RObject rcpp_result_gen; 29 | Rcpp::RNGScope rcpp_rngScope_gen; 30 | Rcpp::traits::input_parameter< SEXP >::type x(xSEXP); 31 | Rcpp::traits::input_parameter< SEXP >::type y(ySEXP); 32 | rcpp_result_gen = Rcpp::wrap(fast_factor(x, y)); 33 | return rcpp_result_gen; 34 | END_RCPP 35 | } 36 | // readencrypted 37 | Rcpp::List readencrypted(const char * filePath, const bool debug, std::string encStr, std::string const ownEnc, std::string const pass); 38 | RcppExport SEXP _readspss_readencrypted(SEXP filePathSEXP, SEXP debugSEXP, SEXP encStrSEXP, SEXP ownEncSEXP, SEXP passSEXP) { 39 | BEGIN_RCPP 40 | Rcpp::RObject rcpp_result_gen; 41 | Rcpp::RNGScope rcpp_rngScope_gen; 42 | Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); 43 | Rcpp::traits::input_parameter< const bool >::type debug(debugSEXP); 44 | Rcpp::traits::input_parameter< std::string >::type encStr(encStrSEXP); 45 | Rcpp::traits::input_parameter< std::string const >::type ownEnc(ownEncSEXP); 46 | Rcpp::traits::input_parameter< std::string const >::type pass(passSEXP); 47 | rcpp_result_gen = Rcpp::wrap(readencrypted(filePath, debug, encStr, ownEnc, pass)); 48 | return rcpp_result_gen; 49 | END_RCPP 50 | } 51 | // readpor 52 | Rcpp::List readpor(const char * filePath, const bool debug, std::string encStr, bool override); 53 | RcppExport SEXP _readspss_readpor(SEXP filePathSEXP, SEXP debugSEXP, SEXP encStrSEXP, SEXP overrideSEXP) { 54 | BEGIN_RCPP 55 | Rcpp::RObject rcpp_result_gen; 56 | Rcpp::RNGScope rcpp_rngScope_gen; 57 | Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); 58 | Rcpp::traits::input_parameter< const bool >::type debug(debugSEXP); 59 | Rcpp::traits::input_parameter< std::string >::type encStr(encStrSEXP); 60 | Rcpp::traits::input_parameter< bool >::type override(overrideSEXP); 61 | rcpp_result_gen = Rcpp::wrap(readpor(filePath, debug, encStr, override)); 62 | return rcpp_result_gen; 63 | END_RCPP 64 | } 65 | // readsav 66 | Rcpp::List readsav(const char * filePath, const bool debug, std::string encStr, std::string const ownEnc); 67 | RcppExport SEXP _readspss_readsav(SEXP filePathSEXP, SEXP debugSEXP, SEXP encStrSEXP, SEXP ownEncSEXP) { 68 | BEGIN_RCPP 69 | Rcpp::RObject rcpp_result_gen; 70 | Rcpp::RNGScope rcpp_rngScope_gen; 71 | Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); 72 | Rcpp::traits::input_parameter< const bool >::type debug(debugSEXP); 73 | Rcpp::traits::input_parameter< std::string >::type encStr(encStrSEXP); 74 | Rcpp::traits::input_parameter< std::string const >::type ownEnc(ownEncSEXP); 75 | rcpp_result_gen = Rcpp::wrap(readsav(filePath, debug, encStr, ownEnc)); 76 | return rcpp_result_gen; 77 | END_RCPP 78 | } 79 | // writepor 80 | void writepor(const char * filePath, Rcpp::DataFrame dat); 81 | RcppExport SEXP _readspss_writepor(SEXP filePathSEXP, SEXP datSEXP) { 82 | BEGIN_RCPP 83 | Rcpp::RNGScope rcpp_rngScope_gen; 84 | Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); 85 | Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); 86 | writepor(filePath, dat); 87 | return R_NilValue; 88 | END_RCPP 89 | } 90 | // writesav 91 | void writesav(const char * filePath, Rcpp::DataFrame dat, uint8_t compress, bool debug, bool is_zsav); 92 | RcppExport SEXP _readspss_writesav(SEXP filePathSEXP, SEXP datSEXP, SEXP compressSEXP, SEXP debugSEXP, SEXP is_zsavSEXP) { 93 | BEGIN_RCPP 94 | Rcpp::RNGScope rcpp_rngScope_gen; 95 | Rcpp::traits::input_parameter< const char * >::type filePath(filePathSEXP); 96 | Rcpp::traits::input_parameter< Rcpp::DataFrame >::type dat(datSEXP); 97 | Rcpp::traits::input_parameter< uint8_t >::type compress(compressSEXP); 98 | Rcpp::traits::input_parameter< bool >::type debug(debugSEXP); 99 | Rcpp::traits::input_parameter< bool >::type is_zsav(is_zsavSEXP); 100 | writesav(filePath, dat, compress, debug, is_zsav); 101 | return R_NilValue; 102 | END_RCPP 103 | } 104 | 105 | static const R_CallMethodDef CallEntries[] = { 106 | {"_readspss_boost_split", (DL_FUNC) &_readspss_boost_split, 1}, 107 | {"_readspss_fast_factor", (DL_FUNC) &_readspss_fast_factor, 2}, 108 | {"_readspss_readencrypted", (DL_FUNC) &_readspss_readencrypted, 5}, 109 | {"_readspss_readpor", (DL_FUNC) &_readspss_readpor, 4}, 110 | {"_readspss_readsav", (DL_FUNC) &_readspss_readsav, 4}, 111 | {"_readspss_writepor", (DL_FUNC) &_readspss_writepor, 2}, 112 | {"_readspss_writesav", (DL_FUNC) &_readspss_writesav, 5}, 113 | {NULL, NULL, 0} 114 | }; 115 | 116 | RcppExport void R_init_readspss(DllInfo *dll) { 117 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 118 | R_useDynamicSymbols(dll, FALSE); 119 | } 120 | -------------------------------------------------------------------------------- /R/writepor.R: -------------------------------------------------------------------------------- 1 | #' write.por 2 | #' 3 | #' Function to write an SPSS por file. Returns an por file that read.por can 4 | #' read as well as SPSS can. Other packages as foreign, memisc and haven might 5 | #' fail (fail reading or return wrong values). 6 | #' @param dat _data.frame_ a data.frame to export as por-file. 7 | #' @param filepath _string_ full path where and how this file should be 8 | #' stored 9 | #' @param label _character_ vector of labels. must be of size `ncol(dat)` 10 | #' @param add.rownames _logical_ If `TRUE`, a new variable rownames 11 | #' will be added to the por-file. 12 | #' @param convert.factors _logical_ If `TRUE`, factors will be converted to 13 | #' SPSS variables with labels. 14 | #' SPSS expects strings to be encoded as Windows-1252, so all levels will be 15 | #' recoded. Character which can not be mapped in Windows-1252 will be saved as 16 | #' hexcode. 17 | #' @param toEncoding _character_ encoding used for the por file. SPSS itself 18 | #' claims to have problems with unicode and por files, so "CP1252" is the 19 | #' default. 20 | #' @param convert.dates _logical_ should dates be converted to SPSS format 21 | #' @param tz _character_ The name of the timezone convert.dates will use. 22 | #' @details Strings longer than 255 chars are not provided. 23 | #' File will be stored using "CP1252" encoding. 24 | #' 25 | #' @return `write.por` returns nothing 26 | #' 27 | #' @export 28 | write.por <- function(dat, filepath, label, add.rownames = FALSE, 29 | convert.factors = TRUE, toEncoding = "CP1252", 30 | convert.dates = TRUE, tz = "GMT") { 31 | 32 | filepath <- path.expand(filepath) 33 | 34 | if (missing(filepath)) 35 | stop("need a path") 36 | 37 | attrlab <- attr(dat, "label") 38 | 39 | if (identical(attrlab, character(0))) 40 | attrlab <- NULL 41 | 42 | 43 | if (missing(label) && is.null(attrlab)) 44 | label <- "" 45 | 46 | if (missing(label) && !is.null(attrlab)) 47 | label <- attrlab 48 | 49 | if (!identical(label, "") && (length(label) != ncol(dat))) 50 | stop("label and ncols differ. each col needs a label") 51 | 52 | if (any(nchar(label)) > 255) 53 | stop("longlabels not yet implemented") 54 | 55 | if (add.rownames) { 56 | dat <- data.frame(rownames = rownames(dat), 57 | dat, stringsAsFactors = FALSE) 58 | } 59 | 60 | nams <- names(dat) 61 | 62 | nams <- toupper(nams) 63 | nvarnames <- substr(nams, 0, 8) 64 | names(dat) <- nvarnames 65 | 66 | 67 | if (convert.factors) { 68 | # If our data.frame contains factors, we create a label.table 69 | factors <- which(sapply(dat, is.factor)) 70 | f.names <- attr(factors, "names") 71 | 72 | label.table <- vector("list", length(f.names)) 73 | names(label.table) <- f.names 74 | 75 | i <- 0 76 | for (v in factors) { 77 | i <- i + 1 78 | f.levels <- levels(dat[[v]]) 79 | f.labels <- as.integer(labels(levels(dat[[v]]))) 80 | attr(f.labels, "names") <- f.levels 81 | f.labels <- f.labels[names(f.labels) != ".."] 82 | label.table[[(f.names[i])]] <- f.labels 83 | } 84 | attr(dat, "labtab") <- rev(label.table) 85 | } else { 86 | attr(dat, "labtab") <- NULL 87 | } 88 | 89 | vtyp <- as.integer(sapply(dat, is.character)) 90 | vtyp[vtyp != 0] <- as.integer(sapply(dat[vtyp != 0], 91 | function(x) max(nchar(x), na.rm = TRUE))) 92 | 93 | ff <- which(sapply(dat, is.factor)) 94 | 95 | if (identical(unname(ff), integer(0))) 96 | ff <- unname(ff) 97 | 98 | if (any(vtyp > 255)) { 99 | stop("Strings longer than 255 characters not yet implemented") 100 | } 101 | 102 | vtyp <- ceiling(vtyp / 8) * 8 103 | 104 | fun <- function(vec) { 105 | 106 | vartypes <- NULL 107 | for (i in seq_along(vec)) { 108 | 109 | val <- vtyp[i] 110 | 111 | if (val <= 8) { 112 | vartypes <- c(vartypes, val) 113 | } else { 114 | vartypes <- c(vartypes, c(val, rep(-1, (val / 8 - 1)))) 115 | } 116 | } 117 | 118 | vartypes 119 | 120 | } 121 | 122 | vartypes <- fun(vtyp) 123 | 124 | systime <- Sys.time() 125 | timestamp <- gsub(pattern = ":", replacement = "", 126 | x = substr(systime, 12, 19)) 127 | datestamp <- format(Sys.Date(), "%Y%m%d") 128 | 129 | 130 | itc <- rep(0, NCOL(dat)) 131 | 132 | cc <- sapply(dat, is.character) 133 | 134 | isint <- sapply(dat, function(x) { 135 | is.numeric(x) & is.integer(x) 136 | }) 137 | 138 | vartypen <- sapply(dat, function(x) class(x)[[1]]) 139 | vartyp <- NA 140 | vartyp[vartypen == "numeric" | vartypen == "integer" | 141 | vartypen == "factor"] <- 0 142 | vartyp[vartypen == "character"] <- 1 143 | vartyp[vartypen == "Date"] <- 20 144 | vartyp[vartypen == "POSIXct"] <- 22 145 | 146 | if (convert.dates) { 147 | dates <- which(sapply(dat, 148 | function(x) inherits(x, "Date")) 149 | ) 150 | for (v in dates) 151 | dat[[v]] <- as.vector( 152 | julian(dat[[v]], as.Date("1582-10-14", tz = "GMT")) * 24 * 60 * 60 153 | ) 154 | dates <- which( 155 | sapply(dat, function(x) inherits(x, "POSIXt")) 156 | ) 157 | for (v in dates) 158 | dat[[v]] <- as.vector( 159 | round(julian(dat[[v]], ISOdate(1582, 10, 14, tz = tz))) * 24 * 60 * 60 160 | ) 161 | } 162 | 163 | 164 | 165 | attr(dat, "vtyp") <- vtyp 166 | attr(dat, "vartyp") <- vartyp 167 | attr(dat, "vartypes") <- vartypes 168 | attr(dat, "nvarnames") <- nvarnames 169 | attr(dat, "timestamp") <- timestamp 170 | attr(dat, "datestamp") <- datestamp 171 | attr(dat, "label") <- label 172 | attr(dat, "haslabel") <- ff 173 | attr(dat, "itc") <- itc 174 | attr(dat, "cc") <- cc 175 | attr(dat, "isint") <- isint 176 | attr(dat, "toEncoding") <- toEncoding 177 | 178 | writepor(filepath, dat) 179 | } 180 | -------------------------------------------------------------------------------- /src/read_sav_encrypted.h: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (c) 2013 Ben Pfaff. 3 | * 4 | * Permission is hereby granted, free of charge, to any person obtaining a copy 5 | * of this software and associated documentation files (the "Software"), to 6 | * deal in the Software without restriction, including without limitation the 7 | * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or 8 | * sell copies of the Software, and to permit persons to whom the Software is 9 | * furnished to do so, subject to the following conditions: 10 | * 11 | * The above copyright notice and this permission notice shall be included in 12 | * all copies or substantial portions of the Software. 13 | * 14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 19 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 20 | * IN THE SOFTWARE. 21 | */ 22 | 23 | #include 24 | #include 25 | 26 | #include 27 | #include 28 | 29 | #define Rf_assert(cond) if (!(cond)) Rcpp::stop(#cond); 30 | 31 | /* Initializes AES from PASSWORD. Returns true if CIPHERTEXT is the first 32 | ciphertext block in an encrypted .sav file for PASSWORD, false if PASSWORD 33 | is wrong. */ 34 | static bool 35 | init(const char *password, const uint8_t ciphertext[16], AES_KEY *aes) 36 | { 37 | /* NIST SP 800-108 fixed data. */ 38 | static const uint8_t fixed[] = { 39 | /* i */ 40 | 0x00, 0x00, 0x00, 0x01, 41 | 42 | /* label */ 43 | 0x35, 0x27, 0x13, 0xcc, 0x53, 0xa7, 0x78, 0x89, 44 | 0x87, 0x53, 0x22, 0x11, 0xd6, 0x5b, 0x31, 0x58, 45 | 0xdc, 0xfe, 0x2e, 0x7e, 0x94, 0xda, 0x2f, 0x00, 46 | 0xcc, 0x15, 0x71, 0x80, 0x0a, 0x6c, 0x63, 0x53, 47 | 48 | /* delimiter */ 49 | 0x00, 50 | 51 | /* context */ 52 | 0x38, 0xc3, 0x38, 0xac, 0x22, 0xf3, 0x63, 0x62, 53 | 0x0e, 0xce, 0x85, 0x3f, 0xb8, 0x07, 0x4c, 0x4e, 54 | 0x2b, 0x77, 0xc7, 0x21, 0xf5, 0x1a, 0x80, 0x1d, 55 | 0x67, 0xfb, 0xe1, 0xe1, 0x83, 0x07, 0xd8, 0x0d, 56 | 57 | /* L */ 58 | 0x00, 0x00, 0x01, 0x00, 59 | }; 60 | 61 | char padded_password[32]; 62 | uint8_t plaintext[16]; 63 | size_t password_len; 64 | uint8_t cmac[16]; 65 | uint8_t key[32]; 66 | size_t cmac_len; 67 | CMAC_CTX *ctx; 68 | int retval; 69 | 70 | /* Truncate password to at most 10 bytes. */ 71 | password_len = strlen (password); 72 | if (password_len > 10) 73 | password_len = 10; 74 | 75 | /* padded_password = password padded with zeros to 32 bytes. */ 76 | memset (padded_password, 0, sizeof padded_password); 77 | memcpy (padded_password, password, password_len); 78 | 79 | /* cmac = CMAC(padded_password, fixed). */ 80 | ctx = CMAC_CTX_new (); 81 | Rf_assert (ctx != NULL); 82 | 83 | retval = CMAC_Init (ctx, padded_password, sizeof padded_password, 84 | EVP_aes_256_cbc (), NULL); 85 | Rf_assert (retval == 1); 86 | 87 | retval = CMAC_Update (ctx, fixed, sizeof fixed); 88 | Rf_assert (retval == 1); 89 | 90 | cmac_len = sizeof cmac; 91 | retval = CMAC_Final (ctx, cmac, &cmac_len); 92 | Rf_assert (retval == 1); 93 | Rf_assert (cmac_len == 16); 94 | 95 | /* The key is the cmac repeated twice. */ 96 | memcpy(key, cmac, 16); 97 | memcpy(key + 16, cmac, 16); 98 | 99 | /* Use key to initialize AES. */ 100 | Rf_assert (sizeof key == 32); 101 | retval = AES_set_decrypt_key (key, sizeof key * 8, aes); 102 | Rf_assert (retval >= 0); 103 | 104 | /* Check for magic number "$FL" always present in SPSS .sav file. */ 105 | AES_ecb_encrypt (ciphertext, plaintext, aes, AES_DECRYPT); 106 | return !memcmp (plaintext, "$FL", 3); 107 | } 108 | 109 | /* Password decoding. */ 110 | 111 | #define b(x) (1 << (x)) 112 | 113 | static const uint16_t m0[4][2] = { 114 | { b(2), b(2) | b(3) | b(6) | b(7) }, 115 | { b(3), b(0) | b(1) | b(4) | b(5) }, 116 | { b(4) | b(7), b(8) | b(9) | b(12) | b(14) }, 117 | { b(5) | b(6), b(10) | b(11) | b(14) | b(15) }, 118 | }; 119 | 120 | static const uint16_t m1[4][2] = { 121 | { b(0) | b(3) | b(12) | b(15), b(0) | b(1) | b(4) | b(5) }, 122 | { b(1) | b(2) | b(13) | b(14), b(2) | b(3) | b(6) | b(7) }, 123 | { b(4) | b(7) | b(8) | b(11), b(8) | b(9) | b(12) | b(13) }, 124 | { b(5) | b(6) | b(9) | b(10), b(10) | b(11) | b(14) | b(15) }, 125 | }; 126 | 127 | static const uint16_t m2[4][2] = { 128 | { b(2), b(1) | b(3) | b(9) | b(11) }, 129 | { b(3), b(0) | b(2) | b(8) | b(10) }, 130 | { b(4) | b(7), b(4) | b(6) | b(12) | b(14) }, 131 | { b(5) | b(6), b(5) | b(7) | b(13) | b(15) }, 132 | }; 133 | 134 | static const uint16_t m3[4][2] = { 135 | { b(0) | b(3) | b(12) | b(15), b(0) | b(2) | b(8) | b(10) }, 136 | { b(1) | b(2) | b(13) | b(14), b(1) | b(3) | b(9) | b(11) }, 137 | { b(4) | b(7) | b(8) | b(11), b(4) | b(6) | b(12) | b(14) }, 138 | { b(5) | b(6) | b(9) | b(10), b(5) | b(7) | b(13) | b(15) }, 139 | }; 140 | 141 | static int 142 | decode_nibble (const uint16_t table[4][2], int nibble) 143 | { 144 | int i; 145 | 146 | for (i = 0; i < 4; i++) { 147 | if (table[i][0] & (1 << nibble)) 148 | return table[i][1]; 149 | } 150 | 151 | return 0; 152 | } 153 | 154 | /* Returns true if X has exactly one 1-bit, false otherwise. */ 155 | static bool 156 | is_pow2 (int x) 157 | { 158 | return x && (x & (x - 1)) == 0; 159 | } 160 | 161 | /* If X has exactly one 1-bit, returns its index, where bit 0 is the LSB. 162 | Otherwise, returns 0. */ 163 | static int 164 | find_1bit (uint16_t x) 165 | { 166 | int i; 167 | 168 | if (!is_pow2 (x)) 169 | return -1; 170 | 171 | for (i = 0; i < 16; i++) 172 | if (x & (1u << i)) 173 | return i; 174 | 175 | // hopefully never reachead. was abort(), but 176 | // check() does not like that 177 | return 0; 178 | } 179 | 180 | /* Attempts to decode a pair of encoded password characters A and B into a 181 | single byte of the plaintext password. Returns 0 if A and B are not a valid 182 | encoded password pair, otherwise a byte of the plaintext password. */ 183 | static int 184 | decode_password_2bytes (uint8_t a, uint8_t b) 185 | { 186 | int x = find_1bit (decode_nibble (m0, a >> 4) & decode_nibble (m2, b >> 4)); 187 | int y = find_1bit (decode_nibble (m1, a & 15) & decode_nibble (m3, b & 15)); 188 | return x < 0 || y < 0 ? 0 : (x << 4) | y; 189 | } 190 | 191 | /* Decodes an SPSS so-called "encrypted" password INPUT into OUTPUT. 192 | 193 | An encoded password is always an even number of bytes long and no longer 194 | than 20 bytes. A decoded password is never longer than 10 bytes plus a null 195 | terminator. 196 | 197 | Returns true if successful, otherwise false. */ 198 | static bool 199 | decode_password (const char *input, char output[11]) 200 | { 201 | size_t len; 202 | 203 | len = strlen (input); 204 | if (len > 20 || len % 2) 205 | return false; 206 | 207 | for (; *input; input += 2) 208 | { 209 | int c = decode_password_2bytes (input[0], input[1]); 210 | if (!c) 211 | return false; 212 | *output++ = c; 213 | } 214 | *output = '\0'; 215 | 216 | return true; 217 | } 218 | -------------------------------------------------------------------------------- /src/writepor.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2014-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | #include 24 | 25 | #include "spss.h" 26 | 27 | //' writes the binary SPSS file 28 | //' 29 | //' @param filePath The full systempath to the dta file you want to import. 30 | //' @param dat the data frame 31 | //' @import Rcpp 32 | //' @keywords internal 33 | //' @noRd 34 | // [[Rcpp::export]] 35 | void writepor(const char * filePath, Rcpp::DataFrame dat) 36 | { 37 | 38 | int32_t k = dat.size(); 39 | int64_t n = dat.nrows(); 40 | 41 | 42 | std::fstream por(filePath, std::ios::out | std::ios::binary); 43 | if (por.is_open()) 44 | { 45 | 46 | bool debug = false; 47 | 48 | 49 | Rcpp::CharacterVector nvarnames = dat.attr("nvarnames"); 50 | Rcpp::IntegerVector vartyp = dat.attr("vartyp"); 51 | Rcpp::IntegerVector vartypes = dat.attr("vartypes"); 52 | Rcpp::IntegerVector vtyp = dat.attr("vtyp"); 53 | std::string timestamp = Rcpp::as(dat.attr("timestamp")); 54 | std::string datestamp = Rcpp::as(dat.attr("datestamp")); 55 | std::string toEncoding = Rcpp::as(dat.attr("toEncoding")); 56 | Rcpp::CharacterVector label = dat.attr("label"); 57 | 58 | Rcpp::IntegerVector haslabel = dat.attr("haslabel"); 59 | Rcpp::List labtabs = dat.attr("labtab"); 60 | 61 | int nolabtab = 0; 62 | 63 | 64 | std::string file = 65 | "ASCII SPSS PORT FILE SCII SPSS PORT FILE" 66 | "ASCII SPSS PORT FILE SCII SPSS PORT FILE" 67 | "ASCII SPSS PORT FILE SCII SPSS PORT FILE" 68 | "ASCII SPSS PORT FILE SCII SPSS PORT FILE" 69 | "ASCII SPSS PORT FILE SCII SPSS PORT FILE" 70 | "0000000000000000000000000000000000000000" 71 | "0000000000000000000000000123456789ABCDEF" 72 | "GHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrst" 73 | "uvwxyz .<(+0&[]!$*);^-/|,%_>?`:#@'=\"000" 74 | "000~000000000000000000000{}\\00000000000" 75 | "0000000000000000000000000000000000000000" 76 | "000000000000000000SPSSPORT"; 77 | 78 | 79 | file += "A"; // vers 80 | 81 | file += writestr(datestamp, 0); 82 | 83 | file += writestr(timestamp, 0); 84 | 85 | file += "1"; 86 | 87 | file += writestr("readspss reads and writes por files", 0); 88 | 89 | file += "4"; // varrec 90 | file += pnum1(k); 91 | file += "/"; 92 | 93 | file += "5"; // prec 94 | file += pnum1(11); 95 | file += "/"; 96 | 97 | 98 | for (int i = 0; i < k; ++i) { 99 | 100 | if (debug) 101 | Rcpp::Rcout << "--- 7 ---" << std::endl; 102 | 103 | file += "7"; //var 104 | 105 | int vartypi = vtyp(i); 106 | int isdate = vartyp(i); 107 | 108 | std::string nvarname = Rcpp::as(nvarnames(i)); 109 | 110 | file += pnum1(vartypi); 111 | file += "/"; 112 | 113 | file += writestr(nvarname, 0); 114 | 115 | int pfmt1 = 0, wfmt1 = 0; 116 | int pfmt2 = 8, wfmt2 = 8; 117 | int pfmt3 = 0, wfmt3 = 0; 118 | 119 | if (vartypi == 0) { 120 | pfmt1 = isdate, wfmt1 = isdate; 121 | pfmt2 = 8, wfmt2 = 8; 122 | pfmt3 = 2, wfmt3 = 2; 123 | } else { 124 | pfmt1 = 1, wfmt1 = 1; 125 | pfmt2 = 8, wfmt2 = 8; 126 | pfmt3 = 0, wfmt3 = 0; 127 | } 128 | 129 | 130 | // printfmt 131 | file += pnum1(pfmt1); 132 | file += "/"; 133 | 134 | file += pnum1(pfmt2); 135 | file += "/"; 136 | 137 | file += pnum1(pfmt3); 138 | file += "/"; 139 | 140 | // writefmt 141 | file += pnum1(wfmt1); 142 | file += "/"; 143 | 144 | file += pnum1(wfmt2); 145 | file += "/"; 146 | 147 | file += pnum1(wfmt3); 148 | file += "/"; 149 | 150 | 151 | if (!Rf_isNull(label) && (Rf_length(label) == k )) { 152 | 153 | if (debug) 154 | Rcpp::Rcout << "--- C ---" << std::endl; 155 | 156 | file += "C"; //var 157 | 158 | std::string lab = Rcpp::as(label(i)); 159 | 160 | file += writestr(lab,0); 161 | } 162 | 163 | } 164 | 165 | if (!Rf_isNull(labtabs) && (Rf_length(labtabs) > 0)) { 166 | 167 | if (debug) 168 | Rcpp::Rcout << "--- D ---" << std::endl; 169 | 170 | file += "D"; 171 | 172 | // labtabnam 173 | Rcpp::CharacterVector labtabnams = labtabs.attr("names"); 174 | Rcpp::IntegerVector labtab = labtabs[nolabtab]; 175 | Rcpp::CharacterVector labtn = labtab.attr("names"); 176 | 177 | const std::string nlabs = Rcpp::as(labtabnams[nolabtab]); 178 | 179 | file += pnum1(1); // nolab 180 | file += "/"; 181 | file += writestr(nlabs, 0); // labelsetnam 182 | 183 | file += pnum1(labtab.size()); // labnums 184 | file += "/"; 185 | 186 | 187 | // numerics requires pnum1() 188 | for (int j = 0; j < labtab.size(); ++j) { 189 | 190 | if (debug) { 191 | Rcpp::Rcout << labtab(j) << std::endl; // val 192 | Rcpp::Rcout << labtn(j) << std::endl; // lab 193 | } 194 | 195 | file += pnum1(labtab(j)); 196 | file += "/"; 197 | file += writestr(Rcpp::as(labtn(j)), 0); 198 | 199 | } 200 | 201 | ++nolabtab; 202 | } 203 | 204 | // start data part 205 | file += "F"; 206 | 207 | 208 | if (debug) 209 | Rcpp::Rcout << "--- F ---" << std::endl; 210 | 211 | 212 | for (int64_t i = 0; i < n; ++i) { 213 | for (int32_t j = 0; j < k; ++j) { 214 | 215 | Rcpp::checkUserInterrupt(); 216 | 217 | 218 | int32_t const type = vtyp[j]; 219 | // Rprintf("vtyp: %d\n", type); 220 | // Rprintf("k: %d; n: %d\n", j, i); 221 | 222 | switch(type) 223 | { 224 | 225 | case 0: 226 | { 227 | double val_d = Rcpp::as(dat[j])[i]; 228 | 229 | // Rcout << pfnum(val_d) << std::endl; 230 | 231 | if ( (R_IsNA(val_d)) | R_IsNaN(val_d) | std::isinf(val_d) ) { 232 | file += "*."; 233 | } else { 234 | file += pfnum(val_d); 235 | file += "/"; 236 | } 237 | 238 | break; 239 | } 240 | 241 | default: 242 | { 243 | 244 | Rcpp::CharacterVector cv_s = NA_STRING; 245 | cv_s = Rcpp::as(dat[j])[i]; 246 | 247 | std::string val_s = ""; 248 | 249 | if (cv_s[0] != NA_STRING) 250 | val_s = Rcpp::as(cv_s); 251 | 252 | file += writestr(val_s, 0); 253 | break; 254 | } 255 | } 256 | } 257 | } 258 | 259 | // file = Riconv2(file, toEncoding); 260 | 261 | // end with a "Z" even if the line is already 262 | // 80 chars long 263 | if ( file.size() % 80 == 0) 264 | file += "Z"; 265 | 266 | while ( file.size() % 80 != 0) 267 | file += "Z"; 268 | 269 | 270 | file = linebreak(file); 271 | file += "\n"; 272 | 273 | // std::locale mylocale(""); 274 | // por.imbue(mylocale); 275 | por << file; 276 | 277 | por.close(); 278 | 279 | } 280 | } 281 | -------------------------------------------------------------------------------- /vignettes/readspss.rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "readspss introduction" 3 | author: "Jan Marvin Garbuszus" 4 | date: "2021-10-30" 5 | output: rmarkdown::html_vignette 6 | vignette: > 7 | %\VignetteIndexEntry{readspss introduction} 8 | %\VignetteEngine{knitr::rmarkdown} 9 | %\VignetteEncoding{UTF-8} 10 | --- 11 | 12 | ```{r setup, include = FALSE} 13 | knitr::opts_chunk$set( 14 | collapse = TRUE, 15 | comment = "#>" 16 | ) 17 | ``` 18 | 19 | # `readspss` -- importing SPSS files to R 20 | 21 | Welcome to the `readspss` package. The package was written from scratch with 22 | additional code for special cases. Starting as a side project for me to learn 23 | Rcpp the package did grow over the years. Today `readspss` is well tested using 24 | every sav, por and zsav file I could lay my hands on. Import is considered 25 | feature complete, write support is available, just not yet for every feature. 26 | 27 | 28 | ## Installation 29 | 30 | Installation is provided using `r-universe` or `remotes`. 31 | 32 | With [r-universe](https://r-universe.dev/): 33 | ```R 34 | options(repos = c( 35 | janmarvin = 'https://janmarvin.r-universe.dev', 36 | CRAN = 'https://cloud.r-project.org')) 37 | install.packages('readspss') 38 | ``` 39 | 40 | With `devtools`: 41 | ```R 42 | remotes::install_github("JanMarvin/readspss") 43 | ``` 44 | 45 | 46 | ## Import 47 | 48 | For the import of (z)sav and por files `read.sav()` and `read.por()` are available. 49 | 50 | ```{R} 51 | library(readspss) 52 | 53 | # example using sav 54 | fl_sav <- system.file("extdata", "electric.sav", package = "readspss") 55 | ds <- read.sav(fl_sav) 56 | 57 | # example using zsav 58 | fl_zsav <- system.file("extdata", "cars.zsav", package = "readspss") 59 | dz <- read.sav(fl_zsav) 60 | 61 | # example using por 62 | fl_por <- system.file("extdata", "electric.por", package = "readspss") 63 | dp <- read.por(fl_por) 64 | ``` 65 | 66 | Both functions return data.frame objects, containing numerics, dates, factors or 67 | characters. 68 | 69 | ### Import options 70 | 71 | For user specific demand the package supports many option available in `foreign` 72 | such as `convert.factors` and `use.missings`. If one is familiar with said 73 | package, one should have no problem adapting to `readspss`. If for example one 74 | does not want to have factors since they work differently in R than in SPSS this 75 | can be achieved using the following code. 76 | 77 | ```{R} 78 | # example using sav 79 | fl_sav <- system.file("extdata", "electric.sav", package = "readspss") 80 | ds <- read.sav(fl_sav, convert.factors = FALSE) 81 | 82 | # example using zsav 83 | fl_zsav <- system.file("extdata", "cars.zsav", package = "readspss") 84 | dz <- read.sav(fl_zsav, convert.factors = FALSE) 85 | 86 | # example using por 87 | fl_por <- system.file("extdata", "electric.por", package = "readspss") 88 | dp <- read.por(fl_por, convert.factors = FALSE) 89 | ``` 90 | 91 | Since many features are self explanatory not all will be explained. Of course 92 | `readspss` can handle sav files in different encodings; it handles file sets 93 | without data; all types of missings SPSS developers over the years came up with; 94 | short, long and longer strings; little and big endian files; sav, por and zsav 95 | compressed files; files without valid header information; old and new SPSS 96 | files. As stated above, every SPSS file I came across and during the development 97 | I came across many. 98 | 99 | Using code by Ben Pfaff `readspss` can handle encrypted SPSS files. 100 | 101 | 102 | ```{R} 103 | flu <- system.file("extdata", "hotel.sav", package="readspss") 104 | fle <- system.file("extdata", "hotel-encrypted.sav", package="readspss") 105 | 106 | df_u <- read.sav(flu) 107 | df_e <- read.sav(fle, pass = "pspp") 108 | ``` 109 | 110 | ### Import details 111 | 112 | If available in the SPSS file, the resulting data.frame will contain attributes 113 | such as the datalabel, timestamp, filelabel, additional documentation and 114 | information regarding missings, labels, file encoding. A complete list of 115 | attributes can be found using `?read.sav` and `?read.por`. 116 | 117 | 118 | ## Export 119 | 120 | R data.frame objects can be exported using `write.sav()` and `write.por()`. 121 | 122 | ```{R} 123 | library(readspss) 124 | 125 | write.sav(cars, filepath = "cars.sav") # optional compress = TRUE 126 | 127 | write.sav(cars, filepath = "cars.zsav") # optional compress = TRUE 128 | 129 | write.por(cars, filepath = "cars.por") 130 | ``` 131 | 132 | Export provides a few options to add a label, for compression of sav and zsav 133 | files and conversion of dates. Currently it is not possible to export strings 134 | longer than 255 chars. Obviously all exported files can be imported using 135 | SPSS and `readspss` (PSPP is expected to work). 136 | 137 | 138 | ## Package development 139 | 140 | One may wonder, why does the world need another package to import SPSS data to 141 | the R world. Similar tasks can be done by `foreign`, `memisc` and `haven` 142 | package. 143 | Well the first two packages use code from older releases of PSPP and R-Core most 144 | likely has neither time nor a need to update their codebase to a newer PSPP 145 | release. Still over the years the SPSS file format has changed. Not drastically 146 | but new features such as long strings were implemented. Features that `foreign` 147 | cannot handle. The newest of the three aforementioned packages, `haven`, is a 148 | wrapper around the `ReadStat` C library. The package development began around 149 | the time we started with [`readstata13`](https://github.com/sjewo/readstata13) 150 | so it is around quite some time now. Contrary to many other people in the R 151 | world, I am not a huge fan of `tibbles` which are an integral part of `haven`. 152 | One can agree that this is a minor problem. My bigger problem with the package 153 | is, that I am not yet convinced that `ReadStat` and `haven` are tested enough. 154 | Even though I am sure that authors of both made sure that in most cases their 155 | software works, there are still cases where it does not. During the development 156 | process of `readspss` I reported a few bugs to the haven package. Among them 157 | were incorrectly trimmed long strings and a severe bug where por-files imported 158 | awfully incorrect values. All errors were found using publicly available data 159 | files, writing unit tests and comparing data across different R-packages, PSPP 160 | and various versions of SPSS. Until I see that such behavior is adopted by other 161 | packages, I simply do not trust them and maybe you should not either. If the 162 | import process of data fails, one does not have to worry about anything else. 163 | 164 | The development of `readspss` began once development of `readstata13` slowed 165 | down. Having written most of the `c++` code to import dta-files, I learned a lot 166 | about binary files and Rcpp development. Since SPSS was another statistical 167 | software used at the university where I worked at that time, it felt natural to 168 | have a look at sav-files. Shortly after I learned that the dta-file 169 | documentation is priceless, not available for SPSS and development ceased for 170 | quite some time. In February 2018 I changed jobs, resulting in many train rides. 171 | A project was needed and development began again. Using the PSPP documentation 172 | and countless hours of trial and error lead to the current state of the package. 173 | 174 | 175 | ## Thanks 176 | 177 | `readspss` uses code of Ben Pfaff for the encryption part. It uses code from 178 | [TDA](http://www.stat.ruhr-uni-bochum.de/tda.html) by Goetz Rohwer and Ulrich 179 | Poetter for the conversion of numerics in the por-parser. The 180 | [PSPP documentation](http://www.gnu.org/software/pspp/pspp-dev/html_node/index.html) 181 | was a huge help. Without the testing by Ulrich Poetter this package would not be 182 | as complete as it is. 183 | 184 | 185 | ## Last words 186 | 187 | Even though `readspss` is tested a lot, it would be great, if users would 188 | test their imports and exports and report their experience. Open an issue on the 189 | github page or write me an email and let me know what you think. Bug free code 190 | does not exist. 191 | -------------------------------------------------------------------------------- /tests/testthat/test_write.R: -------------------------------------------------------------------------------- 1 | #### test 1 #### 2 | if (dir.exists("data")) 3 | unlink("data", recursive = TRUE) 4 | 5 | dir.create("data") 6 | 7 | data(cars) 8 | 9 | write.sav(cars, filepath = "data/cars.sav") 10 | 11 | dd <- read.sav("data/cars.sav") 12 | 13 | test_that("integer/numerics", { 14 | expect_true(all.equal(cars, dd, check.attributes = FALSE)) 15 | }) 16 | 17 | unlink("data", recursive = TRUE) 18 | 19 | #### test 2 #### 20 | if (dir.exists("data")) 21 | unlink("data", recursive = TRUE) 22 | 23 | dir.create("data") 24 | 25 | df <- data.frame(V1 = letters, V2 = 1:26, stringsAsFactors = FALSE) 26 | 27 | write.sav(df, filepath = "data/df.sav") 28 | 29 | dd <- read.sav("data/df.sav") 30 | 31 | test_that("character", { 32 | expect_true(all.equal(dd, df, check.attributes = FALSE)) 33 | }) 34 | 35 | unlink("data", recursive = TRUE) 36 | 37 | #### test 3 #### 38 | if (dir.exists("data")) 39 | unlink("data", recursive = TRUE) 40 | 41 | dir.create("data") 42 | 43 | df <- data.frame(V1 = letters, V2 = 1:26, stringsAsFactors = FALSE) 44 | lab <- paste0("lab", seq_along(df)) 45 | 46 | 47 | write.sav(df, filepath = "data/df.sav", label = lab) 48 | 49 | 50 | dd <- read.sav("data/df.sav") 51 | 52 | test_that("character and letter mix", { 53 | expect_true(all.equal(attr(dd, "var.label"), lab, check.attributes = FALSE)) 54 | }) 55 | 56 | unlink("data", recursive = TRUE) 57 | 58 | 59 | #### test 4 #### 60 | if (dir.exists("data")) 61 | unlink("data", recursive = TRUE) 62 | 63 | dir.create("data") 64 | 65 | fl <- system.file("extdata", "hotel.sav", package = "readspss") 66 | 67 | dd <- read.sav(fl) 68 | write.sav(dd, "data/hotel.sav") 69 | df <- read.sav("data/hotel.sav") 70 | 71 | 72 | 73 | test_that("factor", { 74 | expect_true(all.equal(dd, df, check.attributes = FALSE)) 75 | }) 76 | 77 | unlink("data", recursive = TRUE) 78 | 79 | #### test 5 #### 80 | if (dir.exists("data")) 81 | unlink("data", recursive = TRUE) 82 | 83 | dir.create("data") 84 | 85 | dd <- mtcars 86 | 87 | write.por(dd, "data/mtcars.por", add.rownames = TRUE) 88 | df <- read.por("data/mtcars.por", add.rownames = TRUE) 89 | 90 | 91 | 92 | test_that("por", { 93 | expect_true(all.equal(dd, df, check.attributes = FALSE)) 94 | }) 95 | 96 | unlink("data", recursive = TRUE) 97 | 98 | 99 | #### test 6 #### 100 | if (dir.exists("data")) 101 | unlink("data", recursive = TRUE) 102 | 103 | dir.create("data") 104 | 105 | dd <- mtcars 106 | dd$am <- factor(x = dd$am, levels = c(0, 1), labels = c("auto", "man")) 107 | 108 | write.por(dd, "data/mtcars1.por", convert.factors = TRUE) 109 | df1 <- read.por("data/mtcars1.por", convert.factors = TRUE) 110 | 111 | write.por(dd, "data/mtcars2.por", convert.factors = FALSE) 112 | df2 <- read.por("data/mtcars2.por", convert.factors = TRUE) 113 | df2$AM <- df2$AM - 1 # was not stored as factor, but was a factor previous 114 | 115 | test_that("por", { 116 | expect_true(all.equal(dd, df1, check.attributes = FALSE)) 117 | expect_true(all.equal(mtcars, df2, check.attributes = FALSE)) 118 | }) 119 | 120 | unlink("data", recursive = TRUE) 121 | 122 | 123 | #### test 7 #### 124 | 125 | ### locale test disabled. this breaks constantly on either windows and/or linux 126 | # if (dir.exists("data")) 127 | # unlink("data", recursive = TRUE) 128 | # 129 | # dir.create("data") 130 | # 131 | # lab <- c("ümläuts", "ÜMLÄUTS") 132 | # dd <- data.frame(v1 = c("ä","ö","ü"), v2 = c("Ä","Ö","Ü"), 133 | # stringsAsFactors = FALSE) 134 | # attr(dd, "label") <- lab 135 | # 136 | # write.por(dd, "data/umlauts.por", toEncoding = "CP1252") 137 | # df1 <- read.por("data/umlauts.por") 138 | # df2 <- read.por("data/umlauts.por", fromEncoding = "CP1252") 139 | # 140 | # 141 | # test_that("umlauts", { 142 | # # unsure how to test that it might be true (depending on the os's encoding) 143 | # # expect_false(isTRUE(all.equal(dd, df1, check.attributes = FALSE))) 144 | # expect_true(all.equal(dd, df2, check.attributes = FALSE)) 145 | # expect_true(identical(lab, attr(df2, "label"))) 146 | # }) 147 | # 148 | # unlink("data", recursive = TRUE) 149 | 150 | #### test 8 #### 151 | if (dir.exists("data")) 152 | unlink("data", recursive = TRUE) 153 | 154 | dir.create("data") 155 | 156 | # create a more complex test with integers, missings missing integers, 157 | # characters and more iris data 158 | dd <- cbind(iris, 159 | as.integer(seq_len(nrow(iris))), 160 | NA, 161 | as.integer(NA), 162 | "a", 163 | iris) 164 | names(dd) <- letters[seq_len(ncol(dd))] 165 | write.sav(dd, "data/iris_unc.sav", compress = FALSE) 166 | write.sav(dd, "data/iris_com.sav", compress = TRUE) 167 | df_unc <- read.sav("data/iris_unc.sav") 168 | df_com <- read.sav("data/iris_com.sav") 169 | 170 | # this is a known difference in the import and we can ignore it 171 | df_unc$g <- as.logical(df_unc$g) 172 | df_com$g <- as.logical(df_com$g) 173 | 174 | # check filesize of both 175 | fs_unc <- file.info("data/iris_unc.sav")[["size"]] 176 | fs_com <- file.info("data/iris_com.sav")[["size"]] 177 | 178 | 179 | test_that("compression", { 180 | expect_true(all.equal(dd, df_unc, check.attributes = FALSE)) 181 | expect_true(all.equal(dd, df_com, check.attributes = FALSE)) 182 | expect_true(fs_com < fs_unc) 183 | }) 184 | 185 | unlink("data", recursive = TRUE) 186 | 187 | #### test 9 #### 188 | if (dir.exists("data")) 189 | unlink("data", recursive = TRUE) 190 | 191 | dir.create("data") 192 | 193 | dd <- data.frame( 194 | N = 1:2, 195 | A25 = c(paste0("a1", paste(rep("A", 22), collapse = ""), "a"), 196 | paste0("a2", paste(rep("X", 10), collapse = ""))), 197 | A255 = c(paste0("a1", paste(rep("A", 252), collapse = ""), "a"), 198 | paste0("a2", paste(rep("X", 10), collapse = ""))), 199 | # A258 = c(paste0("b1", paste(rep("B", 256), collapse = "")), 200 | # paste0("b2", paste(rep("Y", 256), collapse = ""))), 201 | # A2000 = c(paste0("c1", paste(rep("C", 1998), collapse = "")), 202 | # paste0("c2", paste(rep("Z", 1998), collapse = ""))), 203 | stringsAsFactors = FALSE 204 | ) 205 | 206 | write.sav(dd, "data/dd_u.sav", label = c("A numeric", "A not so long string", "A long string"), compress = FALSE) 207 | write.sav(dd, "data/dd_c.sav", label = c("A numeric", "A not so long string", "A long string"), compress = TRUE) 208 | 209 | write.por(dd, "data/dd_p.por") 210 | 211 | df_u <- read.sav("data/dd_u.sav") 212 | df_c <- read.sav("data/dd_c.sav") 213 | df_p <- read.por("data/dd_p.por") 214 | 215 | 216 | test_that("strings", { 217 | expect_true(all.equal(dd, df_u, check.attributes = FALSE)) 218 | expect_true(all.equal(dd, df_c, check.attributes = FALSE)) 219 | expect_true(all.equal(dd, df_p, check.attributes = FALSE)) 220 | }) 221 | 222 | unlink("data", recursive = TRUE) 223 | 224 | #### dates #### 225 | if (dir.exists("data")) 226 | unlink("data", recursive = TRUE) 227 | 228 | dir.create("data") 229 | 230 | dd <- data.frame(dat = Sys.Date()) 231 | 232 | write.sav(dd, "data/dd.sav", compress = TRUE) 233 | write.por(dd, "data/dd.por") 234 | ds <- read.sav("data/dd.sav") 235 | dp <- read.por("data/dd.por") 236 | 237 | 238 | test_that("dates", { 239 | expect_true(all.equal(dd, ds, check.attributes = FALSE)) 240 | expect_true(all.equal(dd, dp, check.attributes = FALSE)) 241 | }) 242 | 243 | unlink("data", recursive = TRUE) 244 | 245 | 246 | 247 | #### zsav #### 248 | 249 | if (dir.exists("data")) 250 | unlink("data", recursive = TRUE) 251 | 252 | dir.create("data") 253 | 254 | dd <- iris 255 | 256 | write.sav(dd, "data/dd_unc.sav", compress = FALSE, is_zsav = FALSE) 257 | write.sav(dd, "data/dd_com.sav", compress = TRUE, is_zsav = FALSE) 258 | write.sav(dd, "data/dd_unc.zsav", compress = FALSE, is_zsav = TRUE) 259 | write.sav(dd, "data/dd_com.zsav", compress = TRUE, is_zsav = TRUE) 260 | 261 | ds_unc <- read.sav("data/dd_unc.sav") 262 | ds_com <- read.sav("data/dd_com.sav") 263 | dz_unc <- read.sav("data/dd_unc.zsav") 264 | dz_com <- read.sav("data/dd_com.zsav") 265 | 266 | 267 | test_that("zsav", { 268 | expect_true(all.equal(dd, ds_unc, check.attributes = FALSE)) 269 | expect_true(all.equal(dd, ds_com, check.attributes = FALSE)) 270 | expect_true(all.equal(dd, dz_unc, check.attributes = FALSE)) 271 | expect_true(all.equal(dd, dz_com, check.attributes = FALSE)) 272 | }) 273 | 274 | unlink("data", recursive = TRUE) 275 | -------------------------------------------------------------------------------- /src/read_sav_unknown_n.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #include "spss.h" 23 | 24 | int64_t read_sav_unknown_n (std::fstream& sav, 25 | bool swapit, int32_t cflag, bool debug, 26 | int32_t kv, 27 | Rcpp::IntegerVector vtyp, 28 | Rcpp::NumericVector res, 29 | std::vector vartype) { 30 | 31 | size_t curpos = sav.tellg(); 32 | 33 | sav.seekg(0, std::ios_base::end); 34 | std::streamoff endoffile = sav.tellg(); 35 | sav.seekg(curpos); 36 | 37 | int32_t n = 0; 38 | 39 | 40 | bool eof = 0; 41 | uint8_t val_b = 0; 42 | int64_t nn = 0, kk = 0; 43 | 44 | // data is read in 8 byte chunks. k*n/8 (data remains) 45 | double chunk = 0, val_d = 0; 46 | 47 | 48 | if (debug) { 49 | Rprintf("cflag: %d\n", cflag); 50 | } 51 | 52 | 53 | 54 | // cflag 1 = compression int8_t - bias 55 | if ((cflag == 1) | (cflag == 2)) { 56 | 57 | std::string start = ""; 58 | int32_t res_i = 0, res_kk = 0, kk_i = 0; 59 | 60 | while (!(sav.tellg() == endoffile) && !eof) { // data import until nn = n 61 | 62 | Rcpp::checkUserInterrupt(); 63 | 64 | 65 | // data is stored rowwise-ish. 66 | 67 | // chunk is 8 bit long. it gives the structure of the data. If it contains 68 | // only uint8_t it stores 8 vals. If data contains doubles it stores a 69 | // 253 and the next 8 byte will be the double. 70 | 71 | chunk = readbin(val_d, sav, 0); 72 | 73 | Rcpp::IntegerVector chunkvec(8); 74 | 75 | // therefor with respect to the required data structure (numerics and 76 | // strings) the data has to be read. 77 | // e.g. if there are 2 vals, in the first 8 bit may be 4 rows. 78 | 79 | union { 80 | double d; 81 | uint8_t byte[8]; 82 | } u; 83 | 84 | u.d = chunk; 85 | 86 | for (int8_t i=0; i<8; ++i) 87 | { 88 | 89 | val_b = u.byte[i]; 90 | 91 | // 0 = empty 92 | // 1:251 = numeric/string 93 | // each 253 follow up on a string or double in next block 94 | 95 | int32_t len = 0; 96 | int32_t const type = vartype[kk_i]; 97 | len = type; 98 | 99 | // kk_i is index of the original number of variables 100 | // kk_i is reset once kv the new number of varialbes is reachead 101 | ++kk_i; 102 | 103 | 104 | // res_kk is the amount of chunks required to read until the 105 | // string is completely read 106 | res_kk = res[kk]; 107 | 108 | switch (val_b) 109 | { 110 | 111 | case 0: 112 | { 113 | --kk; 114 | break; 115 | // ignored 116 | } 117 | 118 | default: // (val_b >= 1 & val_b <= 251) { 119 | { 120 | 121 | switch(type) 122 | { 123 | 124 | case 0: 125 | { 126 | break; 127 | } 128 | 129 | default: 130 | { 131 | 132 | if (len==-1 || (len !=0 && len !=8) ) 133 | len = 8; 134 | 135 | // beginning of a new string 136 | std::string val_s (len, '\0'); 137 | readstring(val_s, sav); 138 | 139 | // if res_i == res_kk the full string was read and 140 | // can be written else continue the string 141 | if (res_i == res_kk-1) { 142 | // string completly written, reset start and res_i 143 | // and switch to next cell 144 | start = ""; 145 | res_i = 0; 146 | } else { 147 | // string will be continued 148 | ++res_i; 149 | } 150 | 151 | break; 152 | } 153 | 154 | } 155 | 156 | break; 157 | } 158 | 159 | case 252: 160 | { 161 | // 252 should be end of file, but as many things 162 | // it is not required to be inside the file 163 | eof = true; 164 | break; 165 | } 166 | 167 | case 253: 168 | { 169 | // Rcpp::Rcout << "## Debug ... 253" << std::endl; 170 | // Rprintf("nn %d & kk %d \n", nn, kk); 171 | switch(type) 172 | { 173 | 174 | case 0: 175 | { 176 | readbin(val_d, sav, swapit); 177 | break; 178 | } 179 | 180 | default: 181 | { 182 | 183 | // spss length 1:251 indicate a string. the value is the string 184 | // size. obvious spss uses the size to determine the size of the 185 | // string. there are two possible problems. 186 | // 1. len can be 1:7 in this case we know the max string size of the 187 | // variable is less than 8 bit long. still the field to read is 8 bit 188 | // long. 189 | // 2. the string is spread across different internal strings. in this 190 | // case we know the max size, still have to read each 8bit field. 191 | // maybe the max size can be used to have a second opinion wheather 192 | // or not a field contains a numeric or character. Following fields 193 | // have len -1. 194 | 195 | if (len==-1 || (len !=0 && len !=8) ) 196 | len = 8; 197 | 198 | std::string val_s (len, '\0'); 199 | val_s = readstring(val_s, sav); 200 | start.append( val_s ); 201 | 202 | 203 | if (res_i == res_kk-1) { 204 | // reset 205 | start = ""; 206 | res_i = 0; 207 | } else { 208 | ++res_i; 209 | } 210 | 211 | 212 | break; 213 | } 214 | 215 | } 216 | 217 | break; 218 | } 219 | 220 | case 254: 221 | { 222 | // 254 indicates that string chunks read before should be 223 | // interpreted as a single string. 224 | 225 | if (res_i == res_kk-1) { 226 | // reset start 227 | start = ""; 228 | res_i = 0; 229 | } else { 230 | start.append(" "); 231 | ++res_i; 232 | } 233 | 234 | break; 235 | } 236 | 237 | case 255: 238 | { 239 | // 255 is a missing value in spss files. 240 | // 241 | switch(type) 242 | { 243 | 244 | case 0: 245 | { 246 | break; 247 | } 248 | default: 249 | { 250 | break; 251 | } 252 | break; 253 | } 254 | 255 | } 256 | } 257 | 258 | 259 | 260 | // variable is read 261 | if (res_i == 0) 262 | ++kk; 263 | 264 | // Update kk iterator. If kk is k, update nn to start in next row. 265 | if (kk == kv) { 266 | ++nn; 267 | 268 | // Rprintf("nn: %d", nn); 269 | // some files are not ended with 252, ensure that no out of bounds 270 | // error occures. 271 | if (nn == n) { 272 | eof = true; 273 | 274 | if (debug) 275 | Rcpp::Rcout << "stop: eof" << std::endl; 276 | 277 | break; 278 | } 279 | 280 | // reset k and res_kk 281 | kk = 0; 282 | kk_i = 0; 283 | } 284 | 285 | } 286 | 287 | } 288 | 289 | } else { 290 | 291 | kk = 0; 292 | 293 | std::string val_s = ""; 294 | 295 | while(!(sav.tellg() == endoffile)) { 296 | 297 | int32_t const type = vtyp[kk]; 298 | 299 | switch(type) 300 | { 301 | 302 | case 0: 303 | { 304 | readbin(val_d, sav, swapit); 305 | break; 306 | } 307 | 308 | default: 309 | { 310 | 311 | double len = type; 312 | 313 | len = ceil(len/8) * 8; 314 | 315 | std::string val_s ((int32_t)len, '\0'); 316 | readstring(val_s, sav); 317 | 318 | // shorten the string to the actual size reported by SPSS 319 | val_s.erase(type, std::string::npos); 320 | 321 | break; 322 | } 323 | 324 | } 325 | 326 | ++kk; 327 | 328 | if (kk == kv) { 329 | ++nn; 330 | kk = 0; 331 | } 332 | 333 | } 334 | } 335 | 336 | sav.seekg(curpos); 337 | 338 | if (debug) { 339 | Rcpp::Rcout << "n is " << nn << std::endl; 340 | } 341 | 342 | return(nn); 343 | 344 | } 345 | -------------------------------------------------------------------------------- /src/write_data.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2014-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "spss.h" 25 | 26 | void write_data(Rcpp::DataFrame dat, int32_t cflag, 27 | int64_t n, int32_t kk, info_t *infos, 28 | std::fstream& sav, bool swapit) { 29 | 30 | if (!sav.is_open()) 31 | Rcpp::stop("Could not write data"); 32 | 33 | 34 | if (cflag) { 35 | 36 | // data is read in 8 byte chunks. k*n/8 (data remains) 37 | double chunk = 0; 38 | 39 | int8_t iter = 0; 40 | 41 | 42 | unsigned char chnk[8] = ""; 43 | 44 | // set chnk to 0 45 | for (int8_t itr = 0; itr < 8; ++itr) { 46 | chnk[itr] = 0; 47 | } 48 | 49 | uint8_t val_b = 0; 50 | int32_t val_i = 0; 51 | // double val_d = 0.0; 52 | 53 | 54 | std::vector buf_d; 55 | std::vector buf_s; 56 | std::vector flush_type(8); 57 | 58 | for (int64_t i = 0; i < n; ++i) { 59 | for (int32_t j = 0; j < kk; ++j) { 60 | 61 | 62 | int32_t const type = infos->vtyp[j]; 63 | int32_t const ITC = infos->itc[j]; 64 | int32_t const CC = infos->cc[j]; 65 | 66 | // Rprintf("n %d & k %d\n", i, j); 67 | // Rprintf("iter: %d\n", iter); 68 | 69 | // compression val_b 70 | if ((type == 0) & (ITC == 1) & (CC == 0)) { 71 | flush_type[iter] = 0; 72 | 73 | val_i = Rcpp::as(dat[j])[i]; 74 | 75 | // Rprintf("val_i: %d\n", val_i); 76 | 77 | val_b = val_i + 100; // add bias 78 | 79 | if (val_i == NA_INTEGER) 80 | val_b = 255; 81 | 82 | chnk[iter] = val_b; 83 | 84 | ++iter; 85 | } 86 | 87 | // write compressed 88 | if ((type == 0) & (ITC == 0) & (CC == 0)) { 89 | 90 | const double val_d = Rcpp::as(dat[j])[i]; 91 | 92 | 93 | flush_type[iter] = 0; 94 | chnk[iter] = 255; 95 | 96 | if (!( (R_IsNA(val_d)) | R_IsNaN(val_d) | std::isinf(val_d) )) { 97 | buf_d.push_back(val_d); 98 | flush_type[iter] = 1; 99 | chnk[iter] = 253; 100 | } 101 | 102 | ++iter; 103 | } 104 | 105 | // strings 106 | if ((type >= 0) & (ITC == 0) & (CC == 1)) { 107 | // Rcout << "--- string ---" << std::endl; 108 | 109 | 110 | std::string val_s = Rcpp::as(Rcpp::as(dat[j])[i]); 111 | 112 | int strlen = type; 113 | if (strlen == 255) strlen = 256; 114 | 115 | 116 | val_s.resize(strlen, ' '); 117 | 118 | // begin writing of the string 119 | 120 | int8_t fills = strlen/8; 121 | 122 | // Rprintf("type: %d\n", type); 123 | // Rprintf("fills: %d\n", fills); 124 | 125 | int totiter = 0; 126 | 127 | while (fills > 0) { 128 | 129 | // Rprintf("fills: %d\n", fills); 130 | // Rprintf("iter: %d\n", iter); 131 | 132 | // fill the chunk 133 | while (iter < 8) { 134 | 135 | int pos = totiter * 8; 136 | 137 | int strt = 253, strl = 8; 138 | 139 | buf_s.push_back(val_s.substr(pos, strl)); 140 | flush_type[iter] = 2; 141 | 142 | chnk[iter] = strt; 143 | 144 | ++iter; 145 | ++totiter; 146 | --fills; 147 | 148 | // Rprintf("iter: %d\n", iter); 149 | if (fills == 0) 150 | break; 151 | } 152 | 153 | // Rprintf("fills: %d\n", fills); 154 | 155 | // after a full cicle set iter to 0 156 | if (iter == 8) { 157 | // Rcout << "--- iter == 8 ---" << std::endl; 158 | 159 | // Rcout << "full chunk: writing" << std::endl; 160 | std::memcpy(&chunk, chnk, sizeof(double)); 161 | writebin(chunk, sav, swapit); 162 | iter = 0; 163 | 164 | int di = 0, ds = 0; 165 | for (auto flush = 0; flush < 8; ++flush) { 166 | 167 | int ft = flush_type[flush]; 168 | // Rprintf("%d: ft: %d\n", flush, ft); 169 | 170 | if (ft == 1) { 171 | double val_d = buf_d[di]; 172 | writebin(val_d, sav, swapit); 173 | ++di; 174 | } 175 | if (ft == 2) { 176 | std::string vs = buf_s[ds]; 177 | writestr(vs, 8, sav); 178 | ++ds; 179 | } 180 | 181 | } 182 | 183 | buf_d.clear(); 184 | buf_s.clear(); 185 | flush_type = {0, 0, 0, 0, 0, 0, 0, 0}; 186 | 187 | } 188 | } 189 | 190 | } 191 | 192 | // write chunk of eight and clear chnk 193 | if (iter == 8) { 194 | // Rcout << "--- iter == 8 ---" << std::endl; 195 | 196 | std::memcpy(&chunk, chnk, sizeof(double)); 197 | writebin(chunk, sav, swapit); 198 | iter = 0; 199 | 200 | int di = 0, ds = 0; 201 | for (auto flush = 0; flush < 8; ++flush) { 202 | 203 | int ft = flush_type[flush]; 204 | // Rprintf("%d: ft: %d\n", flush, ft); 205 | 206 | if (ft == 1) { 207 | double val_d = buf_d[di]; 208 | writebin(val_d, sav, swapit); 209 | ++di; 210 | } 211 | if (ft == 2) { 212 | std::string vs = buf_s[ds]; 213 | writestr(vs, 8, sav); 214 | ++ds; 215 | } 216 | 217 | } 218 | 219 | buf_d.clear(); 220 | buf_s.clear(); 221 | flush_type = {0, 0, 0, 0, 0, 0, 0, 0}; 222 | 223 | // reset chnk 224 | for (int8_t itr = 0; itr < 8; ++itr) { 225 | chnk[itr] = 0; 226 | } 227 | 228 | } 229 | 230 | // write end of file 231 | if ((i == n-1) & (j == kk -1)) { 232 | 233 | // Rcout << "--- EOF ---" << std::endl; 234 | // Rprintf("buf_s.size() = %d\n", buf_s.size()); 235 | // Rprintf("buf_d.size() = %d\n", buf_d.size()); 236 | 237 | 238 | // Rprintf("iter: %d\n", iter); 239 | 240 | // chunk is not yet completely written. 241 | if (iter > 0) { 242 | 243 | for (int8_t itr = iter; itr < 8; ++itr) { 244 | chnk[itr] = 0; 245 | } 246 | 247 | std::memcpy(&chunk, chnk, sizeof(double)); 248 | writebin(chunk, sav, swapit); 249 | 250 | iter = 0; 251 | } 252 | 253 | 254 | int di = 0, ds = 0; 255 | for (auto flush = 0; flush < 8; ++flush) { 256 | 257 | int ft = flush_type[flush]; 258 | // Rprintf("%d: ft: %d\n", flush, ft); 259 | 260 | if (ft == 1) { 261 | double val_d = buf_d[di]; 262 | writebin(val_d, sav, swapit); 263 | ++di; 264 | } 265 | if (ft == 2) { 266 | std::string vs = buf_s[ds]; 267 | writestr(vs, 8, sav); 268 | ++ds; 269 | } 270 | 271 | } 272 | 273 | 274 | // write EOF 275 | val_b = 252; 276 | chnk[iter] = val_b; 277 | ++iter; 278 | 279 | if (cflag != 2) /* not for zsav */ { 280 | for (int8_t itr = iter; itr < 8; ++itr) { 281 | chnk[itr] = 0; 282 | } 283 | 284 | std::memcpy(&chunk, chnk, sizeof(double)); 285 | 286 | writebin(chunk, sav, swapit); 287 | } 288 | } 289 | 290 | } 291 | } 292 | 293 | 294 | } else { 295 | 296 | for (int64_t i = 0; i < n; ++i) { 297 | for (int32_t j = 0; j < kk; ++j) { 298 | 299 | const int32_t type = infos->vtyp[j]; 300 | 301 | // Rprintf("k: %d; n: %d\n", j, i); 302 | // 303 | // Rprintf("vtyp: %d\n", type); 304 | 305 | 306 | switch(type) 307 | { 308 | 309 | case 0: 310 | { 311 | double val_d = Rcpp::as(dat[j])[i]; 312 | 313 | if ( (val_d == NA_REAL) | R_IsNA(val_d) | R_IsNaN(val_d) | std::isinf(val_d) ) 314 | val_d = -DBL_MAX; 315 | 316 | writebin(val_d, sav, swapit); 317 | break; 318 | } 319 | 320 | default: 321 | { 322 | 323 | Rcpp::CharacterVector cv_s = NA_STRING; 324 | cv_s = Rcpp::as(dat[j])[i]; 325 | 326 | std::string val_s = ""; 327 | 328 | if (cv_s[0] != NA_STRING) 329 | val_s = Rcpp::as(cv_s); 330 | 331 | int size = type; 332 | if (size == 255) 333 | size += 1; 334 | 335 | val_s.resize(size, ' '); 336 | 337 | writestr(val_s, val_s.size(), sav); 338 | break; 339 | } 340 | 341 | } 342 | } 343 | } 344 | } 345 | } 346 | -------------------------------------------------------------------------------- /R/writesav.R: -------------------------------------------------------------------------------- 1 | resize_vartyp <- function(vec, var = NULL) { 2 | 3 | out <- NULL 4 | for (i in seq_along(vec)) { 5 | 6 | val <- vec[i] 7 | 8 | if (is.null(var)) { 9 | if (val <= 8) { 10 | out <- c(out, val) 11 | } else { 12 | out <- c(out, c(val, rep(-1, (ceiling(val / 8) - 1)))) 13 | } 14 | } else { 15 | if (val <= 8) { 16 | out <- c(out, var[i]) 17 | } else { 18 | out <- c(out, c(var[i], rep(var[i], (ceiling(val / 8) - 1)))) 19 | } 20 | } 21 | 22 | } 23 | 24 | out 25 | } 26 | 27 | #' write.sav 28 | #' 29 | #' Function to write an SPSS sav or zsav file from a data.frame(). 30 | #' @param dat _data.frame_ a data.frame to store as SPSS file. 31 | #' @param filepath _string_ full path where and how this file should be 32 | #' stored 33 | #' @param label _character_ if any provided this must be a vector of 34 | #' labels. It must be of size `ncol(dat)` 35 | #' @param add.rownames _logical_ If `TRUE`, a new variable rownames 36 | #' will be added to the sav-file. 37 | #' @param compress _logical_ should compression be used. If TRUE some 38 | #' integers will be stored more efficiently. Everything will be stored in 39 | #' chunks of 8 chars. Reduces memory size of sav-file. 40 | #' @param convert.dates _logical_ should dates be converted to SPSS format. 41 | #' @param tz _character_ The name of the timezone convert.dates will use. 42 | #' @param debug _logical_ print debug information. 43 | #' @param is_zsav _logical_ explicitly create a zsav file. If the file 44 | #' ending zsav is used, this is selected as default. 45 | #' @param disppar optional display parameter matrix. Needs documentation. 46 | #' @details Writing of strings longer than 255 chars is not provided. 47 | #' 48 | #' @return `write.sav` returns nothing 49 | #' 50 | #' @export 51 | write.sav <- function(dat, filepath, label, add.rownames = FALSE, 52 | compress = FALSE, convert.dates = TRUE, tz = "GMT", 53 | debug = FALSE, is_zsav = FALSE, disppar) { 54 | 55 | filepath <- path.expand(filepath) 56 | 57 | if (missing(filepath)) 58 | stop("need a path") 59 | 60 | attrlab <- attr(dat, "var.label") 61 | 62 | if (identical(attrlab, character(0))) 63 | attrlab <- NULL 64 | 65 | 66 | if (missing(label) && is.null(attrlab)) 67 | label <- "" 68 | 69 | if (missing(label) && !is.null(attrlab)) 70 | label <- attrlab 71 | 72 | if (!identical(label, "") && (length(label) != ncol(dat))) 73 | stop("label and ncols differ. each col needs a label") 74 | 75 | if (any(nchar(label)) > 255) 76 | stop("longlabels not yet implemented") 77 | 78 | if (add.rownames) { 79 | dat <- data.frame(rownames = rownames(dat), 80 | dat, stringsAsFactors = FALSE) 81 | } 82 | 83 | nams <- names(dat) 84 | 85 | # get labtab prior to any modification due to string sizes 86 | ff <- which(sapply(dat, is.factor)) 87 | 88 | labtab <- lapply(ff, function(x) { 89 | 90 | ll <- levels(dat[[x]]) 91 | 92 | x <- as.integer(labels(ll)) 93 | names(x) <- ll 94 | 95 | x 96 | }) 97 | 98 | LONGVAR <- FALSE 99 | 100 | if (all(nchar(nams) <= 8) && (identical(toupper(nams), nams))) { 101 | nams <- toupper(nams) 102 | nvarnames <- substr(nams, 0, 8) 103 | } else { 104 | nvarnames <- paste0("VAR", seq_along(nams)) 105 | LONGVAR <- TRUE 106 | } 107 | 108 | vtyp <- as.integer(sapply(dat, is.character)) 109 | vtyp[vtyp != 0] <- as.integer(sapply(dat[vtyp != 0], 110 | function(x) max(nchar(x), na.rm = TRUE))) 111 | 112 | 113 | if (any(vtyp > 255)) { 114 | message("if you really need this, split the string into segments of 255") 115 | stop("Strings longer than 255 characters not yet implemented") 116 | } 117 | 118 | vtyp <- ceiling(vtyp / 8) * 8 119 | 120 | vtyp[vtyp > 255] <- 255 121 | 122 | vartypes <- resize_vartyp(vtyp) 123 | 124 | vartypes[vartypes > 255] <- 255 125 | 126 | nams <- vector("character", length(vartypes)) 127 | nams[vartypes > -1] <- nvarnames 128 | 129 | nvarnames <- nams 130 | 131 | # update factor position with new varnames 132 | pos <- which(nvarnames != "") 133 | 134 | if (length(ff) > 0) { 135 | ff <- sapply(ff, function(x) { 136 | # newnam <- nvm[x] 137 | x <- pos[x] 138 | # names(x) <- newnam 139 | 140 | x 141 | }) 142 | } 143 | 144 | longvarnames <- "" 145 | if ((length(nvarnames) > length(names(dat))) || LONGVAR) 146 | longvarnames <- paste( 147 | paste0(nvarnames[nvarnames != ""], "=", names(dat)), 148 | collapse = "\t") 149 | 150 | systime <- Sys.time() 151 | timestamp <- substr(systime, 12, 19) 152 | lct <- Sys.getlocale("LC_TIME") 153 | Sys.setlocale("LC_TIME", "C") 154 | datestamp <- format(Sys.Date(), "%d %b %y") 155 | Sys.setlocale("LC_TIME", lct) 156 | 157 | 158 | ii <- sapply(dat, is.integer) 159 | nn <- sapply(dat, function(x) { 160 | is.numeric(x) | is.factor(x) 161 | }) 162 | itc <- rep(0, NCOL(dat)) 163 | 164 | # get vartyp used for display parameters. has to be selected prior to 165 | # compression. otherwise factor will be wrongfully identified as integer. 166 | vartypen <- sapply(dat, function(x) class(x)[[1]]) 167 | 168 | # if compression is selected, try to store numeric, logical and factor as 169 | # integer and try to compress integer as uint8 (with bias). Since R does 170 | # only know numeric and integer, this needs additional testing if a 171 | # conversion is safe. 172 | if (compress) { 173 | message("Compression is still experimental. Testing is welcome!") 174 | # check if numeric can be stored as integer 175 | numToCompress <- sapply(dat[nn], saveToExport) 176 | 177 | # convert numeric to integer without loss of information 178 | if (any(numToCompress)) { 179 | saveToConvert <- names(numToCompress[numToCompress]) 180 | # replace numeric as integer 181 | dat[saveToConvert] <- sapply(dat[saveToConvert], as.integer) 182 | } 183 | 184 | # ii integer and not all missing 185 | ii <- sapply(dat, function(x) { 186 | (is.logical(x) | is.integer(x)) 187 | }) 188 | 189 | gg <- FALSE 190 | dat_ii <- dat[names(ii)[ii]] # might have length 0 191 | # gg check for ii if is.integer and min >= 100 and max < 151 (in range of) 192 | # uint8 +100 bias. Values > 250 are missing. 193 | if (length(dat_ii) > 0) 194 | gg <- sapply(dat_ii, function(x) { 195 | z <- NULL 196 | # if all values are missing, return TRUE: will write 255 in output 197 | if (all(is.na(x))) { 198 | z <- TRUE 199 | } else { 200 | # check if value can be stored as uint8 with bias 201 | z <- (min(x, na.rm = TRUE) >= -100 & max(x, na.rm = TRUE) < 151) 202 | } 203 | z 204 | }) 205 | 206 | # adjust gg to the length of dat 207 | gg <- gg[names(dat)] 208 | # logical matrix: is integer and good for compression? 209 | checkll <- rbind(ii, gg) 210 | 211 | # logical for integer compression 212 | itc <- apply(checkll, 2, all) 213 | 214 | } 215 | 216 | cc <- sapply(dat, is.character) 217 | 218 | vartyp <- NA 219 | vartyp[vartypen == "factor" | vartypen == "logical"] <- -1 220 | vartyp[vartypen == "numeric" | vartypen == "integer"] <- 0 221 | vartyp[vartypen == "character"] <- 1 222 | vartyp[vartypen == "Date"] <- 20 223 | vartyp[vartypen == "POSIXct"] <- 22 224 | 225 | if (convert.dates) { 226 | dates <- which(sapply(dat, 227 | function(x) inherits(x, "Date")) 228 | ) 229 | for (v in dates) 230 | dat[[v]] <- as.vector( 231 | julian(dat[[v]], as.Date("1582-10-14", tz = "GMT")) * 24 * 60 * 60 232 | ) 233 | dates <- which( 234 | sapply(dat, function(x) inherits(x, "POSIXt")) 235 | ) 236 | for (v in dates) 237 | dat[[v]] <- as.vector( 238 | round(julian(dat[[v]], ISOdate(1582, 10, 14, tz = tz))) * 24 * 60 * 60 239 | ) 240 | } 241 | 242 | # optional disppar parameter. if none is passed to the function, create a 243 | # default one with a few selected parameters. 244 | # TODO: add a similar logic for varmatrix 245 | if (missing(disppar)) { 246 | 247 | measure <- rep(NA, ncol(dat)) 248 | # nominal if factor, logical or character; else metric 249 | # (nominal 1, ordinal 2, metric 3) 250 | sel <- vartyp == -1 | vartyp == 1 251 | measure[sel] <- 1 252 | measure[!sel] <- 3 253 | 254 | colwidth <- rep(NA, ncol(dat)) 255 | # colwidth 10 if date; else 8 256 | sel <- vartyp == 20 | vartyp == 22 257 | colwidth[sel] <- 10 258 | colwidth[!sel] <- 8 259 | 260 | alignment <- rep(NA, ncol(dat)) 261 | # characters left aligned; else right 262 | # (1 right, 2 center, 3 left) 263 | sel <- vartyp == 1 264 | alignment[sel] <- 3 265 | alignment[!sel] <- 1 266 | 267 | # create disppar matrix 268 | disppar <- matrix(c(measure, colwidth, alignment), 269 | ncol = 3) 270 | } 271 | 272 | # make it flat 273 | disppar <- c(t(disppar)) 274 | 275 | # resize vartyp for long strings 276 | if (length(vartyp) != length(vartypes)) { 277 | vartyp <- resize_vartyp(vtyp, vartyp) 278 | } 279 | 280 | if (length(label) != length(vartypes)) { 281 | label <- resize_vartyp(vtyp, label) 282 | } 283 | 284 | attr(dat, "vtyp") <- vtyp 285 | attr(dat, "vartyp") <- vartyp 286 | attr(dat, "vartypes") <- vartypes 287 | attr(dat, "nvarnames") <- nvarnames 288 | attr(dat, "longvarnames") <- longvarnames 289 | attr(dat, "timestamp") <- timestamp 290 | attr(dat, "datestamp") <- datestamp 291 | attr(dat, "label") <- label 292 | attr(dat, "haslabel") <- ff 293 | attr(dat, "labtab") <- labtab 294 | attr(dat, "itc") <- itc 295 | attr(dat, "cc") <- cc 296 | attr(dat, "disppar") <- disppar 297 | 298 | if (file_ext(filepath) == "zsav") 299 | is_zsav <- TRUE 300 | 301 | if (is_zsav) 302 | message("Zsav compression is still experimental. Testing is welcome!") 303 | 304 | writesav(filepath, dat, compress, debug, is_zsav) 305 | } 306 | -------------------------------------------------------------------------------- /R/readpor.R: -------------------------------------------------------------------------------- 1 | #' read.por 2 | #' 3 | #' Function to read a SPSS por file into a data.frame(). 4 | #'@param file _string_ a por-file to import. can be a file on a computer 5 | #' or an url. in this case the file will be downloaded and read before it is 6 | #' used. 7 | #'@param convert.factors _logical_ if true numeric or character variables 8 | #' will be converted into a factor in R. 9 | #'@param generate.factors _logical_ function to convert variables with 10 | #' partial labels into factors. e.g. 1 - low and 5 - high are provided, labels 11 | #' 2, 3 and 4 will be created. especially useful in combination with 12 | #' `use.missings=TRUE`. 13 | #'@param encoding _logical_ shall values be converted? If true, `read.por()` 14 | #' will try the charcode stored inside the por-file. If this value is 2 or not 15 | #' available, `fromEncoding` can be used to change encoding. 16 | #'@param fromEncoding _character_ encoding of the imported file. This 17 | #' information is stored inside the por-file, but is currently unused. Still 18 | #' this option can be used to define the initial encoding by hand. 19 | #'@param use.missings _logical_ should missing values be converted. 20 | #' Defaults to TRUE. 21 | #' @param debug _logical_ provides additional debug information. Most 22 | #' likely not useful to any user. 23 | #'@param override _logical_ The filename provided in `file` is 24 | #' checked for the ending por. If the file ending is different, nothing is read. 25 | #' This option can be used to override this behavior. 26 | #'@param convert.dates _logical_ Should dates be converted on the fly? 27 | #'@param add.rownames _logical_ If `TRUE`, the first column will be 28 | #' used as rownames. Variable will be dropped afterwards. 29 | #' 30 | #'@details SPSS files are widely available, though for R long time only foreign 31 | #' and memisc provided functions to import por-files. Lately haven joined. 32 | #' This package is an approach to offer another alternative, to document the 33 | #' por-format and provide additional options to import the data. 34 | #' 35 | #'@note Information to decrypt the por-format was provided by tda 36 | #' [www.stat.rub.de/tda.html](http://www.stat.ruhr-uni-bochum.de/tda.html) and 37 | #' pspp [www.gnu.org/software/pspp/](http://www.gnu.org/software/pspp/) 38 | #' 39 | #'@seealso \code{\link[foreign]{read.spss}}, \code{memisc}. 40 | #' 41 | #' @useDynLib readspss, .registration=TRUE 42 | #' @importFrom tools file_ext 43 | #' @importFrom stats na.omit 44 | #' @importFrom utils download.file localeToCharset 45 | #' @export 46 | read.por <- function(file, convert.factors = TRUE, generate.factors = TRUE, 47 | encoding = TRUE, fromEncoding = NULL, use.missings = TRUE, 48 | debug = FALSE, override = FALSE, convert.dates = TRUE, 49 | add.rownames = FALSE) { 50 | 51 | 52 | # Check if path is a url 53 | if (length(grep("^(http|ftp|https)://", file))) { 54 | tmp <- tempfile() 55 | download.file(file, tmp, quiet = TRUE, mode = "wb") 56 | filepath <- tmp 57 | on.exit(unlink(filepath)) 58 | } else { 59 | # construct filepath and read file 60 | filepath <- get.filepath(file) 61 | } 62 | if (!file.exists(filepath)) 63 | return(message("File not found.")) 64 | 65 | file <- file_ext(basename(filepath)) 66 | 67 | if ((tolower(file) != "por") && 68 | !isTRUE(override)) { 69 | warning("Filending is not por. 70 | Use Override if this check should be ignored.") 71 | return(NULL) 72 | } 73 | 74 | encStr <- "" 75 | ownEnc <- localeToCharset(locale = Sys.getlocale("LC_CTYPE"))[1] 76 | 77 | if (!is.null(fromEncoding)) { 78 | encStr <- fromEncoding 79 | } 80 | 81 | if (encoding == FALSE) 82 | encStr <- "NA" 83 | 84 | # import data using an rcpp routine 85 | data <- readpor(filepath, debug, encStr, override) 86 | 87 | attribs <- attributes(data) 88 | 89 | if (NROW(data) == 0) { 90 | message("file contains no data") 91 | use.missings <- FALSE 92 | convert.factors <- FALSE 93 | } 94 | 95 | 96 | labtab <- attribs$labtab 97 | label <- attribs$label 98 | labnames <- names(labtab) 99 | varnames <- attribs$names 100 | vartypes <- attribs$vartypes 101 | fmt <- attribs$fmt 102 | 103 | fmt <- do.call(rbind, fmt) 104 | attr(data, "fmt") <- fmt 105 | 106 | 107 | # convert NAs by missing information provided by SPSS. 108 | # these are just different missing values in Stata and NA in R. 109 | if (use.missings) { 110 | if (!identical(attribs$missings, list())) { 111 | 112 | mvtab <- attribs$missings 113 | 114 | for (i in seq_along(mvtab)) { 115 | mvtabi <- mvtab[[i]] 116 | 117 | missinf <- names(mvtabi) 118 | naval <- mvtabi[[1]] 119 | 120 | data[missinf][data[missinf] == naval] <- NA 121 | 122 | } 123 | } 124 | 125 | varrange <- attribs$varrange 126 | 127 | if (!identical(varrange, list())) { 128 | 129 | for (i in seq_along(varrange)) { 130 | 131 | # range 132 | mvtabi <- varrange[[i]] 133 | missinf <- names(varrange[i]) 134 | 135 | minval <- mvtabi[1] 136 | maxval <- mvtabi[2] 137 | 138 | data[missinf][data[missinf] >= minval & 139 | data[missinf] <= maxval] <- NA 140 | 141 | } 142 | 143 | } 144 | 145 | lothrux <- attribs$lothrux 146 | 147 | if (!identical(lothrux, list())) { 148 | for (i in seq_along(lothrux)) { 149 | 150 | # range 151 | mvtabi <- lothrux[[i]] 152 | missinf <- names(mvtabi) 153 | 154 | maxval <- mvtabi 155 | 156 | data[missinf][data[missinf] <= maxval] <- NA 157 | 158 | } 159 | 160 | } 161 | 162 | xthruhi <- attribs$xthruhi 163 | 164 | if (!identical(xthruhi, list())) { 165 | for (i in seq_along(xthruhi)) { 166 | 167 | # range 168 | mvtabi <- xthruhi[[i]] 169 | missinf <- names(mvtabi) 170 | 171 | maxval <- mvtabi 172 | 173 | data[missinf][data[missinf] >= maxval] <- NA 174 | 175 | } 176 | 177 | } 178 | } 179 | 180 | 181 | # if autoenc labels were not encoded during readsav() so encode now 182 | if (encoding) { 183 | 184 | # label 185 | for (i in seq_along(labtab)) 186 | names(labtab[[i]]) <- read.encoding(names(labtab[[i]]), 187 | fromEncoding = encStr, 188 | encoding = ownEnc) 189 | 190 | label <- read.encoding(label, 191 | fromEncoding = encStr, 192 | encoding = ownEnc) 193 | 194 | 195 | for (v in (seq_along(data))[vartypes > 0]) { 196 | data[, v] <- read.encoding(data[, v], 197 | fromEncoding = encStr, 198 | encoding = ownEnc) 199 | } 200 | } 201 | 202 | 203 | # FixME: unsure 204 | if (convert.factors) { 205 | # vnames <- names(data) 206 | for (i in seq_along(labtab)) { 207 | 208 | labname <- labnames[[i]] 209 | labtable <- labtab[[i]] 210 | 211 | for (j in labname) { 212 | varname <- varnames[which(varnames == j)] 213 | isNum <- is.numeric(data[, varname]) 214 | anyNA <- any(is.na(labtable)) 215 | 216 | # get unique values / omit NA unless NA already in labtable 217 | if (anyNA) { 218 | varunique <- unique(data[[varname]]) 219 | } else { 220 | varunique <- na.omit(unique(data[[varname]])) 221 | } 222 | 223 | if (isNum || all(is.na(labtable))) { 224 | nam <- names(labtable) 225 | labtable <- as.numeric(labtable) 226 | names(labtable) <- nam 227 | } 228 | 229 | # assign label if label set is complete 230 | if (all(varunique %in% labtable)) { 231 | data[[varname]] <- fast_factor(data[[varname]], y = labtable) 232 | 233 | # else generate labels from codes 234 | } else { 235 | if (generate.factors) { 236 | 237 | names(varunique) <- as.character(varunique) 238 | 239 | gen.lab <- 240 | sort(c(varunique[!varunique %in% labtable], labtable), 241 | na.last = TRUE) 242 | 243 | if (isNum) { 244 | nam <- names(gen.lab) 245 | gen.lab <- as.integer(gen.lab) 246 | names(gen.lab) <- nam 247 | } 248 | 249 | data[[varname]] <- fast_factor(data[[varname]], y = gen.lab) 250 | } else { 251 | warning( 252 | paste( 253 | names(data)[i], "Missing factor labels - no labels assigned. 254 | Set option generate.factors=T to generate labels." 255 | ) 256 | ) 257 | } 258 | } 259 | } 260 | } 261 | } 262 | 263 | 264 | 265 | 266 | if (convert.dates) { 267 | 268 | nams <- names(data) 269 | isdate <- fmt[, 1] %in% c(20, 23, 24, 28, 29, 30, 38, 39) 270 | isdatetime <- fmt[, 1] %in% c(22, 41) 271 | istime <- fmt[, 1] %in% c(21, 25, 40) 272 | 273 | if (any(isdate)) { 274 | for (nam in nams[isdate]) { 275 | # the tda function does not always provide integers. in rare cases the 276 | # date conversion might be off by a day e.g. "13770950400". This is 277 | # avoided by rounding the value first 278 | data[[nam]] <- as.Date(as.POSIXct( 279 | round(data[[nam]]), origin = "1582-10-14")) 280 | } 281 | } 282 | if (any(isdatetime)) { 283 | for (nam in nams[isdatetime]) { 284 | data[[nam]] <- as.POSIXct( 285 | data[[nam]], 286 | origin = "1582-10-14", 287 | tz = "GMT") 288 | } 289 | } 290 | if (any(istime)) { 291 | message( 292 | "time format found for:\n", 293 | paste(nams[istime], collapse = "\n"), 294 | "\ntime variables are not dates and thus not converted." 295 | ) 296 | } 297 | 298 | } 299 | 300 | 301 | attr(data, "labtab") <- labtab 302 | attr(data, "label") <- label 303 | 304 | if (add.rownames) { 305 | rownames(data) <- data[[1]] 306 | data[[1]] <- NULL 307 | } 308 | 309 | # return 310 | return(data) 311 | 312 | } 313 | -------------------------------------------------------------------------------- /src/writesav.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2014-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | #include 22 | #include 23 | 24 | #include "spss.h" 25 | #include "write_data.h" 26 | #include "write_sav_compress.h" 27 | 28 | //' writes the binary SPSS file 29 | //' 30 | //' @param filePath The full systempath to the dta file you want to import. 31 | //' @param dat the data frame 32 | //' @param compress the file 33 | //' @param debug print debug information 34 | //' @param is_zsav write zsav 35 | //' @import Rcpp 36 | //' @keywords internal 37 | //' @noRd 38 | // [[Rcpp::export]] 39 | void writesav(const char * filePath, Rcpp::DataFrame dat, uint8_t compress, 40 | bool debug, bool is_zsav) 41 | { 42 | int32_t kk = dat.size(), k = 0; 43 | int64_t n = dat.nrows(); 44 | 45 | 46 | std::fstream sav (filePath, std::ios::out | std::ios::binary); 47 | if (sav.is_open()) 48 | { 49 | 50 | bool swapit = 0; 51 | 52 | int32_t rtype = 0, subtyp = 0, size = 0, count = 0; 53 | std::string empty = ""; 54 | 55 | info_t info; 56 | 57 | info.vtyp = dat.attr("vtyp"); 58 | info.cc = dat.attr("cc"); 59 | info.itc = dat.attr("itc"); 60 | info.vartypes = dat.attr("vartypes"); 61 | info.vartyp = dat.attr("vartyp"); 62 | 63 | info.nvarnames = dat.attr("nvarnames"); 64 | info.label = dat.attr("label"); 65 | 66 | info.haslabel = dat.attr("haslabel"); 67 | info.labtab = dat.attr("labtab"); 68 | info.disppar = dat.attr("disppar"); 69 | 70 | std::string timestamp = Rcpp::as(dat.attr("timestamp")); 71 | std::string datestamp = Rcpp::as(dat.attr("datestamp")); 72 | std::string longvarname = Rcpp::as(dat.attr("longvarnames")); 73 | 74 | // write correct k for string variables with nchar() > 8 75 | if (info.nvarnames.size() > kk) 76 | k = info.nvarnames.size(); 77 | else 78 | k = kk; 79 | 80 | // Rprintf("k is: %d - kk is: %d\n", k, kk); 81 | 82 | std::string spss = "$FL2@(#)"; 83 | if (is_zsav) 84 | spss = "$FL3@(#)"; 85 | writestr(spss, spss.size(), sav); 86 | 87 | std::string datalabel = "readspss 0.15.2"; 88 | writestr(datalabel, 56, sav); 89 | 90 | int32_t arch = 2; 91 | writebin(arch, sav, swapit); 92 | 93 | // int32_t kk = -1; 94 | writebin(k, sav, swapit); 95 | 96 | int32_t cflag=0, cwvariables = 0; 97 | 98 | if (compress) 99 | cflag = 1; 100 | if (is_zsav) 101 | cflag = 2; 102 | 103 | writebin(cflag, sav, swapit); 104 | writebin(cwvariables, sav, swapit); 105 | 106 | // int64_t nn = -1; 107 | writebin((int32_t)n, sav, swapit); 108 | 109 | double bias = 100; 110 | writebin(bias, sav, swapit); 111 | 112 | writestr(datestamp, datestamp.size(), sav); 113 | 114 | writestr(timestamp, timestamp.size(), sav); 115 | 116 | std::string filelabel (67, ' '); 117 | writestr(filelabel, filelabel.size(), sav); 118 | 119 | // rtype 2 ----------------------------------------------------------------- 120 | // start variable part 121 | for (int i = 0; i < info.vartypes.size(); ++i) { 122 | rtype = 2; 123 | writebin(rtype, sav, swapit); 124 | 125 | int32_t isdate = info.vartyp[i]; 126 | int32_t subtyp = info.vartypes[i]; 127 | writebin(subtyp, sav, swapit); 128 | 129 | int32_t vlflag = 0; 130 | 131 | if (k == info.label.size()) 132 | vlflag = 1; 133 | 134 | writebin(vlflag, sav, swapit); // Label flag 135 | 136 | int32_t nmiss = 0; 137 | writebin(nmiss, sav, swapit); 138 | 139 | uint8_t unk41 = 0, unk42 = 0, unk43 = 0, unk44 = 0; 140 | // numeric 141 | if (subtyp == 0) { 142 | // factor 143 | if (isdate == -1) { 144 | unk41 = 0; // digits print format 145 | unk42 = 8; // field width 146 | unk43 = 5; // column format 147 | unk44 = 0; // not used? 148 | } 149 | // digit value 150 | if (isdate == 0) { 151 | unk41 = 2; // digits print format 152 | unk42 = 8; // field width 153 | unk43 = 5; // column format 154 | unk44 = 0; // not used? 155 | } 156 | // date 157 | if (isdate == 20 || isdate == 22) { 158 | unk41 = 0; 159 | unk42 = 10; 160 | unk43 = 39; 161 | unk44 = 0; 162 | } 163 | } else if (subtyp > 0) { 164 | // character 165 | unk41 = 0; 166 | unk42 = subtyp; 167 | unk43 = 1; 168 | unk44 = 0; 169 | } else if (subtyp == -1) { 170 | unk41 = 1; 171 | unk42 = 29; 172 | unk43 = 1; 173 | unk44 = 0; 174 | } 175 | writebin(unk41, sav, 0); 176 | writebin(unk42, sav, 0); 177 | writebin(unk43, sav, 0); 178 | writebin(unk44, sav, 0); 179 | 180 | uint8_t unk51 = 0, unk52 = 0, unk53 = 0, unk54 = 0; 181 | // numeric 182 | if (subtyp == 0) { 183 | // factor value 184 | if (isdate == -1) { 185 | unk51 = 0; // digits format 186 | unk52 = 8; // field width 187 | unk53 = 5; // column format 188 | unk54 = 0; // not used? 189 | } 190 | // digit value 191 | if (isdate == 0) { 192 | unk51 = 2; // digits format 193 | unk52 = 8; // field width 194 | unk53 = 5; // column format 195 | unk54 = 0; // not used? 196 | } 197 | // date 198 | if (isdate == 20 || isdate == 22) { 199 | unk51 = 0; 200 | unk52 = 10; 201 | unk53 = 39; 202 | unk54 = 0; 203 | } 204 | } else if (subtyp > 0) { 205 | // character 206 | unk51 = 0; 207 | unk52 = subtyp; 208 | unk53 = 1; 209 | unk54 = 0; 210 | } else if (subtyp == -1) { 211 | unk41 = 1; 212 | unk42 = 29; 213 | unk43 = 1; 214 | unk44 = 0; 215 | } 216 | writebin(unk51, sav, 0); 217 | writebin(unk52, sav, 0); 218 | writebin(unk53, sav, 0); 219 | writebin(unk54, sav, 0); 220 | 221 | std::string nvarname = Rcpp::as(info.nvarnames[i]); 222 | writestr(nvarname, 8, sav); 223 | 224 | if (vlflag == 1) { 225 | 226 | std::string lab = Rcpp::as(info.label[i]); 227 | 228 | int32_t origlen = 0; 229 | origlen = lab.size(); 230 | origlen = ceil((double)origlen/4) * 4; 231 | 232 | writebin(origlen, sav, swapit); 233 | writestr(lab, origlen, sav); 234 | } 235 | 236 | } 237 | 238 | if(!Rf_isNull(info.haslabel)) 239 | { 240 | 241 | // rtype 3 --------------------------------------------------------------- 242 | 243 | int32_t nolabels = info.haslabel.size(); 244 | 245 | for (int i = 0; i < nolabels; ++i) { 246 | 247 | rtype = 3; 248 | writebin(rtype, sav, swapit); 249 | 250 | Rcpp::IntegerVector code = info.labtab(i); 251 | 252 | std::vector labs = code.attr("names"); 253 | 254 | int32_t nolab = code.size(); 255 | writebin(nolab, sav, swapit); 256 | 257 | for (int j = 0; j < nolab; ++j) { 258 | 259 | double coden = code[j]; 260 | std::string lab = labs[j]; 261 | 262 | writebin(coden, sav, swapit); 263 | uint8_t lablen = lab.size(); 264 | if (lablen > 120) { 265 | lablen = 120; 266 | Rcpp::warning("Label longer than 120 characters found. Trimmed to 120."); 267 | } 268 | 269 | writebin(lablen, sav, swapit); 270 | 271 | lablen = ( ceil((double)(lablen+1)/8) * 8 ) - 1; 272 | writestr(lab, lablen, sav); 273 | 274 | } 275 | 276 | 277 | // rtype 4 ------------------------------------------------------------- 278 | 279 | rtype = 4; 280 | writebin(rtype, sav, swapit); 281 | 282 | 283 | // if multipe variables share a single value this will be a vector 284 | int32_t nolabel = 1; 285 | 286 | writebin(nolabel, sav, swapit); 287 | 288 | 289 | int32_t lab_id = 0; 290 | 291 | lab_id = info.haslabel[i]; 292 | 293 | writebin(lab_id, sav, swapit); 294 | } 295 | 296 | } 297 | 298 | 299 | 300 | // rtype 7 ----------------------------------------------------------------- 301 | 302 | // beign disppar 303 | rtype = 7; 304 | writebin(rtype, sav, swapit); 305 | 306 | subtyp = 11; 307 | writebin(subtyp, sav, swapit); 308 | 309 | size = 4; 310 | writebin(size, sav, swapit); 311 | 312 | count = info.disppar.size(); 313 | writebin(count, sav, swapit); 314 | 315 | for (auto i = 0; i < count; ++i) { 316 | int32_t measure = info.disppar[i]; 317 | writebin(measure, sav, swapit); 318 | } 319 | // end disppar 320 | 321 | if (longvarname.compare(empty) != 0) { 322 | // beign longvarnames 323 | rtype = 7; 324 | writebin(rtype, sav, swapit); 325 | 326 | subtyp = 13; 327 | writebin(subtyp, sav, swapit); 328 | 329 | size = 1; 330 | writebin(size, sav, swapit); 331 | 332 | count = longvarname.size(); 333 | writebin(count, sav, swapit); 334 | 335 | writestr(longvarname, longvarname.size(), sav); 336 | // end longvarnames 337 | } 338 | 339 | rtype = 999; 340 | 341 | writebin(rtype, sav, swapit); 342 | 343 | int32_t unk8 = 0; 344 | writebin(unk8, sav, swapit); 345 | 346 | if (is_zsav) { 347 | 348 | // write to temporary file 349 | // in this logic outfile = sav and sav = zsav 350 | const std::string tempstr = ".readspss_zsa_tmp_file"; 351 | std::fstream tmp (tempstr, std::ios::out | std::ios::binary); 352 | if (!tmp.is_open()) Rcpp::stop("tmp not open"); 353 | write_data(dat, cflag, n, kk, &info, tmp, swapit); 354 | tmp.close(); 355 | 356 | // write zsav body 357 | write_sav_compress(sav, tempstr, swapit, debug); 358 | 359 | // remove tempfile 360 | if (debug) Rcpp::Rcout << tempstr.c_str() << std::endl; 361 | std::remove(tempstr.c_str()); 362 | 363 | } else { 364 | // write data part 365 | write_data(dat, cflag, n, kk, &info, sav, swapit); 366 | } 367 | 368 | sav.close(); 369 | } 370 | 371 | } 372 | -------------------------------------------------------------------------------- /tests/testthat/test_read.R: -------------------------------------------------------------------------------- 1 | 2 | # prepare packages ############################################################# 3 | 4 | require(testthat) 5 | library(readspss) 6 | require(foreign) 7 | 8 | # foreign test files ########################################################### 9 | 10 | # electric 11 | 12 | df <- df_r <- df_h <- df_f <- NULL 13 | df <- system.file("extdata", "electric.sav", package = "readspss") 14 | 15 | df_r <- read.sav(df, convert.factors = FALSE, use.missings = FALSE) 16 | 17 | df_f <- foreign::read.spss(df, to.data.frame = TRUE, use.value.labels = FALSE, 18 | use.missings = FALSE, stringsAsFactors = FALSE) 19 | 20 | test_that("electric", { 21 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 22 | }) 23 | 24 | 25 | # testdata 26 | 27 | df <- df_r <- df_h <- df_f <- NULL 28 | df <- system.file("extdata", "testdata.sav", package = "readspss") 29 | 30 | df_r <- read.sav(df, convert.factors = FALSE, use.missings = FALSE, 31 | convert.dates = FALSE) 32 | 33 | suppressWarnings(# caused by foreign 34 | df_f <- foreign::read.spss(df, to.data.frame = TRUE, use.value.labels = FALSE, 35 | use.missings = FALSE, stringsAsFactors = FALSE, 36 | trim_values = TRUE, trim.factor.names = TRUE) 37 | ) 38 | 39 | 40 | df_f[["string_500"]] <- paste0(df_f[["string_500"]], df_f[["STRIN0"]]) 41 | df_f$STRIN0 <- NULL 42 | 43 | 44 | # trim_values does not work? so we trim 45 | chars <- sapply(df_f, is.character) 46 | chars <- names(chars)[chars] 47 | 48 | for (char in chars) { 49 | df_f[[char]] <- trimws(df_f[[char]]) 50 | } 51 | 52 | 53 | test_that("testdata", { 54 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 55 | }) 56 | 57 | 58 | # pspp testfiles ############################################################### 59 | 60 | # v13 61 | 62 | df <- df_r <- df_h <- df_f <- NULL 63 | df <- system.file("extdata", "v13.sav", package = "readspss") 64 | 65 | df_r <- read.sav(df, convert.factors = FALSE, use.missings = FALSE) 66 | 67 | res <- data.frame( 68 | N = 1:2, 69 | A255 = c(paste0("a1", paste(rep("A", 253), collapse = "")), 70 | paste0("a2", paste(rep("X", 253), collapse = ""))), 71 | A258 = c(paste0("b1", paste(rep("B", 256), collapse = "")), 72 | paste0("b2", paste(rep("Y", 256), collapse = ""))), 73 | A2000 = c(paste0("c1", paste(rep("C", 1998), collapse = "")), 74 | paste0("c2", paste(rep("Z", 1998), collapse = ""))), 75 | stringsAsFactors = FALSE 76 | ) 77 | 78 | test_that("third-test", { 79 | expect_true(all.equal(df_r, res, check.attributes = FALSE)) 80 | }) 81 | 82 | 83 | # v14 84 | 85 | df <- df_r <- df_h <- df_f <- res <- NULL 86 | df <- system.file("extdata", "v14.sav", package = "readspss") 87 | 88 | df_r <- read.sav(df, convert.factors = FALSE, use.missings = FALSE) 89 | 90 | 91 | res <- data.frame( 92 | vl255 = c(paste(rep("M", 255), collapse = "")), 93 | vl256 = c(paste(rep("M", 256), collapse = "")), 94 | vl1335 = c(paste(rep("M", 1335), collapse = "")), 95 | vl2000 = c(paste(rep("M", 2000), collapse = "")), 96 | stringsAsFactors = FALSE 97 | ) 98 | 99 | test_that("fourth-test", { 100 | expect_true(all.equal(res, df_r, check.attributes = FALSE)) 101 | }) 102 | 103 | 104 | # haven testfile ############################################################### 105 | 106 | # iris 107 | 108 | df <- df_r <- df_h <- df_f <- NULL 109 | df <- system.file("extdata", "iris.sav", package = "readspss") 110 | 111 | df_r <- read.sav(df, convert.factors = TRUE, use.missings = FALSE) 112 | 113 | data(iris) 114 | 115 | 116 | test_that("sixth-test", { 117 | expect_true(all.equal(df_r, iris, check.attributes = FALSE)) 118 | }) 119 | 120 | # factors ###################################################################### 121 | 122 | # electric 123 | fl <- system.file("extdata", "electric.sav", package = "readspss") 124 | 125 | df_r <- read.sav(fl) 126 | 127 | df_f <- foreign::read.spss(fl, to.data.frame = TRUE) 128 | 129 | test_that("factors", { 130 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 131 | }) 132 | 133 | # hotel 134 | fl <- system.file("extdata", "hotel.sav", package = "readspss") 135 | 136 | df_r <- read.sav(fl) 137 | 138 | df_f <- foreign::read.spss(fl, to.data.frame = TRUE) 139 | 140 | test_that("factors", { 141 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 142 | }) 143 | 144 | 145 | # physiology 146 | fl <- system.file("extdata", "physiology.sav", package = "readspss") 147 | 148 | df_r <- read.sav(fl) 149 | 150 | df_f <- foreign::read.spss(fl, to.data.frame = TRUE) 151 | 152 | test_that("factors", { 153 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 154 | }) 155 | 156 | # repairs 157 | fl <- system.file("extdata", "repairs.sav", package = "readspss") 158 | 159 | df_r <- read.sav(fl) 160 | 161 | df_f <- foreign::read.spss(fl, to.data.frame = TRUE) 162 | 163 | test_that("factors", { 164 | expect_true(all.equal(df_r, df_f, check.attributes = FALSE)) 165 | }) 166 | 167 | 168 | #### zsav test ##### 169 | 170 | fl <- system.file("extdata", "cars.zsav", package = "readspss") 171 | 172 | df_r <- read.sav(fl) 173 | 174 | test_that("zsav", { 175 | expect_true(all.equal(df_r, cars, check.attributes = FALSE)) 176 | }) 177 | 178 | #### encryption test ##### 179 | 180 | flu <- system.file("extdata", "hotel.sav", package = "readspss") 181 | fle <- system.file("extdata", "hotel-encrypted.sav", package = "readspss") 182 | 183 | df_u <- read.sav(flu) 184 | df_e <- read.sav(fle, pass = "pspp") 185 | 186 | test_that("encrypted", { 187 | expect_true(all.equal(df_u, df_e, check.attributes = FALSE)) 188 | }) 189 | 190 | #### por test #### 191 | 192 | 193 | f_sav <- system.file("extdata", "electric.sav", package = "readspss") 194 | f_por <- system.file("extdata", "electric.por", package = "readspss") 195 | 196 | df_sav <- read.sav(f_sav) 197 | df_por <- read.por(f_por) 198 | 199 | 200 | test_that("por_vs_sav", { 201 | expect_true(all.equal(df_sav, df_por, check.attributes = FALSE)) 202 | }) 203 | 204 | #### read.spps test #### 205 | 206 | f_sav <- system.file("extdata", "electric.sav", package = "readspss") 207 | f_por <- system.file("extdata", "electric.por", package = "readspss") 208 | 209 | df_sav <- readspss::read.spss(f_sav) 210 | df_por <- readspss::read.spss(f_por) 211 | 212 | 213 | test_that("por_vs_sav", { 214 | expect_true(all.equal(df_sav, df_por, check.attributes = FALSE)) 215 | }) 216 | 217 | #### time date formats #### 218 | 219 | f_sav <- system.file("extdata", "datetimes.sav", package = "readspss") 220 | 221 | df_sav <- readspss::read.spss(f_sav) 222 | 223 | exp <- structure( 224 | list( 225 | d1 = structure(15736, class = "Date"), d2 = structure(15736, class = "Date"), 226 | a1 = structure(15736, class = "Date"), a2 = structure(15736, class = "Date"), 227 | e1 = structure(15736, class = "Date"), e2 = structure(15736, class = "Date"), 228 | j1 = structure(15736, class = "Date"), j2 = structure(15736, class = "Date"), 229 | s1 = structure(15736, class = "Date"), s2 = structure(15736, class = "Date"), 230 | q1 = structure(15706, class = "Date"), q2 = structure(15706, class = "Date"), 231 | m1 = structure(15706, class = "Date"), m2 = structure(15706, class = "Date"), 232 | w1 = structure(15734, class = "Date"), w2 = structure(15734, class = "Date"), 233 | dt1 = structure(1359594120, tzone = "GMT", class = c("POSIXct", "POSIXt")), 234 | dt2 = structure(1359594153, tzone = "GMT", class = c("POSIXct", "POSIXt")), 235 | dt3 = structure(1359594153.72, tzone = "GMT", class = c("POSIXct", "POSIXt")), 236 | y1 = structure(1359594120, tzone = "GMT", class = c("POSIXct", "POSIXt")), 237 | y2 = structure(1359594153, tzone = "GMT", class = c("POSIXct", "POSIXt")), 238 | y3 = structure(1359594153.72, tzone = "GMT", class = c("POSIXct", "POSIXt")), 239 | w3 = 5, w4 = 5, m3 = 1, m4 = 1, mt1 = 105276, 240 | mt2 = 105276.58, t1 = 105240, t2 = 105276, t3 = 105276.58, 241 | dt4 = 105240, dt5 = 105276, dt6 = 105276.58), 242 | row.names = c(NA, 1L), 243 | datalabel = "SPSS DATA FILE GNU pspp 1.4.1 - x86_64-pc-linux-gnu", 244 | datestamp = "27 May 22", timestamp = "18:27:45", filelabel = "", 245 | vtype = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 246 | 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), 247 | disppar = structure( 248 | c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 249 | 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 250 | 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 251 | 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 8L, 252 | 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 253 | 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), 254 | dim = c(34L, 3L)), 255 | missings = list(), 256 | haslabel = list(), 257 | longstring = "", 258 | longvarname = c("D1=d1", "D2=d2", "A1=a1", "A2=a2", "E1=e1", "E2=e2", "J1=j1", 259 | "J2=j2", "S1=s1", "S2=s2", "Q1=q1", "Q2=q2", "M1=m1", "M2=m2", 260 | "W1=w1", "W2=w2", "DT1=dt1", "DT2=dt2", "DT3=dt3", "Y1=y1", 261 | "Y2=y2", "Y3=y3", "W3=w3", "W4=w4", "M3=m3", "M4=m4", 262 | "MT1=mt1", "MT2=mt2", "T1=t1", "T2=t2", "T3=t3", "DT4=dt4", 263 | "DT5=dt5", "DT6=dt6"), 264 | longmissing = list(), longlabel = list(), cflag = 1L, endian = 2L, 265 | compression = 1L, doc = list(), charcode = 65001L, encoding = "UTF-8", 266 | encStr = "UTF-8", ownEnc = "UTF-8", doenc = FALSE, autoenc = FALSE, 267 | swapit = FALSE, totals = "", dataview = "", extraproduct = "", 268 | class = "data.frame", label = list(), 269 | varmatrix = structure( 270 | c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 271 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 272 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 273 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 274 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 275 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 276 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 277 | 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 9, 11, 8, 10, 8, 10, 5, 278 | 7, 8, 10, 6, 8, 6, 8, 8, 10, 17, 20, 23, 20, 20, 20, 279 | 3, 9, 3, 9, 5, 8, 5, 8, 11, 9, 12, 15, 20, 20, 23, 23, 280 | 38, 38, 24, 24, 39, 39, 29, 29, 28, 28, 30, 30, 22, 22, 281 | 22, 41, 41, 41, 26, 26, 27, 27, 40, 40, 21, 21, 21, 25, 282 | 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 283 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 284 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 285 | 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 9, 11, 8, 10, 286 | 8, 10, 5, 7, 8, 10, 6, 8, 6, 8, 8, 10, 17, 20, 23, 20, 287 | 20, 20, 3, 9, 3, 9, 5, 8, 5, 8, 11, 9, 12, 15, 20, 20, 288 | 23, 23, 38, 38, 24, 24, 39, 39, 29, 29, 28, 28, 30, 30, 289 | 22, 22, 22, 41, 41, 41, 26, 26, 27, 27, 40, 40, 21, 21, 290 | 21, 25, 25, 25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 291 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 292 | 0, 0, 0), 293 | dim = c(34L, 11L)), 294 | var.label = character(0), 295 | lmissing = list() 296 | ) 297 | 298 | # fix for non UTF-8 R (oldrel linux on github atm) 299 | attr(df_sav, "autoenc") <- FALSE 300 | attr(df_sav, "doenc") <- FALSE 301 | attr(df_sav, "ownEnc") <- "UTF-8" 302 | 303 | test_that("time dates", { 304 | expect_equal(exp, df_sav) 305 | }) 306 | -------------------------------------------------------------------------------- /src/spss.h: -------------------------------------------------------------------------------- 1 | #ifndef SPSS_H 2 | #define SPSS_H 3 | 4 | 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include "swap_endian.h" 10 | 11 | inline void rtrim(std::string& s) { 12 | s.erase(std::find_if(s.rbegin(), s.rend(), 13 | [](unsigned char ch) { return !std::isspace(ch); }).base(), 14 | s.end()); 15 | } 16 | 17 | inline void trim(std::string& s) { 18 | // Trim leading spaces 19 | s.erase(s.begin(), std::find_if(s.begin(), s.end(), 20 | [](unsigned char ch) { return !std::isspace(ch); })); 21 | // Trim trailing spaces 22 | rtrim(s); 23 | } 24 | 25 | inline std::vector split( 26 | const std::string& input, 27 | const std::string& delimiters, 28 | bool compress = true 29 | ) { 30 | std::vector result; 31 | std::string token; 32 | std::unordered_set delims(delimiters.begin(), delimiters.end()); 33 | 34 | for (char c : input) { 35 | if (delims.count(c)) { 36 | if (!token.empty() || !compress) { 37 | result.push_back(token); 38 | token.clear(); 39 | } 40 | // if compress == true, skip consecutive delimiters 41 | } else { 42 | token += c; 43 | } 44 | } 45 | if (!token.empty() || !compress) 46 | result.push_back(token); 47 | 48 | if (result.empty()) 49 | result.push_back(""); 50 | 51 | return result; 52 | } 53 | 54 | struct info_t { 55 | Rcpp::IntegerVector vtyp; 56 | Rcpp::IntegerVector cc; 57 | Rcpp::IntegerVector itc; 58 | Rcpp::IntegerVector vartypes; 59 | Rcpp::IntegerVector vartyp; 60 | Rcpp::CharacterVector nvarnames; 61 | Rcpp::CharacterVector label; 62 | Rcpp::IntegerVector haslabel; 63 | Rcpp::List labtab; 64 | Rcpp::IntegerVector disppar; 65 | }; 66 | 67 | template 68 | T readbin( T t , std::istream& sav, bool swapit) 69 | { 70 | if (sav.peek() == EOF) { 71 | Rcpp::stop("Reached EOF"); 72 | } 73 | 74 | if (!sav.read ((char*)&t, sizeof(t))) 75 | Rcpp::stop("readbin: a binary read error occurred"); 76 | if (swapit==0) 77 | return(t); 78 | else 79 | return(swap_endian(t)); 80 | } 81 | 82 | 83 | inline const std::string codepage(int cp) { 84 | std::string res; 85 | 86 | switch(cp) { 87 | case 874: 88 | res = "CP874"; 89 | break; 90 | case 936: 91 | res = "CP936"; 92 | break; 93 | case 1200: 94 | res = "UCS-2LE"; 95 | break; 96 | case 1201: 97 | res = "UCS-2BE"; 98 | break; 99 | case 1250: 100 | res = "CP1250"; 101 | break; 102 | case 1251: 103 | res = "CP1251"; 104 | break; 105 | case 1252: 106 | res = "CP1252"; 107 | break; 108 | case 1253: 109 | res = "CP1253"; 110 | break; 111 | case 1254: 112 | res = "CP1254"; 113 | break; 114 | case 1255: 115 | res = "CP1255"; 116 | break; 117 | case 1256: 118 | res = "CP1256"; 119 | break; 120 | case 1257: 121 | res = "CP1257"; 122 | break; 123 | case 1258: 124 | res = "CP1258"; 125 | break; 126 | case 10000: 127 | res = "macroman"; 128 | break; 129 | case 12000: 130 | res = "UCS-4LE"; 131 | break; 132 | case 12001: 133 | res = "UCS-4BE"; 134 | break; 135 | case 20127: 136 | res = "ASCII"; 137 | break; 138 | case 20866: 139 | res = "koi8-r"; 140 | break; 141 | case 21866: 142 | res = "koi8-u"; 143 | break; 144 | case 28591: 145 | res = "latin1"; 146 | break; 147 | case 28592: 148 | res = "latin2"; 149 | break; 150 | case 28593: 151 | res = "latin3"; 152 | break; 153 | case 28594: 154 | res = "latin4"; 155 | break; 156 | case 28605: 157 | res = "latin-9"; 158 | break; 159 | case 50221: 160 | res = "ISO-2022-JP"; 161 | break; 162 | case 51932: 163 | res = "euc-jp"; 164 | break; 165 | case 65001: 166 | res = "UTF-8"; 167 | break; 168 | } 169 | 170 | return(res); 171 | } 172 | 173 | 174 | template 175 | inline std::string readstring(std::string &mystring, T& sav) 176 | { 177 | 178 | if (!sav.read(&mystring[0], mystring.size())) 179 | Rcpp::stop("char: a binary read error occurred"); 180 | 181 | return(mystring); 182 | } 183 | 184 | 185 | template 186 | inline std::string readstringsize(std::string &mystring, T& sav, int size) 187 | { 188 | 189 | if (!sav.read(&mystring[0], size)) 190 | Rcpp::stop("char: a binary read error occurred"); 191 | 192 | return(mystring); 193 | } 194 | 195 | template 196 | inline std::string readtostring(T& sav) 197 | { 198 | 199 | std::string res(1, '\0'); 200 | res = readstring(res, sav); 201 | 202 | // run until EOF is reached. file does not end on "/" but "Z" 203 | while (sav.peek() != EOF) 204 | { 205 | 206 | std::string next(1, '\0'); 207 | next = readstring(next, sav); 208 | 209 | if ( (res.compare("*") == 0) && 210 | (((next.compare(".") == 0)) || (next.compare("1") == 0) ) ) { 211 | // missing (combine so we can check for "*.") 212 | res = res + next; 213 | 214 | break; 215 | 216 | } else if (next.compare("/") == 0) { 217 | // slash reached return w/o slash 218 | 219 | break; 220 | 221 | } else { 222 | // all good read another value 223 | res = res + next; 224 | } 225 | 226 | } 227 | 228 | return(res); 229 | } 230 | 231 | 232 | // Part of TDA. Program for Transition Data Analysis, written by Goetz Rohwer. 233 | // Copyright (C) 1989,1991-97 Goetz Rohwer. GPL-2 234 | inline int getdigit(char *p, int *err) 235 | { 236 | *err = 0; 237 | int32_t z = 0; 238 | if (*p >= '0' && *p <= '9') 239 | z = (int32_t)(*p - '0'); 240 | else if (*p >= 'A' && *p <= 'T') 241 | z = (int32_t)(10 + *p - 'A'); 242 | else 243 | *err = 1; 244 | 245 | return(z); 246 | } 247 | 248 | 249 | // modified Part of TDA. Program for Transition Data Analysis, written by Goetz 250 | // Rohwer. 251 | // Copyright (C) 1989,1991-97 Goetz Rohwer. GPL-2 252 | inline double dnum(std::string strng) 253 | { 254 | double x = 0.0, man = 0.0, mex = 0.0; /* result, mantissa and exponent */ 255 | int32_t err, neg = 0, pnt = 0, ex = 0, k = 0, n = 0; 256 | char *p = &strng[0], *q; 257 | 258 | q = p; 259 | while (*p && *p == ' ') { 260 | p++; 261 | } 262 | 263 | if (*p == '*') { /* check for internal missing value */ 264 | p += 2; 265 | x = NA_REAL; 266 | return(x); 267 | } 268 | 269 | if (*p == '-') { 270 | neg = 1; 271 | p++; 272 | } 273 | while (*p && *p != '/') { 274 | 275 | if (*p == '.') { 276 | pnt = 1; 277 | n = k; 278 | } 279 | else if (*p == '+') 280 | ex = 1; 281 | else if (*p == '-') 282 | ex = -1; 283 | else { 284 | if (!ex) { 285 | man *= 30.0; 286 | man += (double) getdigit(p, &err); 287 | 288 | if (err) { 289 | Rcpp::stop("Unk0: %d\n", q); 290 | return(0); 291 | } 292 | k++; 293 | if (k > 13) 294 | Rcpp::stop("Warning: found entry with %2d (base-30) digits.\n",k); 295 | } 296 | else { 297 | mex *= 30.0; 298 | mex += (double) getdigit(p, &err); 299 | 300 | if (err) { 301 | Rcpp::stop("Unk1: %d\n", q); 302 | return(0); 303 | } 304 | } 305 | } 306 | p++; 307 | } 308 | if (neg) 309 | man = -man; 310 | 311 | if (pnt) { 312 | k -= n; 313 | while (k--) 314 | man /= 30.0; 315 | } 316 | if (ex == 1) 317 | man *= pow(30.0, mex); 318 | else if (ex == -1) 319 | man /= pow(30.0, mex); 320 | x = man; 321 | 322 | return(x); 323 | } 324 | 325 | 326 | /*--------------------------------------------------------------------------*/ 327 | /* pnum1(fd,n) print integer n to fd (base 30). Update SPSSPtr. */ 328 | 329 | static char DIG30[] = {'0','1','2','3','4','5','6','7','8','9', 330 | 'A','B','C','D','E','F','G','H','I','J', 331 | 'K','L','M','N','O','P','Q','R','S','T'}; 332 | 333 | inline std::string pnum1(int32_t n) 334 | { 335 | int32_t m, r; 336 | char *p, buf[100]; 337 | std::string val_s; 338 | 339 | if (n < 0) { 340 | val_s +="-"; 341 | n = -n; 342 | } 343 | p = buf; 344 | while (n >= 30) { 345 | m = (int32_t) (n / 30); 346 | r = n - 30 * m; 347 | snprintf(p++, 100, "%c",DIG30[r]); 348 | n = m; 349 | } 350 | val_s += DIG30[n]; 351 | while (p > buf) { 352 | val_s += *--p; 353 | } 354 | 355 | return(val_s); 356 | } 357 | 358 | /*--------------------------------------------------------------------------*/ 359 | /* pfnum(fd,x) print floating point number x to fd (base 30). */ 360 | /* Update SPSSPtr. Note: x >= 0.0 */ 361 | 362 | inline std::string pfnum(double x) 363 | { 364 | int32_t i; 365 | double a, b, c, d, e; 366 | double EPSI = std::numeric_limits::epsilon(); 367 | 368 | std::string val_s; 369 | 370 | if (x == 0) 371 | return ("0"); 372 | 373 | e = floor(log(x) / log(30.0)); 374 | b = x / pow(30.0, e); 375 | c = floor(b); 376 | if (c < 0.0 || c >= 30.0) 377 | Rcpp::stop("74"); // no clue what this supposed be 378 | 379 | val_s = DIG30[(int32_t)c]; 380 | b -= c; 381 | if (b > EPSI) { 382 | val_s += "."; 383 | c = 30.0; 384 | for (i = 0; i < 10 ; ++i) { 385 | a = b * c; 386 | d = floor(a); 387 | val_s += DIG30[(int32_t)d]; 388 | b -= d / c; 389 | if (b <= EPSI) 390 | break; 391 | c *= 30; 392 | } 393 | } 394 | i = (int32_t)e; 395 | if (i) { 396 | if (i < 0) { 397 | val_s += "-"; 398 | i = -i; 399 | } 400 | else { 401 | val_s += "+"; 402 | } 403 | val_s += pnum1(i); 404 | }; 405 | 406 | return(val_s); 407 | } 408 | 409 | inline std::string linebreak(std::string& str) 410 | { 411 | for (size_t i = 80; i < str.size(); i += 81) 412 | str.insert(i, "\n"); 413 | 414 | return(str); 415 | } 416 | 417 | inline std::string writestr(std::string mystring, bool slash) { 418 | 419 | std::string val; 420 | 421 | val += pnum1(mystring.size()); 422 | val += "/"; 423 | val += mystring; 424 | 425 | if (slash) 426 | val += "/"; 427 | 428 | return(val); 429 | 430 | } 431 | 432 | 433 | template 434 | inline T Riconv(T &mystring, std::string &encStr) { 435 | 436 | std::string empty = ""; 437 | 438 | if (encStr.compare(empty) != 0) { 439 | 440 | Rcpp::Environment base("package:base"); 441 | Rcpp::Function iconv = base["iconv"]; 442 | 443 | mystring = Rcpp::as( 444 | iconv(mystring, Rcpp::Named("from", encStr), Rcpp::Named("to","")) 445 | ); 446 | } 447 | 448 | return(mystring); 449 | 450 | } 451 | 452 | template 453 | inline T Riconv2(T &mystring, std::string &encStr) { 454 | 455 | std::string empty = ""; 456 | 457 | if (encStr.compare(empty) != 0) { 458 | 459 | Rcpp::Environment base("package:base"); 460 | Rcpp::Function iconv = base["iconv"]; 461 | 462 | mystring = Rcpp::as( 463 | iconv(mystring, Rcpp::Named("from", ""), Rcpp::Named("to",encStr)) 464 | ); 465 | } 466 | 467 | return(mystring); 468 | 469 | } 470 | 471 | 472 | template 473 | inline void writebin(T t, std::fstream& sav, bool swapit) 474 | { 475 | if (swapit==1){ 476 | T t_s = swap_endian(t); 477 | sav.write((char*)&t_s, sizeof(t_s)); 478 | } else { 479 | sav.write((char*)&t, sizeof(t)); 480 | } 481 | } 482 | 483 | 484 | inline void writestr(std::string val_s, int32_t len, std::fstream& sav) 485 | { 486 | 487 | std::stringstream val_stream; 488 | val_stream << std::left << std::setw(len) << std::setfill(' ') << val_s; 489 | std::string val_strl = val_stream.str(); 490 | 491 | sav.write(val_strl.c_str(),val_strl.length()); 492 | 493 | } 494 | 495 | inline std::string b30str (std::string &val_s) { 496 | return std::to_string(std::strtol(val_s.c_str(), NULL, 30)); 497 | } 498 | 499 | inline int32_t b30int (std::string &val_s) { 500 | return std::strtol(val_s.c_str(), NULL, 30); 501 | } 502 | 503 | #endif 504 | -------------------------------------------------------------------------------- /src/read_sav_known_n.cpp: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright (C) 2018-2025 Jan Marvin Garbuszus 3 | * 4 | * This program is free software; you can redistribute it and/or modify it 5 | * under the terms of the GNU General Public License as published by the 6 | * Free Software Foundation; either version 2 of the License, or (at your 7 | * option) any later version. 8 | * 9 | * This program is distributed in the hope that it will be useful, but WITHOUT 10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for 12 | * more details. 13 | * 14 | * You should have received a copy of the GNU General Public License along 15 | * with this program. If not, see . 16 | */ 17 | 18 | #include 19 | #include 20 | #include 21 | 22 | #include "spss.h" 23 | 24 | Rcpp::List read_sav_known_n (Rcpp::List& df, std::fstream& sav, 25 | const bool swapit, const uint8_t cflag, 26 | const bool debug, 27 | int64_t n, int32_t kv, 28 | Rcpp::IntegerVector vtyp, 29 | Rcpp::NumericVector res, 30 | std::vector vartype, 31 | const double lowest, 32 | const double highest, 33 | const int bias) { 34 | 35 | // final position 36 | auto curpos = sav.tellg(); 37 | sav.seekg(0, std::ios_base::end); 38 | auto endoffile = sav.tellg(); 39 | sav.seekg(curpos); 40 | 41 | bool eof = 0; 42 | uint8_t val_b = 0; 43 | int32_t kk = 0; 44 | int64_t nn = 0; 45 | 46 | // data is read in 8 byte chunks. k*n/8 (data remains) 47 | double chunk = 0, val_d = 0; 48 | 49 | 50 | if (debug) { 51 | Rcpp::Rcout << "cflag: " << cflag 52 | << "\ncurpos: " << curpos 53 | << "\nendpos: " << endoffile 54 | << std::endl; 55 | } 56 | 57 | 58 | // cflag 1 = compression int8_t - bias 59 | if ((cflag == 1) | (cflag == 2)) { 60 | std::string start = ""; 61 | int32_t res_i = 0, res_kk = 0, kk_i = 0; 62 | 63 | while (!(sav.tellg() == endoffile) && !eof) { // data import until nn = n 64 | 65 | Rcpp::checkUserInterrupt(); 66 | 67 | 68 | // data is stored rowwise-ish. 69 | 70 | // chunk is 8 bit long. it gives the structure of the data. If it contains 71 | // only uint8_t it stores 8 vals. If data contains doubles it stores a 72 | // 253 and the next 8 byte will be the double. 73 | 74 | chunk = readbin(val_d, sav, 0); 75 | 76 | Rcpp::IntegerVector chunkvec(8); 77 | 78 | // therefor with respect to the required data structure (numerics and 79 | // strings) the data has to be read. 80 | // e.g. if there are 2 vals, in the first 8 bit may be 4 rows. 81 | 82 | union { 83 | double d; 84 | uint8_t byte[8]; 85 | } u; 86 | 87 | u.d = chunk; 88 | 89 | for (int8_t i=0; i<8; ++i) 90 | { 91 | 92 | val_b = u.byte[i]; 93 | 94 | // 0 = empty 95 | // 1:251 = numeric/string 96 | // each 253 follow up on a string or double in next block 97 | 98 | int32_t len = 0; 99 | int32_t const type = vartype[kk_i]; 100 | len = type; 101 | 102 | // kk_i is index of the original number of variables 103 | // kk_i is reset once kv the new number of varialbes is reachead 104 | ++kk_i; 105 | 106 | 107 | // if (debug) { 108 | // Rprintf("val_b: %d - type: %d - kk: %d - nn: %d\n", 109 | // val_b, type, kk+1, nn+1); 110 | // 111 | // Rprintf("res_i: %d\n", res_i); 112 | // } 113 | 114 | 115 | // res_kk is the amount of chunks required to read until the 116 | // string is completely read 117 | res_kk = res[kk]; 118 | 119 | switch (val_b) 120 | { 121 | 122 | case 0: 123 | { 124 | --kk_i; 125 | --kk; 126 | break; 127 | // ignored 128 | } 129 | 130 | default: // (val_b >= 1 & val_b <= 251) { 131 | { 132 | 133 | switch(type) 134 | { 135 | 136 | case 0: 137 | { 138 | // SPSS compression 139 | if (val_b < lowest || val_b > highest) 140 | val_b = NA_REAL; 141 | 142 | REAL(VECTOR_ELT(df,kk))[nn] = val_b - bias; 143 | 144 | if (debug) 145 | Rprintf("val_b: %d\n", val_b - bias); 146 | 147 | break; 148 | } 149 | 150 | default: 151 | { 152 | 153 | if (len==-1 || (len !=0 && len !=8) ) 154 | len = 8; 155 | 156 | // beginning of a new string 157 | std::string val_s (len, '\0'); 158 | val_s = readstring(val_s, sav); 159 | start.append( val_s ); 160 | 161 | // if res_i == res_kk the full string was read and 162 | // can be written else continue the string 163 | if (res_i == res_kk-1) { 164 | 165 | // trim additional whitespaces to the right 166 | rtrim(start); 167 | 168 | Rcpp::as(df[kk])[nn] = start; 169 | 170 | if (debug) 171 | Rcpp::Rcout << start << std::endl; 172 | 173 | // string completly written, reset start and res_i 174 | // and switch to next cell 175 | start = ""; 176 | res_i = 0; 177 | } else { 178 | // string will be continued 179 | ++res_i; 180 | } 181 | 182 | break; 183 | } 184 | 185 | } 186 | 187 | break; 188 | } 189 | 190 | case 252: 191 | { 192 | // 252 should be end of file, but as many things 193 | // it is not required to be inside the file 194 | eof = true; 195 | 196 | if (debug) 197 | Rcpp::Rcout << "eof: found" << std::endl; 198 | 199 | break; 200 | } 201 | 202 | case 253: 203 | { 204 | // Rcpp::Rcout << "## Debug ... 253" << std::endl; 205 | // Rprintf("nn %d & kk %d \n", nn, kk); 206 | switch(type) 207 | { 208 | 209 | case 0: 210 | { 211 | val_d = readbin(val_d, sav, swapit); 212 | 213 | if (val_d < lowest || val_d > highest) 214 | val_d = NA_REAL; 215 | 216 | REAL(VECTOR_ELT(df,kk))[nn] = val_d; 217 | 218 | if (debug) 219 | Rprintf("%f \n", val_d); 220 | 221 | break; 222 | } 223 | 224 | default: 225 | { 226 | 227 | // spss length 1:251 indicate a string. the value is the string 228 | // size. obvious spss uses the size to determine the size of the 229 | // string. there are two possible problems. 230 | // 1. len can be 1:7 in this case we know the max string size of the 231 | // variable is less than 8 bit long. still the field to read is 8 bit 232 | // long. 233 | // 2. the string is spread across different internal strings. in this 234 | // case we know the max size, still have to read each 8bit field. 235 | // maybe the max size can be used to have a second opinion wheather 236 | // or not a field contains a numeric or character. Following fields 237 | // have len -1. 238 | 239 | if (len==-1 || (len !=0 && len !=8) ) 240 | len = 8; 241 | 242 | std::string val_s (len, '\0'); 243 | val_s = readstring(val_s, sav); 244 | start.append( val_s ); 245 | 246 | 247 | if (res_i == res_kk-1) { 248 | 249 | // trim additional whitespaces to the right 250 | rtrim(start); 251 | 252 | Rcpp::as(df[kk])[nn] = start; 253 | 254 | if (debug) 255 | Rcpp::Rcout << start << std::endl; 256 | 257 | // reset 258 | start = ""; 259 | res_i = 0; 260 | } else { 261 | ++res_i; 262 | } 263 | 264 | 265 | break; 266 | } 267 | 268 | } 269 | 270 | break; 271 | } 272 | 273 | case 254: 274 | { 275 | switch(type) 276 | { 277 | 278 | case 0: 279 | { 280 | // --kk_i; 281 | res_i = 0; 282 | break; 283 | } 284 | 285 | default: 286 | { 287 | // 254 indicates that string chunks read before should be 288 | // interpreted as a single string. 289 | 290 | if (res_i == res_kk-1) { 291 | 292 | // trim additional whitespaces to the right 293 | rtrim(start); 294 | 295 | Rcpp::as(df[kk])[nn] = start; 296 | 297 | if (debug) 298 | Rcpp::Rcout << start << std::endl; 299 | 300 | // reset start 301 | start = ""; 302 | res_i = 0; 303 | } else { 304 | start.append(" "); 305 | ++res_i; 306 | } 307 | 308 | break; 309 | } 310 | } 311 | break; 312 | } 313 | 314 | case 255: 315 | { 316 | // 255 is a missing value in spss files. 317 | // 318 | switch(type) 319 | { 320 | 321 | case 0: 322 | { 323 | // Rcout << NA_REAL << std::endl; 324 | REAL(VECTOR_ELT(df,kk))[nn] = NA_REAL; 325 | break; 326 | } 327 | default: 328 | { 329 | Rcpp::as(df[kk])[nn] = NA_STRING; 330 | break; 331 | } 332 | break; 333 | } 334 | 335 | } 336 | } 337 | 338 | 339 | 340 | // variable is read 341 | if (res_i == 0) 342 | ++kk; 343 | 344 | // Rprintf("kk : %d\n", kk); 345 | 346 | 347 | // Update kk iterator. If kk is k, update nn to start in next row. 348 | if (kk == kv) { 349 | ++nn; 350 | 351 | // if (debug) 352 | // Rprintf("nn : %d - n: %d\n", nn, n); 353 | 354 | // Rprintf("nn: %d", nn); 355 | // some files are not ended with 252, ensure that no out of bounds 356 | // error occures. 357 | if (nn == n) { 358 | eof = true; 359 | 360 | if (debug) 361 | Rcpp::Rcout << "stop: eof" << std::endl; 362 | } 363 | 364 | // reset k and res_kk 365 | kk = 0; 366 | kk_i = 0; 367 | } 368 | 369 | if (eof) 370 | break; 371 | 372 | } 373 | 374 | } 375 | 376 | } 377 | 378 | if (cflag == 0) { 379 | 380 | kk = 0; 381 | 382 | std::string val_s = ""; 383 | 384 | for (int64_t ii = 0; ii < n*kv; ++ii) { 385 | 386 | int32_t const type = vtyp[kk]; 387 | switch(type) 388 | { 389 | 390 | case 0: 391 | { 392 | val_d = readbin(val_d, sav, swapit); 393 | 394 | // Not sure why, but -DBL_MAX is a missing in some sav-files 395 | if (val_d < lowest || val_d > highest || val_d == -DBL_MAX) 396 | val_d = NA_REAL; 397 | 398 | REAL(VECTOR_ELT(df,kk))[nn] = val_d; 399 | break; 400 | } 401 | 402 | default: 403 | { 404 | 405 | double len = type; 406 | 407 | len = ceil(len/8) * 8; 408 | 409 | std::string val_s ((int32_t)len, '\0'); 410 | val_s = readstring(val_s, sav); 411 | 412 | // shorten the string to the actual size reported by SPSS 413 | val_s.erase(type, std::string::npos); 414 | 415 | // trim additional whitespaces 416 | trim(val_s); 417 | 418 | // Rcpp::Rcout << val_s << std::endl; 419 | Rcpp::as(df[kk])[nn] = val_s; 420 | 421 | break; 422 | } 423 | 424 | } 425 | 426 | ++kk; 427 | 428 | if (kk == kv) { 429 | ++nn; 430 | kk = 0; 431 | } 432 | 433 | } 434 | } 435 | 436 | return(df); 437 | } 438 | -------------------------------------------------------------------------------- /R/readsav.R: -------------------------------------------------------------------------------- 1 | #' read.sav 2 | #' 3 | #' Function to read a SPSS sav file into a data.frame(). 4 | #'@param file _string_ a sav-file to import. can be a file on a computer 5 | #' or an url. in this case the file will be downloaded and read before it is 6 | #' used. 7 | #'@param convert.factors _logical_ if true numeric or character variables 8 | #' will be converted into a factor in R. 9 | #'@param generate.factors _logical_ function to convert variables with 10 | #' partial labels into factors. e.g. 1 - low and 5 - high are provided, labels 11 | #' 2, 3 and 4 will be created. especially useful in combination with 12 | #' `use.missings=TRUE`. 13 | #'@param encoding _logical_ shall values be converted? If true, read.sav 14 | #' will try the charcode stored inside the sav-file. If this value is 2 or not 15 | #' available, fromEncoding can be used to change encoding. 16 | #'@param fromEncoding _character._ encoding of the imported file. This 17 | #' information is stored inside the sav-file, but is currently unused. Still 18 | #' this option can be used to define the initial encoding by hand. 19 | #'@param use.missings _logical_ should missing values be converted. 20 | #' Defaults to TRUE. 21 | #' @param debug _logical_ provides additional debug information. Most 22 | #' likely not useful to any user. 23 | #'@param override _logical_. The filename provided in `file` is 24 | #' checked for the ending sav. If the file ending is different, nothing is read. 25 | #' This option can be used to override this behavior. 26 | #'@param convert.dates _logical_. Should dates be converted on the fly? 27 | #'@param add.rownames _logical._ If `TRUE`, the first column will be 28 | #' used as rownames. Variable will be dropped afterwards. 29 | #'@param pass _character_. If encrypted sav should be imported, this is a 30 | #' maximum of ten character encryption key. 31 | #' 32 | #'@details SPSS files are widely available, though for R long time only foreign 33 | #' and memisc provided functions to import sav-files. Lately haven joined. 34 | #' This package is an approach to offer another alternative, to document the 35 | #' sav-format and provide additional options to import the data. 36 | #' sav-files are stored most exclusively as numerics only in compression mode 37 | #' are some integers stored as integers. Still they are returned as numerics. 38 | #' 39 | #'@examples 40 | #' fl <- system.file("extdata", "electric.sav", package = "readspss") 41 | #' dd <- read.sav(fl) 42 | #' 43 | #'@return `readspss` returns a data.frame with additional attributes 44 | #' 45 | #' * _row.names_ rownames 46 | #' * _names_ colnames 47 | #' * _datalabel_ datalabel 48 | #' * _datestamp_ datestamp 49 | #' * _timestamp_ timestamp 50 | #' * _filelabel_ filelabel 51 | #' * _class_ data.frame 52 | #' * _vtype_ SPSS type 0 is usually a numeric/integer 53 | #' * _disppar_ matrix of display parameters if available 54 | #' * _missings_ a list containing information about the missing variables. if 55 | #' `use.missings=TRUE` this Information will be used to generate missings. 56 | #' * _haslabel_ list of variables that contain labels 57 | #' * _longstring_ character vector of long strings if any in file 58 | #' * _longmissing_ character vector of missings in longstrings if any 59 | #' * _longlabel_ character vector of long labels 60 | #' * _cflag_ 0 if uncompressed, 1 if compressed 61 | #' * _endian_ 2 or 3 if little endian else 0 62 | #' * _compression_ compression similar to cflag, somehow stored twice in the 63 | #' sav file 64 | #' * _doc_ list containing documentation information if any 65 | #' * _charcode_ encoding string most likely 2 is CP1252 66 | #' * _encoding_ sometimes sav-file contain encoding as a extra string 67 | #' * _ownEnc_ encoding of the R-session 68 | #' * _doenc_ was the file supposed to be encoded? 69 | #' * _autoenc_ was encoding applied to the file? 70 | #' * _swapit_ were the bytes swapped? 71 | #' * _totals_ character string of totals if any 72 | #' * _dataview_ xml file how the data should be printed 73 | #' * _extraproduct_ additional string provided 74 | #' * _label_ list containing label value information 75 | #' * _varmatrix_ a matrix with information how the data is stored 76 | #' * _var.label_ variable labels 77 | #' * _lmissings_ missings table if any in longstrings 78 | #' 79 | #'@note Information to decrypt the sav-format was provided by tda 80 | #' [www.stat.rub.de/tda.html](http://www.stat.ruhr-uni-bochum.de/tda.html) and 81 | #' pspp [www.gnu.org/software/pspp/](http://www.gnu.org/software/pspp/) 82 | #' 83 | #'@seealso \code{\link[foreign]{read.spss}}, \code{memisc} and 84 | #'\code{haven}. 85 | 86 | #' @useDynLib readspss, .registration=TRUE 87 | #' @importFrom tools file_ext 88 | #' @importFrom stats na.omit 89 | #' @importFrom utils download.file localeToCharset 90 | #' @export 91 | read.sav <- function(file, convert.factors = TRUE, generate.factors = TRUE, 92 | encoding = TRUE, fromEncoding = NULL, use.missings = TRUE, 93 | debug = FALSE, override = FALSE, convert.dates = TRUE, 94 | add.rownames = FALSE, pass) { 95 | 96 | # Check if path is a url 97 | if (length(grep("^(http|ftp|https)://", file))) { 98 | tmp <- tempfile() 99 | download.file(file, tmp, quiet = TRUE, mode = "wb") 100 | filepath <- tmp 101 | on.exit(unlink(filepath)) 102 | } else { 103 | # construct filepath and read file 104 | filepath <- get.filepath(file) 105 | } 106 | if (!file.exists(filepath)) 107 | return(message("File not found.")) 108 | 109 | file <- file_ext(basename(filepath)) 110 | 111 | if ((tolower(file) != "sav" && tolower(file) != "zsav") && 112 | !isTRUE(override)) { 113 | warning("Filending is not sav. 114 | Use Override if this check should be ignored.") 115 | return(NULL) 116 | } 117 | 118 | encStr <- "" 119 | ownEnc <- localeToCharset(locale = Sys.getlocale("LC_CTYPE"))[1] 120 | 121 | if (!is.null(fromEncoding)) { 122 | encStr <- fromEncoding 123 | } 124 | 125 | if (encoding == FALSE) 126 | encStr <- "NA" 127 | 128 | if (missing(pass)) { 129 | # import data using an rcpp routine 130 | data <- readsav(filepath, debug, encStr, ownEnc) 131 | } else { 132 | 133 | if (nchar(pass) > 10) { 134 | warning("pass longer than 10 characters. most likely unwanted") 135 | } 136 | 137 | data <- readencrypted(filepath, debug, encStr, ownEnc, pass) 138 | } 139 | 140 | attribs <- attributes(data) 141 | 142 | attr(data, "vallabels") <- NULL 143 | attr(data, "vartypes") <- NULL 144 | attr(data, "varnames") <- NULL 145 | attr(data, "varmat") <- NULL 146 | attr(data, "label") <- NULL 147 | attr(data, "res") <- NULL 148 | 149 | encStr <- attribs$encStr 150 | autoenc <- attribs$autoenc 151 | label <- attribs$label 152 | val.labels <- attribs$vallabels 153 | varmat <- do.call("rbind", attribs$varmat) 154 | disppar <- attribs$disppar 155 | if (!identical(disppar, integer(0))) { 156 | disppar <- t(matrix(disppar, ncol = NCOL(data))) 157 | } else { 158 | disppar <- NULL 159 | } 160 | if (NROW(data) == 0) 161 | use.missings <- FALSE 162 | 163 | # convert NAs by missing information provided by SPSS. 164 | # these are just different missing values in Stata and NA in R. 165 | if (use.missings) { 166 | if (!identical(attribs$missings, list())) { 167 | 168 | mvtab <- attribs$missings 169 | missinfo <- varmat[, 3] 170 | missinfo <- which(missinfo %in% missinfo[missinfo != 0]) 171 | 172 | 173 | for (i in seq_along(mvtab)) { 174 | mvtabi <- mvtab[[i]] 175 | missinf <- missinfo[i] 176 | 177 | if (mvtabi[1] == 1) { 178 | naval <- mvtabi[2] 179 | 180 | data[missinf][data[missinf] == naval] <- NA 181 | } 182 | 183 | if (mvtabi[1] == 2) { 184 | naval1 <- mvtabi[2] 185 | naval2 <- mvtabi[3] 186 | 187 | data[missinf][data[missinf] == naval1 | 188 | data[missinf] == naval2] <- NA 189 | } 190 | 191 | if (mvtabi[1] == 3) { 192 | naval1 <- mvtabi[2] 193 | naval2 <- mvtabi[3] 194 | naval3 <- mvtabi[4] 195 | 196 | data[missinf][data[missinf] == naval1 | 197 | data[missinf] == naval2 | data[missinf] == naval3] <- NA 198 | } 199 | 200 | 201 | if (mvtabi[1] == -2) { 202 | # range 203 | 204 | minval <- mvtabi[2] 205 | maxval <- mvtabi[3] 206 | 207 | data[missinf][data[missinf] >= minval & 208 | data[missinf] <= maxval] <- NA 209 | 210 | } 211 | 212 | if (mvtabi[1] == -3) { 213 | # range + descrete 214 | 215 | minval <- mvtabi[2] 216 | maxval <- mvtabi[3] 217 | naval <- mvtabi[4] 218 | 219 | data[missinf][(data[missinf] >= minval & 220 | data[missinf] <= maxval) | data[missinf] == naval] <- NA 221 | 222 | } 223 | } 224 | } 225 | 226 | } 227 | 228 | labnames <- attribs$haslabel 229 | varnames <- attribs$varnames 230 | 231 | # if autoenc labels were not encoded during readsav() so encode now 232 | if (encoding && autoenc) { 233 | 234 | # label 235 | for (i in seq_along(label)) 236 | names(label[[i]]) <- read.encoding(names(label[[i]]), 237 | fromEncoding = encStr, 238 | encoding = ownEnc) 239 | } 240 | 241 | if (convert.factors) { 242 | # vnames <- names(data) 243 | for (i in seq_along(label)) { 244 | 245 | labname <- labnames[[i]] 246 | labtable <- label[[i]] 247 | 248 | for (j in labname) { 249 | varname <- varnames[j] 250 | isNum <- is.numeric(data[, varname]) 251 | anyNA <- any(is.na(labtable)) 252 | 253 | # get unique values / omit NA unless NA already in labtable 254 | if (anyNA) { 255 | varunique <- unique(data[[varname]]) 256 | } else { 257 | varunique <- na.omit(unique(data[[varname]])) 258 | } 259 | 260 | if (isNum && all(is.na(labtable))) { 261 | nam <- names(labtable) 262 | labtable <- as.numeric(labtable) 263 | names(labtable) <- nam 264 | } 265 | 266 | # assign label if label set is complete 267 | if (all(varunique %in% labtable)) { 268 | data[[varname]] <- fast_factor(data[[varname]], y = labtable) 269 | 270 | # else generate labels from codes 271 | } else { 272 | if (generate.factors) { 273 | 274 | names(varunique) <- as.character(varunique) 275 | 276 | gen.lab <- 277 | sort(c(varunique[!varunique %in% labtable], labtable), 278 | na.last = TRUE) 279 | 280 | if (isNum) { 281 | nam <- names(gen.lab) 282 | gen.lab <- as.numeric(gen.lab) 283 | names(gen.lab) <- nam 284 | } 285 | 286 | data[[varname]] <- fast_factor(data[[varname]], y = gen.lab) 287 | } else { 288 | warning( 289 | paste( 290 | names(data)[i], "Missing factor labels - no labels assigned. 291 | Set option generate.factors=T to generate labels." 292 | ) 293 | ) 294 | } 295 | } 296 | } 297 | } 298 | } 299 | 300 | if (convert.dates) { 301 | 302 | nams <- names(data) 303 | isdate <- varmat[, 6] %in% c(20, 23, 24, 28, 29, 30, 38, 39) 304 | isdatetime <- varmat[, 6] %in% c(22, 41) 305 | istime <- varmat[, 6] %in% c(21, 25, 40) 306 | 307 | if (any(isdate)) { 308 | for (nam in nams[isdate]) { 309 | data[[nam]] <- as.Date(as.POSIXct( 310 | round(data[[nam]]), origin = "1582-10-14")) 311 | } 312 | } 313 | if (any(isdatetime)) { 314 | for (nam in nams[isdatetime]) { 315 | data[[nam]] <- as.POSIXct( 316 | data[[nam]], 317 | origin = "1582-10-14", 318 | tz = "GMT") 319 | } 320 | } 321 | if (any(istime)) { 322 | message( 323 | "time format found for:\n", 324 | paste(nams[istime], collapse = "\n"), 325 | "\ntime variables are not dates and thus not converted." 326 | ) 327 | } 328 | 329 | } 330 | 331 | 332 | longvarname <- attribs$longvarname 333 | 334 | haslongvarname <- !identical(longvarname, "") & 335 | !identical(longvarname, character(0)) 336 | 337 | if (haslongvarname) { 338 | 339 | # contains long varname (e.g. when longer varnames are provided or if the 340 | # dataset contains long strings) 341 | longname <- lapply(longvarname, boost_split) 342 | 343 | # contains varname and absolute length eg 344 | # A258=00258 345 | longstring <- attribs$longstring 346 | 347 | haslongstring <- !identical(longstring, "") & 348 | !identical(longstring, character(0)) 349 | 350 | # only applicable, if dataset contains longstrings 351 | if (haslongstring) { 352 | 353 | longstring <- strsplit(longstring[!longstring == ""], "=") 354 | 355 | # If the imported data contains strings longer than nchar(255) the data is 356 | # scrambled at this point. SPSS separates longer strings in different 357 | # pieces of size 255. The rcpp import already sorted the data in 358 | # variables. These variables are now combined. Variable names are split 359 | # after five letters used for identification. Since SPSS can use variable 360 | # names of 8 characters they trim the name down to max of 5. They add 361 | # some digit used to identify the order of the long strings. E.g. 362 | # "Var1, Var1001, Var1002", but similar "STRING_5" and "STRIN0". 363 | # Unsure if there is some kind of trustworthy method 364 | 365 | nams <- names(data) 366 | 367 | replvec <- lapply( 368 | longstring, 369 | function(x) { 370 | 371 | nam <- x[[1]] 372 | 373 | # get name and the amount of SPSS strings required to store 374 | # such a string. use this to calculate the number of varnames 375 | # next to the one stated in the string. this is somewhat risky, 376 | # but grepl adds some checks 377 | len <- as.numeric(x[[2]]) 378 | len <- ceiling(len / 255) - 1 379 | 380 | p <- which(nams %in% nam) 381 | if (!identical(p, integer(0))) 382 | nams[p : (p + len)] 383 | }) 384 | 385 | for (i in rev(seq_len(length(replvec)))) { 386 | 387 | pat <- replvec[[i]] 388 | 389 | if (!is.null(pat)) { 390 | 391 | # any variables to combine? 392 | if (length(pat) > 1 && grepl("0", pat[2])) { 393 | sel <- data[, names(data) %in% pat] 394 | 395 | if (all(sapply(sel, is.character))) { 396 | pp <- pat[-1] 397 | p1 <- pat[1] 398 | 399 | remove <- !names(data) %in% pp 400 | 401 | # remove columns pat[2:n] 402 | data <- data[, remove] 403 | 404 | # resize varmat and disppar as well 405 | varmat <- varmat[remove, ] 406 | 407 | if (!is.null(disppar)) 408 | disppar <- disppar[remove, ] 409 | 410 | data[p1] <- do.call(paste0, sel) 411 | 412 | } 413 | } 414 | } 415 | } 416 | } 417 | 418 | # assign names stored in spss 419 | # Previously the dataset used some different internal names usefull for 420 | # combining different long strings. Now everything is cleaned up and we 421 | # can apply the correct variable names 422 | nams <- names(data) 423 | names(nams) <- nams 424 | 425 | # new_nams <- do.call(rbind, longname) 426 | new_nams <- sapply(longname, function(x) { 427 | z <- x[[2]] 428 | names(z) <- x[[1]] 429 | z 430 | }) 431 | 432 | 433 | # for this replace was used, but in the world of sav-files everything is 434 | # possible even files where nams and new_nams differ. replace got confused 435 | # in such cases, which is why this approach is selected 436 | sel <- which(names(new_nams) %in% nams) 437 | 438 | if (!identical(integer(0), sel)) 439 | nams[names(new_nams[sel])] <- new_nams[sel] 440 | 441 | names(data) <- nams 442 | 443 | } 444 | 445 | # again here, because longvarnames has been set and longmissing refers to 446 | # longvarnames. previous use.missings refered to varmat using default 447 | # varnames 448 | if (use.missings) { 449 | 450 | longmiss <- attribs$longmissing 451 | 452 | if (!identical(longmiss, list())) { 453 | 454 | mvars <- names(longmiss) 455 | 456 | for (mvar in mvars) { 457 | 458 | ismiss <- data[[mvar]] %in% longmiss[[mvar]] 459 | 460 | data[[mvar]][ismiss] <- NA 461 | } 462 | } 463 | 464 | } 465 | 466 | longlabel <- attribs$longlabel 467 | 468 | if (convert.factors && !identical(longlabel, list())) { 469 | 470 | longlabnames <- names(longlabel) 471 | 472 | for (i in seq_along(longlabel)) { 473 | 474 | longlabname <- longlabnames[[i]] 475 | labtable <- longlabel[[longlabname]] 476 | 477 | anyNA <- any(is.na(labtable)) 478 | 479 | # get unique values / omit NA unless NA already in labtable 480 | if (anyNA) { 481 | varunique <- unique(data[[longlabname]]) 482 | } else { 483 | varunique <- na.omit(unique(data[[longlabname]])) 484 | } 485 | 486 | # assign label if label set is complete 487 | if (all(varunique %in% labtable)) { 488 | data[[longlabname]] <- fast_factor(data[[longlabname]], y = labtable) 489 | 490 | # else generate labels from codes 491 | } else { 492 | if (generate.factors) { 493 | 494 | names(varunique) <- as.character(varunique) 495 | 496 | gen.lab <- 497 | sort(c(varunique[!varunique %in% labtable], labtable), 498 | na.last = TRUE) 499 | 500 | data[[longlabname]] <- fast_factor(data[[longlabname]], y = gen.lab) 501 | } else { 502 | warning( 503 | paste( 504 | names(data)[i], "Missing factor labels - no labels assigned. 505 | Set option generate.factors=T to generate labels." 506 | ) 507 | ) 508 | } 509 | } 510 | } 511 | } 512 | 513 | # prepare for return 514 | attr(data, "datalabel") <- attribs$datalabel 515 | attr(data, "datestamp") <- attribs$datestamp 516 | attr(data, "timestamp") <- attribs$timestamp 517 | attr(data, "label") <- label 518 | 519 | attr(data, "varmatrix") <- varmat 520 | attr(data, "disppar") <- disppar 521 | attr(data, "var.label") <- val.labels 522 | attr(data, "longlabel") <- attribs$longlabel 523 | attr(data, "missings") <- attribs$missings 524 | attr(data, "lmissing") <- attribs$longmissing 525 | attr(data, "endian") <- attribs$endian 526 | attr(data, "cflag") <- attribs$cflag 527 | attr(data, "encStr") <- attribs$encStr 528 | attr(data, "ownEnc") <- attribs$ownEnc 529 | attr(data, "autoenc") <- attribs$autoenc 530 | attr(data, "doenc") <- attribs$doenc 531 | 532 | 533 | if (add.rownames) { 534 | rownames(data) <- data[[1]] 535 | data[[1]] <- NULL 536 | } 537 | 538 | # return 539 | return(data) 540 | 541 | } 542 | --------------------------------------------------------------------------------