├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── rhub.yaml ├── src ├── .gitignore ├── Makevars ├── R-xxhash-utils.h ├── init.c ├── R-xxhash-raw.c ├── R-xxhash-utils.c ├── R-xxhash-file.c ├── R-xxhash-con.c └── R-xxhash-serialize.c ├── .devcontainer ├── requirements.txt ├── Dockerfile └── devcontainer.json ├── LICENSE ├── R ├── aaa.R └── xxhash.R ├── tests ├── testthat.R └── testthat │ ├── ref │ ├── dbl.bin │ ├── int.bin │ ├── raw.bin │ └── lgl.bin │ ├── test-xxhash.R │ ├── test-xxhash-utils.R │ ├── test-xxhash-con.R │ ├── test-xxhash-file.R │ └── test-xxhash-raw.R ├── .gitignore ├── man ├── figures │ ├── README-unnamed-chunk-4-1.png │ └── README-unnamed-chunk-6-1.png ├── xxhash_file.Rd ├── xxhash.Rd ├── xxhash_con.Rd └── xxhash_raw.Rd ├── .Rbuildignore ├── NAMESPACE ├── LICENSE.md ├── NEWS.md ├── DESCRIPTION ├── inst └── LICENSE-xxHash ├── README.md └── README.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /src/.gitignore: -------------------------------------------------------------------------------- 1 | *.o 2 | *.so 3 | -------------------------------------------------------------------------------- /.devcontainer/requirements.txt: -------------------------------------------------------------------------------- 1 | ydiff 2 | -------------------------------------------------------------------------------- /src/Makevars: -------------------------------------------------------------------------------- 1 | #PKG_CFLAGS += -Wconversion 2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023,2024 2 | COPYRIGHT HOLDER: Mike Cheng 3 | -------------------------------------------------------------------------------- /R/aaa.R: -------------------------------------------------------------------------------- 1 | #' @useDynLib xxhashlite, .registration=TRUE 2 | NULL 3 | 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(xxhashlite) 3 | 4 | test_check("xxhashlite") 5 | -------------------------------------------------------------------------------- /tests/testthat/ref/dbl.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/xxhashlite/HEAD/tests/testthat/ref/dbl.bin -------------------------------------------------------------------------------- /tests/testthat/ref/int.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/xxhashlite/HEAD/tests/testthat/ref/int.bin -------------------------------------------------------------------------------- /tests/testthat/ref/raw.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/xxhashlite/HEAD/tests/testthat/ref/raw.bin -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .Rhistory 3 | *.Rproj 4 | .Rproj.user 5 | *.swp 6 | inst/doc 7 | doc 8 | Meta 9 | working 10 | pkgdown 11 | -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-4-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/xxhashlite/HEAD/man/figures/README-unnamed-chunk-4-1.png -------------------------------------------------------------------------------- /man/figures/README-unnamed-chunk-6-1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coolbutuseless/xxhashlite/HEAD/man/figures/README-unnamed-chunk-6-1.png -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^README.Rmd$ 4 | ^README.md$ 5 | ^working$ 6 | ^LICENSE-xxHash$ 7 | ^\.github$ 8 | ^LICENSE.md$ 9 | ^.devcontainer$ 10 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(xxhash) 4 | export(xxhash_con) 5 | export(xxhash_file) 6 | export(xxhash_raw) 7 | useDynLib(xxhashlite, .registration=TRUE) 8 | -------------------------------------------------------------------------------- /src/R-xxhash-utils.h: -------------------------------------------------------------------------------- 1 | SEXP xxh128_hash_to_robj(XXH128_hash_t hash, SEXP as_raw_); 2 | SEXP xxh64_hash_to_robj(XXH64_hash_t hash, SEXP as_raw_); 3 | SEXP xxh32_hash_to_robj(XXH32_hash_t hash, SEXP as_raw_); 4 | 5 | -------------------------------------------------------------------------------- /tests/testthat/test-xxhash.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | test_that("xxhash() works", { 5 | 6 | dat <- list("test", 1:3, c(10, 11, 12)) 7 | 8 | expect_identical(xxhash(dat, algo="xxh128"), "c2fc4e260dddb5b49271720838923d3a") 9 | expect_identical(xxhash(dat, algo="xxh32" ), "ba28622d") 10 | expect_identical(xxhash(dat, algo="xxh64" ), "c6ba5f15dd3a1ea6") 11 | expect_identical(xxhash(dat, algo="xxh3" ), "15bad8eb82ec503f") 12 | 13 | }) 14 | -------------------------------------------------------------------------------- /tests/testthat/test-xxhash-utils.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("hash as raw vector works", { 3 | 4 | 5 | algos <- c('xxh32', 'xxh64', 'xxh128', 'xxh3') 6 | v <- letters 7 | 8 | for (algo in algos) { 9 | expect_identical( 10 | xxhash(v, algo = 'xxh128', as_raw = FALSE), 11 | paste(xxhash(v, algo = 'xxh128', as_raw = TRUE ), collapse = ""), 12 | label = paste(algo, ":") 13 | ) 14 | } 15 | 16 | 17 | }) 18 | -------------------------------------------------------------------------------- /tests/testthat/test-xxhash-con.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("xxhash_con() works", { 3 | 4 | path <- testthat::test_path("ref") 5 | testfiles <- list.files(path, full.names = TRUE) 6 | algos <- c('xxh32', 'xxh64', 'xxh128', 'xxh3') 7 | 8 | testfile <- testfiles[[1]] 9 | algo <- 'xxh128' 10 | 11 | for (testfile in testfiles) { 12 | for (algo in algos){ 13 | expect_identical( 14 | xxhash_raw(readBin(testfile, raw(), file.size(testfile)), algo = algo), 15 | xxhash_con(file(testfile), algo = algo), 16 | label = paste(algo, testfile, ":") 17 | ) 18 | } 19 | } 20 | }) 21 | -------------------------------------------------------------------------------- /tests/testthat/test-xxhash-file.R: -------------------------------------------------------------------------------- 1 | 2 | test_that("xxhash_file() works", { 3 | 4 | path <- testthat::test_path("ref") 5 | testfiles <- list.files(path, full.names = TRUE) 6 | algos <- c('xxh32', 'xxh64', 'xxh128', 'xxh3') 7 | 8 | testfile <- testfiles[[1]] 9 | algo <- 'xxh128' 10 | 11 | for (testfile in testfiles) { 12 | for (algo in algos){ 13 | expect_identical( 14 | xxhash_raw(readBin(testfile, raw(), file.size(testfile)), algo = algo), 15 | xxhash_file(testfile, algo = algo), 16 | label = paste(algo, testfile, ":") 17 | ) 18 | } 19 | } 20 | }) 21 | -------------------------------------------------------------------------------- /.devcontainer/Dockerfile: -------------------------------------------------------------------------------- 1 | # Pre-built Dev Container Image for R. More info: https://github.com/rocker-org/devcontainer-images/pkgs/container/devcontainer%2Ftidyverse 2 | # Available R version: 4, 4.1, 4.0 3 | ARG VARIANT="4" 4 | FROM ghcr.io/rocker-org/devcontainer/tidyverse:${VARIANT} 5 | 6 | RUN install2.r --error --skipinstalled -n -1 \ 7 | patchwork \ 8 | paletteer \ 9 | here \ 10 | janitor \ 11 | palmerpenguins \ 12 | markdown \ 13 | httpgd \ 14 | languageserver \ 15 | lintr \ 16 | && rm -rf /tmp/downloaded_packages \ 17 | && R -q -e 'remotes::install_github("https://github.com/dcomtois/summarytools/tree/0-8-9")' 18 | 19 | # Install Python packages 20 | COPY requirements.txt /tmp/pip-tmp/ 21 | RUN python3 -m pip --disable-pip-version-check --no-cache-dir install -r /tmp/pip-tmp/requirements.txt --break-system-packages 22 | -------------------------------------------------------------------------------- /man/xxhash_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xxhash.R 3 | \name{xxhash_file} 4 | \alias{xxhash_file} 5 | \title{Calculate the hash of a file} 6 | \usage{ 7 | xxhash_file(file, algo = "xxh128", as_raw = FALSE) 8 | } 9 | \arguments{ 10 | \item{file}{filename} 11 | 12 | \item{algo}{Select the specific xxhash algorithm. Default: 'xxh128'. 13 | (the latest algorithm in the xxhash family) 14 | Valid values: 'xxh32', 'xxh64', 'xxh128', 'xxh3'} 15 | 16 | \item{as_raw}{Return the hash as a raw vector of bytes instead of string? 17 | Default: FALSE. If TRUE, then the raw bytes are returned in big-endian 18 | order - which is what \code{xxHash} considers the \emph{canonical} form.} 19 | } 20 | \value{ 21 | String representation of hash. If \code{as_raw = TRUE} then a 22 | raw vector is returned instead. 23 | } 24 | \description{ 25 | Calculate the hash of a file 26 | } 27 | \examples{ 28 | filename <- system.file('DESCRIPTION', package = 'base', mustWork = TRUE) 29 | xxhash_file(filename) 30 | } 31 | -------------------------------------------------------------------------------- /man/xxhash.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xxhash.R 3 | \name{xxhash} 4 | \alias{xxhash} 5 | \title{Calculate the hash of an arbitrary R object.} 6 | \usage{ 7 | xxhash(robj, algo = "xxh128", as_raw = FALSE) 8 | } 9 | \arguments{ 10 | \item{robj}{Any R object} 11 | 12 | \item{algo}{Select the specific xxhash algorithm. Default: 'xxh128'. 13 | (the latest algorithm in the xxhash family) 14 | Valid values: 'xxh32', 'xxh64', 'xxh128', 'xxh3'} 15 | 16 | \item{as_raw}{Return the hash as a raw vector of bytes instead of string? 17 | Default: FALSE. If TRUE, then the raw bytes are returned in big-endian 18 | order - which is what \code{xxHash} considers the \emph{canonical} form.} 19 | } 20 | \value{ 21 | String representation of hash. If \code{as_raw = TRUE} then a 22 | raw vector is returned instead. 23 | } 24 | \description{ 25 | This function will calculate the hash of any object understood by 26 | \code{base::serialize()}. 27 | } 28 | \examples{ 29 | xxhash(mtcars) 30 | xxhash(mtcars, algo = 'xxh3', as_raw = TRUE) 31 | } 32 | -------------------------------------------------------------------------------- /man/xxhash_con.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xxhash.R 3 | \name{xxhash_con} 4 | \alias{xxhash_con} 5 | \title{Calculate the hash of data from a connection object} 6 | \usage{ 7 | xxhash_con(con, algo = "xxh128", as_raw = FALSE) 8 | } 9 | \arguments{ 10 | \item{con}{connection} 11 | 12 | \item{algo}{Select the specific xxhash algorithm. Default: 'xxh128'. 13 | (the latest algorithm in the xxhash family) 14 | Valid values: 'xxh32', 'xxh64', 'xxh128', 'xxh3'} 15 | 16 | \item{as_raw}{Return the hash as a raw vector of bytes instead of string? 17 | Default: FALSE. If TRUE, then the raw bytes are returned in big-endian 18 | order - which is what \code{xxHash} considers the \emph{canonical} form.} 19 | } 20 | \value{ 21 | String representation of hash. If \code{as_raw = TRUE} then a 22 | raw vector is returned instead. 23 | } 24 | \description{ 25 | Calculate the hash of data from a connection object 26 | } 27 | \examples{ 28 | filename <- system.file('DESCRIPTION', package = 'base', mustWork = TRUE) 29 | xxhash_con(file(filename)) 30 | } 31 | -------------------------------------------------------------------------------- /man/xxhash_raw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/xxhash.R 3 | \name{xxhash_raw} 4 | \alias{xxhash_raw} 5 | \title{Calculate the hash of a raw vector or string} 6 | \usage{ 7 | xxhash_raw(vec, algo = "xxh128", as_raw = FALSE) 8 | } 9 | \arguments{ 10 | \item{vec}{raw vector or single character string} 11 | 12 | \item{algo}{Select the specific xxhash algorithm. Default: 'xxh128'. 13 | (the latest algorithm in the xxhash family) 14 | Valid values: 'xxh32', 'xxh64', 'xxh128', 'xxh3'} 15 | 16 | \item{as_raw}{Return the hash as a raw vector of bytes instead of string? 17 | Default: FALSE. If TRUE, then the raw bytes are returned in big-endian 18 | order - which is what \code{xxHash} considers the \emph{canonical} form.} 19 | } 20 | \value{ 21 | String representation of hash. If \code{as_raw = TRUE} then a 22 | raw vector is returned instead. 23 | } 24 | \description{ 25 | This performs a hash of the raw bytes - not of the serialized representation. 26 | } 27 | \examples{ 28 | vec <- "hello" 29 | xxhash_raw(vec) 30 | vec <- as.raw(c(0x01, 0x02, 0x99)) 31 | xxhash_raw(vec) 32 | } 33 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023,2024 mikefc@coolbutuseless.com 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in 13 | all copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 21 | THE SOFTWARE. 22 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # xxhashlite 0.2.2.9000 2024-04-11 4 | 5 | * Support for hashing data from connections 6 | 7 | # xxhashlite 0.2.2 2024-03-11 8 | 9 | * Rename hash `algo` options to better align with official documentation 10 | * Skip serialization header so results for `algo = xxh128` match `rlang::hash()` 11 | * Hash files, strings and raw vector contents directly (i.e. not using 12 | serialization) 13 | * Return hash as string or raw vector 14 | 15 | # xxhashlite 0.2.1.9000 2024-03-09 16 | 17 | * Update vendored 'xxHash' library to version 0.8.2 18 | 19 | # xxhashlite 0.2.1 2020-08-22 20 | 21 | * Remove `xxhash_vec()` is now redundant. It is slightly faster than `xxhash` 22 | on vanilla vector inputs, but not useful enough generally to maintain. 23 | 24 | # xxhashlite 0.2.0 2020-09-19 25 | 26 | * Refactor: Consolidate hash functions for atomic vectors into a single call with an 27 | `algo` argument. 28 | * Feature: Calculation hashes on any R object understood by `base::serialize()` 29 | 30 | # xxhashlite 0.1.2 31 | 32 | * Update to xxHash v0.8.0 33 | 34 | # xxhashlite 0.1.1 35 | 36 | * Added support for hashing of vectors of complex numbers 37 | 38 | # xxhashlite 0.1.0 39 | 40 | * Initial release. 41 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: xxhashlite 2 | Type: Package 3 | Title: Extremely Fast Hashing of R Objects, Raw Data and Files using 'xxHash' Algorithms 4 | Version: 0.2.2 5 | Authors@R: c( 6 | person("Mike", "Cheng", role = c("aut", "cre", 'cph'), email = "mikefc@coolbutuseless.com"), 7 | person("Yann", "Collet", role = c("ctb", "cph"), comment = "Author of the embedded xxhash library") 8 | ) 9 | Maintainer: Mike Cheng 10 | Description: Extremely fast hashing of R objects using 'xxHash'. R objects are hashed via 11 | the standard serialization mechanism in R. Raw byte vectors and strings 12 | can be handled directly for compatibility with hashes created on 13 | other systems. This implementation is a wrapper around the 'xxHash' 'C' 14 | library which is available from . 15 | License: MIT + file LICENSE 16 | URL: https://github.com/coolbutuseless/xxhashlite 17 | BugReports: https://github.com/coolbutuseless/xxhashlite/issues 18 | Encoding: UTF-8 19 | RoxygenNote: 7.3.1 20 | Suggests: 21 | testthat 22 | Depends: 23 | R (>= 3.5.0) 24 | Copyright: This package includes code from the 'xxhash' written Yann Collet. 25 | See file 'inst/LICENSE-xxhash' for copyright information of the 26 | original library. 27 | -------------------------------------------------------------------------------- /src/init.c: -------------------------------------------------------------------------------- 1 | 2 | #include 3 | #include 4 | 5 | SEXP xxhash_ (SEXP robj_, SEXP algo_, SEXP as_raw_); 6 | SEXP xxhash_raw_ (SEXP robj_, SEXP algo_, SEXP as_raw_); 7 | SEXP xxhash_file_(SEXP file_, SEXP algo_, SEXP as_raw_); 8 | SEXP xxhash_con_ (SEXP con_ , SEXP algo_, SEXP as_raw_); 9 | 10 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 11 | // .C R_CMethodDef 12 | // .Call R_CallMethodDef 13 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 14 | static const R_CallMethodDef CEntries[] = { 15 | {"xxhash_" , (DL_FUNC) &xxhash_ , 3}, 16 | {"xxhash_raw_" , (DL_FUNC) &xxhash_raw_ , 3}, 17 | {"xxhash_file_", (DL_FUNC) &xxhash_file_, 3}, 18 | {"xxhash_con_" , (DL_FUNC) &xxhash_con_ , 3}, 19 | {NULL, NULL, 0} 20 | }; 21 | 22 | 23 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 24 | // Register the methods 25 | // 26 | // Change the '_simplecall' suffix to match your package name 27 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 28 | void R_init_xxhashlite(DllInfo *info) { 29 | R_registerRoutines( 30 | info, // DllInfo 31 | NULL, // .C 32 | CEntries, // .Call 33 | NULL, // Fortran 34 | NULL // External 35 | ); 36 | R_useDynamicSymbols(info, FALSE); 37 | } 38 | -------------------------------------------------------------------------------- /inst/LICENSE-xxHash: -------------------------------------------------------------------------------- 1 | xxHash Library 2 | Copyright (c) 2012-2021 Yann Collet 3 | All rights reserved. 4 | 5 | BSD 2-Clause License (https://www.opensource.org/licenses/bsd-license.php) 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, this 14 | list of conditions and the following disclaimer in the documentation and/or 15 | other materials provided with the distribution. 16 | 17 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 21 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 24 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 27 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | 31 | steps: 32 | - uses: actions/checkout@v4 33 | 34 | - uses: r-lib/actions/setup-pandoc@v2 35 | 36 | - uses: r-lib/actions/setup-r@v2 37 | with: 38 | r-version: ${{ matrix.config.r }} 39 | http-user-agent: ${{ matrix.config.http-user-agent }} 40 | use-public-rspm: true 41 | 42 | - uses: r-lib/actions/setup-r-dependencies@v2 43 | with: 44 | extra-packages: any::rcmdcheck 45 | needs: check 46 | 47 | - uses: r-lib/actions/check-r-package@v2 48 | with: 49 | upload-snapshots: true 50 | build_args: 'c("--no-manual","--compact-vignettes=gs+qpdf")' 51 | -------------------------------------------------------------------------------- /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "R Data Science Environment", 3 | "build": { 4 | "dockerfile": "Dockerfile", 5 | // Update VARIANT to pick a specific R version: 4, 4.1, 4.0 6 | // More info: https://github.com/rocker-org/devcontainer-images/pkgs/container/devcontainer%2Ftidyverse 7 | "args": { "VARIANT": "4" } 8 | }, 9 | 10 | // Install Dev Container Features. More info: https://containers.dev/features 11 | "features": { 12 | "ghcr.io/rocker-org/devcontainer-features/quarto-cli:1": {} 13 | // "ghcr.io/rocker-org/devcontainer-features/rstudio-server:0": {} 14 | // Install JupyterLab and IRkernel. 15 | // More info: https://github.com/rocker-org/devcontainer-templates/tree/main/src/r-ver 16 | //"ghcr.io/rocker-org/devcontainer-features/r-rig:1": { 17 | // "version": "none", 18 | // "installJupyterlab": true 19 | //} 20 | }, 21 | 22 | "customizations": { 23 | //"vscode": { 24 | // "extensions": [ 25 | // // Add Jupyter and Python vscode extensions 26 | // "ms-toolsai.jupyter", 27 | // "ms-toolsai.jupyter-renderers", 28 | // "ms-python.python", 29 | // "ms-python.vscode-pylance", 30 | // "vsls-contrib.codetour", 31 | // "GitHub.copilot" 32 | // ] 33 | //} 34 | }, 35 | 36 | // Forward Jupyter and RStudio ports 37 | "forwardPorts": [8787, 8888], 38 | "portsAttributes": { 39 | "8787": { 40 | "label": "Rstudio", 41 | "requireLocalPort": true, 42 | "onAutoForward": "ignore" 43 | } //, 44 | //"8888": { 45 | // "label": "Jupyter", 46 | // "requireLocalPort": true, 47 | // "onAutoForward": "ignore" 48 | //} 49 | }, 50 | 51 | // Use 'postAttachCommand' to run commands after the container is started. 52 | "postAttachCommand": "sudo rstudio-server start" 53 | 54 | // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root 55 | // "remoteUser": "root" 56 | } 57 | -------------------------------------------------------------------------------- /src/R-xxhash-raw.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | 6 | 7 | #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ 8 | #define XXH_IMPLEMENTATION /* access definitions */ 9 | 10 | #include "xxhash.h" 11 | #include "R-xxhash-utils.h" 12 | 13 | 14 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | // Serialize an R object 16 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 17 | SEXP xxhash_raw_(SEXP robj_, SEXP algo_, SEXP as_raw_) { 18 | 19 | const char *algo = CHAR(asChar((algo_))); 20 | 21 | void *src; 22 | size_t len; 23 | char *tmp; 24 | 25 | if (TYPEOF(robj_) == RAWSXP) { 26 | src = (void *)RAW(robj_); 27 | len = (size_t)length(robj_); 28 | } else if (TYPEOF(robj_) == STRSXP) { 29 | if (length(robj_) != 1) { 30 | error("xxhash_raw_(): Only single string expected"); 31 | } 32 | tmp = (char *)CHAR(STRING_ELT(robj_, 0)); 33 | src = (void *)tmp; 34 | len = strlen(tmp); 35 | } else { 36 | error("xxhash_raw_(): Only raw vectors and strings are supported"); 37 | } 38 | 39 | SEXP res_ = R_NilValue; 40 | 41 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 42 | // Set up the state 43 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | if (strcmp(algo, "xxh128") == 0) { 45 | XXH128_hash_t const hash = XXH3_128bits(src, len); 46 | res_ = PROTECT(xxh128_hash_to_robj(hash, as_raw_)); 47 | } else if (strcmp(algo, "xxh3") == 0){ 48 | XXH64_hash_t const hash = XXH3_64bits(src, len); 49 | res_ = PROTECT(xxh64_hash_to_robj(hash, as_raw_)); 50 | } else if (strcmp(algo, "xxh32") == 0) { 51 | XXH32_hash_t const hash = XXH32(src, len, 0); 52 | res_ = PROTECT(xxh32_hash_to_robj(hash, as_raw_)); 53 | } else if (strcmp(algo, "xxh64") == 0) { 54 | XXH64_hash_t const hash = XXH64(src, len, 0); 55 | res_ = PROTECT(xxh64_hash_to_robj(hash, as_raw_)); 56 | } else { 57 | error("xxhash_raw_(): Unknown algo '%s'\n", algo); 58 | } 59 | 60 | UNPROTECT(1); 61 | return res_; 62 | } 63 | 64 | -------------------------------------------------------------------------------- /src/R-xxhash-utils.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ 8 | #include "xxhash.h" 9 | 10 | #include "R-xxhash-utils.h" 11 | 12 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 13 | // 14 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | SEXP xxh128_hash_to_robj(XXH128_hash_t hash, SEXP as_raw_) { 16 | SEXP res_ = R_NilValue; 17 | 18 | if (asLogical(as_raw_)) { 19 | res_ = PROTECT(allocVector(RAWSXP, 16)); 20 | XXH128_canonicalFromHash((XXH128_canonical_t *)RAW(res_), hash); 21 | } else { 22 | char chash[32+1]; 23 | snprintf(chash, sizeof(chash), "%016" PRIx64 "%016" PRIx64, hash.high64, hash.low64); 24 | res_ = PROTECT(mkString(chash)); 25 | } 26 | 27 | UNPROTECT(1); 28 | return res_; 29 | } 30 | 31 | 32 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 33 | // 34 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 35 | SEXP xxh64_hash_to_robj(XXH64_hash_t hash, SEXP as_raw_) { 36 | SEXP res_ = R_NilValue; 37 | 38 | if (asLogical(as_raw_)) { 39 | res_ = PROTECT(allocVector(RAWSXP, 8)); 40 | XXH64_canonicalFromHash((XXH64_canonical_t *)RAW(res_), hash); 41 | } else { 42 | char chash[16+1]; 43 | snprintf(chash, sizeof(chash), "%016" PRIx64, hash); 44 | res_ = PROTECT(mkString(chash)); 45 | } 46 | 47 | UNPROTECT(1); 48 | return res_; 49 | } 50 | 51 | 52 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 53 | // 54 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 55 | SEXP xxh32_hash_to_robj(XXH32_hash_t hash, SEXP as_raw_) { 56 | SEXP res_ = R_NilValue; 57 | 58 | if (asLogical(as_raw_)) { 59 | res_ = PROTECT(allocVector(RAWSXP, 4)); 60 | XXH32_canonicalFromHash((XXH32_canonical_t *)RAW(res_), hash); 61 | } else { 62 | char chash[8+1]; 63 | snprintf(chash, sizeof(chash), "%08x", hash); 64 | res_ = PROTECT(mkString(chash)); 65 | } 66 | 67 | UNPROTECT(1); 68 | return res_; 69 | } 70 | -------------------------------------------------------------------------------- /tests/testthat/test-xxhash-raw.R: -------------------------------------------------------------------------------- 1 | 2 | obj <- list( 3 | raw = as.raw(seq_len(1024) %% 255), 4 | dbl = as.double(seq(1024)), 5 | int = as.integer(seq(1024)), 6 | lgl = rep(c(TRUE, FALSE), 512) 7 | ) 8 | 9 | algos <- c('xxh32', 'xxh64', 'xxh128', 'xxh3') 10 | 11 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 12 | # Run the command line xxHash over data 13 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 14 | if (FALSE) { 15 | 16 | ref <- list() 17 | 18 | for (nm in names(obj)) { 19 | ref[[nm]] <- list() 20 | dat <- obj[[nm]] 21 | filename <- paste0("ref/", nm, ".bin") 22 | filename <- testthat::test_path(filename) 23 | writeBin(serialize(dat, NULL, xdr = FALSE), filename, size = 1) 24 | for (i in seq_along(algos)) { 25 | algo <- algos[i] 26 | cmd <- sprintf("xxhsum -H%i %s", i-1, filename) 27 | res <- system(cmd, intern = TRUE) |> strsplit(" ") |> el() 28 | if (algo == 'xxh3') { 29 | res <- tail(res, 1) 30 | } else { 31 | res <- head(res, 1) 32 | } 33 | ref[[nm]][[algo]] <- res 34 | } 35 | } 36 | 37 | ref 38 | 39 | 40 | } 41 | 42 | 43 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 44 | # Capture all the hashes from the command line here 45 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 46 | ref <- list( 47 | raw = list(xxh32 = "8460bda9", xxh64 = "00e57cb921892898", 48 | xxh128 = "44c4a18300025592f8f3fcb32de7befc", xxh3 = "f8f3fcb32de7befc"), 49 | dbl = list(xxh32 = "ca9c9fce", xxh64 = "aca5accb6a0dfe74", 50 | xxh128 = "41cd4df490971609446fe5ee32c8410f", xxh3 = "9de70506fdaba2a9"), 51 | int = list(xxh32 = "716051ed", xxh64 = "683312d708813dc9", 52 | xxh128 = "4f95b75e8b9506fb1c101904f3279907", xxh3 = "c9851632e886e01c"), 53 | lgl = list(xxh32 = "ba36c788", xxh64 = "7e5df9cc3c997ca8", 54 | xxh128 = "8da01f89f4bd520eb615184122e4760a", xxh3 = "b615184122e4760a") 55 | ) 56 | 57 | 58 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 59 | # Test that xxhashlite::funcs match the command line output 60 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | test_that("Same hashes in R and from xxHash command line (xxhsum)", { 62 | 63 | for (nm in names(obj)) { 64 | filename <- paste0("ref/", nm, ".bin") 65 | filename <- testthat::test_path(filename) 66 | dat <- readBin(filename, raw(), file.size(filename)) 67 | for (algo in algos) { 68 | 69 | result <- xxhash_raw(dat, algo = algo) 70 | reference <- ref[[nm]][[algo]] 71 | 72 | expect_identical(result, reference, label = paste(nm, algo)) 73 | } 74 | } 75 | 76 | }) 77 | 78 | 79 | test_that("xxhash_raw() on strings works", { 80 | 81 | expect_identical( 82 | xxhash_raw("hello"), 83 | xxhash_raw(charToRaw("hello")) 84 | ) 85 | 86 | }) 87 | 88 | 89 | 90 | -------------------------------------------------------------------------------- /.github/workflows/rhub.yaml: -------------------------------------------------------------------------------- 1 | # R-hub's generic GitHub Actions workflow file. It's canonical location is at 2 | # https://github.com/r-hub/rhub2/blob/v1/inst/workflow/rhub.yaml 3 | # You can update this file to a newer version using the rhub2 package: 4 | # 5 | # rhub2::rhub_setup() 6 | # 7 | # It is unlikely that you need to modify this file manually. 8 | 9 | name: R-hub 10 | run-name: "${{ github.event.inputs.id }}: ${{ github.event.inputs.name || format('Manually run by {0}', github.triggering_actor) }}" 11 | 12 | on: 13 | workflow_dispatch: 14 | inputs: 15 | config: 16 | description: 'A comma separated list of R-hub platforms to use.' 17 | type: string 18 | default: 'linux,windows,macos' 19 | name: 20 | description: 'Run name. You can leave this empty now.' 21 | type: string 22 | id: 23 | description: 'Unique ID. You can leave this empty now.' 24 | type: string 25 | 26 | jobs: 27 | 28 | setup: 29 | runs-on: ubuntu-latest 30 | outputs: 31 | containers: ${{ steps.rhub-setup.outputs.containers }} 32 | platforms: ${{ steps.rhub-setup.outputs.platforms }} 33 | 34 | steps: 35 | # NO NEED TO CHECKOUT HERE 36 | - uses: r-hub/rhub2/actions/rhub-setup@v1 37 | with: 38 | config: ${{ github.event.inputs.config }} 39 | id: rhub-setup 40 | 41 | linux-containers: 42 | needs: setup 43 | if: ${{ needs.setup.outputs.containers != '[]' }} 44 | runs-on: ubuntu-latest 45 | name: ${{ matrix.config.label }} 46 | strategy: 47 | fail-fast: false 48 | matrix: 49 | config: ${{ fromJson(needs.setup.outputs.containers) }} 50 | container: 51 | image: ${{ matrix.config.container }} 52 | 53 | steps: 54 | - uses: r-hub/rhub2/actions/rhub-checkout@v1 55 | - uses: r-hub/rhub2/actions/rhub-platform-info@v1 56 | with: 57 | token: ${{ secrets.RHUB_TOKEN }} 58 | job-config: ${{ matrix.config.job-config }} 59 | - uses: r-hub/rhub2/actions/rhub-setup-deps@v1 60 | with: 61 | token: ${{ secrets.RHUB_TOKEN }} 62 | job-config: ${{ matrix.config.job-config }} 63 | - uses: r-hub/rhub2/actions/rhub-run-check@v1 64 | with: 65 | token: ${{ secrets.RHUB_TOKEN }} 66 | job-config: ${{ matrix.config.job-config }} 67 | 68 | other-platforms: 69 | needs: setup 70 | if: ${{ needs.setup.outputs.platforms != '[]' }} 71 | runs-on: ${{ matrix.config.os }} 72 | name: ${{ matrix.config.label }} 73 | strategy: 74 | fail-fast: false 75 | matrix: 76 | config: ${{ fromJson(needs.setup.outputs.platforms) }} 77 | 78 | steps: 79 | - uses: r-hub/rhub2/actions/rhub-checkout@v1 80 | - uses: r-hub/rhub2/actions/rhub-setup-r@v1 81 | with: 82 | job-config: ${{ matrix.config.job-config }} 83 | token: ${{ secrets.RHUB_TOKEN }} 84 | - uses: r-hub/rhub2/actions/rhub-platform-info@v1 85 | with: 86 | token: ${{ secrets.RHUB_TOKEN }} 87 | job-config: ${{ matrix.config.job-config }} 88 | - uses: r-hub/rhub2/actions/rhub-setup-deps@v1 89 | with: 90 | job-config: ${{ matrix.config.job-config }} 91 | token: ${{ secrets.RHUB_TOKEN }} 92 | - uses: r-hub/rhub2/actions/rhub-run-check@v1 93 | with: 94 | job-config: ${{ matrix.config.job-config }} 95 | token: ${{ secrets.RHUB_TOKEN }} 96 | -------------------------------------------------------------------------------- /R/xxhash.R: -------------------------------------------------------------------------------- 1 | 2 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 3 | #' Calculate the hash of an arbitrary R object. 4 | #' 5 | #' This function will calculate the hash of any object understood by 6 | #' \code{base::serialize()}. 7 | #' 8 | #' @param robj Any R object 9 | #' @param algo Select the specific xxhash algorithm. Default: 'xxh128'. 10 | #' (the latest algorithm in the xxhash family) 11 | #' Valid values: 'xxh32', 'xxh64', 'xxh128', 'xxh3' 12 | #' @param as_raw Return the hash as a raw vector of bytes instead of string? 13 | #' Default: FALSE. If TRUE, then the raw bytes are returned in big-endian 14 | #' order - which is what \code{xxHash} considers the \emph{canonical} form. 15 | #' 16 | #' @return String representation of hash. If \code{as_raw = TRUE} then a 17 | #' raw vector is returned instead. 18 | #' 19 | #' @export 20 | #' 21 | #' @examples 22 | #' xxhash(mtcars) 23 | #' xxhash(mtcars, algo = 'xxh3', as_raw = TRUE) 24 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 25 | xxhash <- function(robj, algo = 'xxh128', as_raw = FALSE) { 26 | .Call(xxhash_, robj, algo, as_raw) 27 | } 28 | 29 | 30 | 31 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 32 | #' Calculate the hash of a raw vector or string 33 | #' 34 | #' This performs a hash of the raw bytes - not of the serialized representation. 35 | #' 36 | #' @inheritParams xxhash 37 | #' @param vec raw vector or single character string 38 | #' 39 | #' @return String representation of hash. If \code{as_raw = TRUE} then a 40 | #' raw vector is returned instead. 41 | #' 42 | #' @export 43 | #' 44 | #' @examples 45 | #' vec <- "hello" 46 | #' xxhash_raw(vec) 47 | #' vec <- as.raw(c(0x01, 0x02, 0x99)) 48 | #' xxhash_raw(vec) 49 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 50 | xxhash_raw <- function(vec, algo = 'xxh128', as_raw = FALSE) { 51 | .Call(xxhash_raw_, vec, algo, as_raw) 52 | } 53 | 54 | 55 | 56 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 57 | #' Calculate the hash of a file 58 | #' 59 | #' @inheritParams xxhash_raw 60 | #' @param file filename 61 | #' 62 | #' @return String representation of hash. If \code{as_raw = TRUE} then a 63 | #' raw vector is returned instead. 64 | #' 65 | #' @export 66 | #' 67 | #' @examples 68 | #' filename <- system.file('DESCRIPTION', package = 'base', mustWork = TRUE) 69 | #' xxhash_file(filename) 70 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 71 | xxhash_file <- function(file, algo = 'xxh128', as_raw = FALSE) { 72 | .Call(xxhash_file_, normalizePath(file), algo, as_raw) 73 | } 74 | 75 | 76 | 77 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 78 | #' Calculate the hash of data from a connection object 79 | #' 80 | #' @inheritParams xxhash_raw 81 | #' @param con connection 82 | #' 83 | #' @return String representation of hash. If \code{as_raw = TRUE} then a 84 | #' raw vector is returned instead. 85 | #' 86 | #' @export 87 | #' 88 | #' @examples 89 | #' filename <- system.file('DESCRIPTION', package = 'base', mustWork = TRUE) 90 | #' xxhash_con(file(filename)) 91 | #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 92 | xxhash_con <- function(con, algo = 'xxh128', as_raw = FALSE) { 93 | stopifnot(inherits(con, "connection")) 94 | if(!isOpen(con)){ 95 | on.exit(close(con)) 96 | open(con, "rb") 97 | } 98 | .Call(xxhash_con_, con, algo, as_raw) 99 | } 100 | -------------------------------------------------------------------------------- /tests/testthat/ref/lgl.bin: -------------------------------------------------------------------------------- 1 | B 2 | UTF-8 3 |  -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # xxhashlite 5 | 6 | 7 | 8 | ![](https://img.shields.io/badge/cool-useless-green.svg) 9 | [![R-CMD-check](https://github.com/coolbutuseless/xxhashlite/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/coolbutuseless/xxhashlite/actions/workflows/R-CMD-check.yaml) 10 | [![CRAN](https://www.r-pkg.org/badges/version/xxhashlite)](https://CRAN.R-project.org/package=xxhashlite) 11 | 12 | 13 | `xxhashlite` provides access to the *extremely* fast hashing functions 14 | in [xxHash](https://cyan4973.github.io/xxHash/) for in-memory hashing of 15 | R objects, files, strings and raw vectors. 16 | 17 | This package is a wrapper around [xxHash 18 | v0.8.2](https://github.com/Cyan4973/xxHash). 19 | See `inst/LICENSE-xxHash` for the copyright and licensing information 20 | for that code. 21 | 22 | ## Notes 23 | 24 | - Only supports R versions \>= v3.5.0 as this is when the serialization 25 | infrastructure had a breaking change, and this package will not 26 | support the old version. 27 | 28 | ## What’s in the box 29 | 30 | - `xxhash(robj, algo, as_raw)` calculates the hash of any R object 31 | understood by `base::serialize()`. 32 | - `xxhash_raw(vec, algo, as_raw)` calculates the hash of a raw vector or 33 | string. This function is appropriate when comparing hashes of non-R 34 | objects e.g.  a checksum hash of raw bytes. 35 | - `xxhash_file(file, algo, as_raw)` calculates the hash of a file 36 | - `xxhash_con(con, algo, as_raw)` calculations the hash of data from a 37 | connection e.g. `gzfile()`, `url()` 38 | 39 | Supports all hashes provided by `xxHash` i.e. XXH128, XXH3, XXH32 and 40 | XXH64. 41 | 42 | ## Installation 43 | 44 | This package can be installed from CRAN 45 | 46 | ``` r 47 | install.packages('xxhashlite') 48 | ``` 49 | 50 | You can install the latest development version from 51 | [GitHub](https://github.com/coolbutuseless/xxhashlite) with: 52 | 53 | ``` r 54 | # install.package('remotes') 55 | remotes::install_github('coolbutuseless/xxhashlite') 56 | ``` 57 | 58 | Pre-built source/binary versions can also be installed from 59 | [R-universe](https://r-universe.dev) 60 | 61 | ``` r 62 | install.packages('xxhashlite', repos = c('https://coolbutuseless.r-universe.dev', 'https://cloud.r-project.org')) 63 | ``` 64 | 65 | ## Why use a hash? 66 | 67 | A hash is a way of succinctly summarising the contents of an object in a 68 | compact format. 69 | 70 | If there are changes to the object (no matter how small) then the hash 71 | should see drastic changes. 72 | 73 | ``` r 74 | library(xxhashlite) 75 | xxhash(mtcars) 76 | #> [1] "d0487363db4e6cc64fdb740cb6617fc0" 77 | 78 | # Small changes results in a different hash 79 | mtcars$cyl[1] <- 0 80 | xxhash(mtcars) 81 | #> [1] "e999db3ed8f21dc2cd52b97a08f0c9f5" 82 | ``` 83 | 84 | ## Available algorithms 85 | 86 | `{xxhashlite}` supports returning the hash as a raw vector, and all the 87 | xxHash algorithms are available: 88 | 89 | ``` r 90 | xxhash(mtcars, as_raw = TRUE) 91 | #> [1] e9 99 db 3e d8 f2 1d c2 cd 52 b9 7a 08 f0 c9 f5 92 | xxhash(mtcars, algo = 'xxh3') 93 | #> [1] "cd52b97a08f0c9f5" 94 | xxhash(mtcars, algo = 'xxh64') 95 | #> [1] "fe3d463a549e63ce" 96 | xxhash(mtcars, algo = 'xxh32') 97 | #> [1] "eaa9d7fb" 98 | ``` 99 | 100 | ## Strings and Raw Vectors 101 | 102 | `xxhash()` uses R’s serialization mechanism to handle any R object. 103 | 104 | When you only require a hash of the raw contents of a file, a raw vector 105 | or a string, then use `xxhash_raw()` and `xxhash_file()`. 106 | 107 | Rather than serializing the data first, the hash is calculated on the 108 | raw bytes. 109 | 110 | ## Acknowledgements 111 | 112 | - Yann Collett for releasing, maintaining and advancing 113 | [xxHash](https://cyan4973.github.io/xxHash/) 114 | - R Core for developing and maintaining such a great language. 115 | - CRAN maintainers, for patiently shepherding packages onto CRAN and 116 | maintaining the repository 117 | -------------------------------------------------------------------------------- /src/R-xxhash-file.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ 8 | #include "xxhash.h" 9 | #include "R-xxhash-utils.h" 10 | 11 | 12 | #define FILEBUFSIZE 1 << 16 13 | 14 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | // XXH128 16 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 17 | SEXP xxhash_file_xxh128(FILE *f, SEXP as_raw_) { 18 | 19 | XXH3_state_t* state = XXH3_createState(); 20 | XXH3_128bits_reset(state); 21 | char buffer[FILEBUFSIZE]; 22 | size_t count; 23 | 24 | while ((count = fread(buffer, 1, FILEBUFSIZE, f)) != 0) { 25 | XXH3_128bits_update(state, buffer, count); 26 | } 27 | 28 | XXH128_hash_t hash = XXH3_128bits_digest(state); 29 | XXH3_freeState(state); 30 | 31 | return xxh128_hash_to_robj(hash, as_raw_); 32 | } 33 | 34 | 35 | 36 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | // XXH3 64bit 38 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 39 | SEXP xxhash_file_xxh3(FILE *f, SEXP as_raw_) { 40 | 41 | XXH3_state_t* state = XXH3_createState(); 42 | XXH3_64bits_reset(state); 43 | char buffer[FILEBUFSIZE]; 44 | size_t count; 45 | 46 | while ((count = fread(buffer, 1, FILEBUFSIZE, f)) != 0) { 47 | XXH3_64bits_update(state, buffer, count); 48 | } 49 | 50 | XXH64_hash_t hash = XXH3_64bits_digest(state); 51 | XXH3_freeState(state); 52 | 53 | return xxh64_hash_to_robj(hash, as_raw_); 54 | } 55 | 56 | 57 | 58 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 59 | // XXH32 60 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 61 | SEXP xxhash_file_xxh32(FILE *f, SEXP as_raw_) { 62 | 63 | XXH32_state_t *state = XXH32_createState(); 64 | XXH32_reset(state, 0); 65 | 66 | char buffer[FILEBUFSIZE]; 67 | size_t count; 68 | 69 | while ((count = fread(buffer, 1, FILEBUFSIZE, f)) != 0) { 70 | XXH32_update(state, buffer, count); 71 | } 72 | 73 | XXH32_hash_t hash = XXH32_digest(state); 74 | XXH32_freeState(state); 75 | 76 | return xxh32_hash_to_robj(hash, as_raw_); 77 | } 78 | 79 | 80 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 81 | // XXH64 82 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 83 | SEXP xxhash_file_xxh64(FILE *f, SEXP as_raw_) { 84 | 85 | XXH64_state_t *state = XXH64_createState(); 86 | XXH64_reset(state, 0); 87 | 88 | char buffer[FILEBUFSIZE]; 89 | size_t count; 90 | 91 | while ((count = fread(buffer, 1, FILEBUFSIZE, f)) != 0) { 92 | XXH64_update(state, buffer, count); 93 | } 94 | 95 | XXH64_hash_t hash = XXH64_digest(state); 96 | XXH64_freeState(state); 97 | 98 | return xxh64_hash_to_robj(hash, as_raw_); 99 | } 100 | 101 | 102 | 103 | 104 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 105 | // File 106 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 107 | SEXP xxhash_file_(SEXP file_, SEXP algo_, SEXP as_raw_) { 108 | 109 | const char *file = CHAR(STRING_ELT(file_, 0)); 110 | FILE *f = fopen(file, "rb"); 111 | if (f == NULL) { 112 | error("xxhash_file_(): Couldn't open file '%s'", file); 113 | } 114 | 115 | const char *algo = CHAR(STRING_ELT(algo_, 0)); 116 | SEXP res_ = R_NilValue; 117 | if (strcmp(algo, "xxh128") == 0) { 118 | res_ = PROTECT(xxhash_file_xxh128(f, as_raw_)); 119 | } else if (strcmp(algo, "xxh3") == 0) { 120 | res_ = PROTECT(xxhash_file_xxh3(f, as_raw_)); 121 | } else if (strcmp(algo, "xxh32") == 0) { 122 | res_ = PROTECT(xxhash_file_xxh32(f, as_raw_)); 123 | } else if (strcmp(algo, "xxh64") == 0) { 124 | res_ = PROTECT(xxhash_file_xxh64(f, as_raw_)); 125 | } else { 126 | error("xxhash_raw_(): Unknown algo '%s'\n", algo); 127 | } 128 | 129 | fclose(f); 130 | UNPROTECT(1); 131 | return res_; 132 | } 133 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, include = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "man/figures/README-", 12 | out.width = "100%" 13 | ) 14 | 15 | library(pryr) 16 | library(dplyr) 17 | library(ggplot2) 18 | library(tidyr) 19 | library(xxhashlite) 20 | ``` 21 | 22 | # xxhashlite 23 | 24 | 25 | ![](https://img.shields.io/badge/cool-useless-green.svg) 26 | [![R-CMD-check](https://github.com/coolbutuseless/xxhashlite/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/coolbutuseless/xxhashlite/actions/workflows/R-CMD-check.yaml) 27 | [![CRAN](https://www.r-pkg.org/badges/version/xxhashlite)](https://CRAN.R-project.org/package=xxhashlite) 28 | 29 | 30 | `xxhashlite` provides access to the *extremely* fast hashing functions 31 | in [xxHash](https://cyan4973.github.io/xxHash/) for in-memory hashing 32 | of R objects, files, strings and raw vectors. 33 | 34 | This package is a wrapper around [xxHash v0.8.2](https://github.com/Cyan4973/xxHash). 35 | See `inst/LICENSE-xxHash` for the copyright and licensing information for that code. 36 | 37 | 38 | ## Notes 39 | 40 | * Only supports R versions >= v3.5.0 as this is when the serialization 41 | infrastructure had a breaking change, and this package will not support 42 | the old version. 43 | 44 | ## What's in the box 45 | 46 | * `xxhash(robj, algo, as_raw)` calculates the hash of any R object understood by 47 | `base::serialize()`. 48 | * `xxhash_raw(vec, algo, as_raw)` calculates the hash of a raw vector or string. 49 | This function is appropriate when comparing hashes of non-R objects e.g. 50 | a checksum hash of raw bytes. 51 | * `xxhash_file(file, algo, as_raw)` calculates the hash of a file 52 | * `xxhash_con(con, algo, as_raw)` calculations the hash of data from a connection 53 | e.g. `gzfile()`, `url()` 54 | 55 | Supports all hashes provided by `xxHash` i.e. XXH128, XXH3, XXH32 and XXH64. 56 | 57 | 58 | ## Installation 59 | 60 | This package can be installed from CRAN 61 | 62 | ``` r 63 | install.packages('xxhashlite') 64 | ``` 65 | 66 | You can install the latest development version from 67 | [GitHub](https://github.com/coolbutuseless/xxhashlite) with: 68 | 69 | ``` r 70 | # install.package('remotes') 71 | remotes::install_github('coolbutuseless/xxhashlite') 72 | ``` 73 | 74 | Pre-built source/binary versions can also be installed from 75 | [R-universe](https://r-universe.dev) 76 | 77 | ``` r 78 | install.packages('xxhashlite', repos = c('https://coolbutuseless.r-universe.dev', 'https://cloud.r-project.org')) 79 | ``` 80 | 81 | 82 | ## Why use a hash? 83 | 84 | A hash is a way of succinctly summarising the contents of an object in a compact format. 85 | 86 | If there are changes to the object (no matter how small) then the hash should 87 | see drastic changes. 88 | 89 | 90 | ```{r} 91 | library(xxhashlite) 92 | xxhash(mtcars) 93 | 94 | # Small changes results in a different hash 95 | mtcars$cyl[1] <- 0 96 | xxhash(mtcars) 97 | ``` 98 | 99 | ## Available algorithms 100 | `{xxhashlite}` supports returning the hash as a raw vector, and all the 101 | xxHash algorithms are available: 102 | 103 | ```{r} 104 | xxhash(mtcars, as_raw = TRUE) 105 | xxhash(mtcars, algo = 'xxh3') 106 | xxhash(mtcars, algo = 'xxh64') 107 | xxhash(mtcars, algo = 'xxh32') 108 | ``` 109 | 110 | ## Strings and Raw Vectors 111 | 112 | `xxhash()` uses R's serialization mechanism to handle any R object. 113 | 114 | When you only require a hash of the raw contents of a file, a raw vector 115 | or a string, then use `xxhash_raw()` and `xxhash_file()`. 116 | 117 | Rather than serializing the data first, the hash is calculated on the raw 118 | bytes. 119 | 120 | 121 | 122 | 123 | ## Acknowledgements 124 | 125 | * Yann Collett for releasing, maintaining and advancing [xxHash](https://cyan4973.github.io/xxHash/) 126 | * R Core for developing and maintaining such a great language. 127 | * CRAN maintainers, for patiently shepherding packages onto CRAN and maintaining 128 | the repository 129 | -------------------------------------------------------------------------------- /src/R-xxhash-con.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | 6 | 7 | #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ 8 | 9 | #include "xxhash.h" 10 | #include "R-xxhash-utils.h" 11 | 12 | #define BUFSIZE 1 << 17 13 | 14 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 15 | // Read data from the connection 16 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 17 | SEXP read_connection(SEXP con_) { 18 | // Call 'readBin(con, raw(), length)' in R 19 | // Modelled after jsonlite/src/push_parser.c 20 | SEXP call = PROTECT( 21 | Rf_lang4( 22 | PROTECT(Rf_install("readBin")), 23 | con_, // con 24 | PROTECT(Rf_allocVector(RAWSXP, 0)), // what = raw() 25 | PROTECT(Rf_ScalarInteger(BUFSIZE)) // n 26 | )); 27 | 28 | // Actually evaluate the readBin() call 29 | SEXP data_ = PROTECT(Rf_eval(call, R_BaseEnv)); 30 | 31 | UNPROTECT(5); 32 | return data_; 33 | } 34 | 35 | 36 | 37 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 38 | // XXH128 39 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 40 | SEXP xxhash_con_xxh128(SEXP con_, SEXP as_raw_) { 41 | 42 | XXH3_state_t* state = XXH3_createState(); 43 | XXH3_128bits_reset(state); 44 | 45 | while (1) { 46 | SEXP data_ = PROTECT(read_connection(con_)); 47 | if (XXH3_128bits_update(state, RAW(data_), (size_t)Rf_xlength(data_)) == XXH_ERROR) { 48 | error("xxhash_con_xxh128(): Couldn't update state"); 49 | } 50 | UNPROTECT(1); 51 | if (Rf_xlength(data_) < BUFSIZE) break; 52 | } 53 | 54 | XXH128_hash_t hash = XXH3_128bits_digest(state); 55 | XXH3_freeState(state); 56 | 57 | return xxh128_hash_to_robj(hash, as_raw_); 58 | } 59 | 60 | 61 | 62 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 63 | // XXH3 64bit 64 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 65 | SEXP xxhash_con_xxh3(SEXP con_, SEXP as_raw_) { 66 | 67 | XXH3_state_t* state = XXH3_createState(); 68 | XXH3_64bits_reset(state); 69 | 70 | while (1) { 71 | SEXP data_ = PROTECT(read_connection(con_)); 72 | if (XXH3_64bits_update(state, RAW(data_), (size_t)Rf_xlength(data_)) == XXH_ERROR) { 73 | error("xxhash_con_xxh3(): Couldn't update state"); 74 | } 75 | UNPROTECT(1); 76 | if (Rf_xlength(data_) < BUFSIZE) break; 77 | } 78 | 79 | XXH64_hash_t hash = XXH3_64bits_digest(state); 80 | XXH3_freeState(state); 81 | 82 | return xxh64_hash_to_robj(hash, as_raw_); 83 | } 84 | 85 | 86 | 87 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 88 | // XXH32 89 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 90 | SEXP xxhash_con_xxh32(SEXP con_, SEXP as_raw_) { 91 | 92 | XXH32_state_t *state = XXH32_createState(); 93 | XXH32_reset(state, 0); 94 | 95 | while (1) { 96 | SEXP data_ = PROTECT(read_connection(con_)); 97 | if (XXH32_update(state, RAW(data_), (size_t)Rf_xlength(data_)) == XXH_ERROR) { 98 | error("xxhash_con_xxh32(): Couldn't update state"); 99 | } 100 | UNPROTECT(1); 101 | if (Rf_xlength(data_) < BUFSIZE) break; 102 | } 103 | 104 | XXH32_hash_t hash = XXH32_digest(state); 105 | XXH32_freeState(state); 106 | 107 | return xxh32_hash_to_robj(hash, as_raw_); 108 | } 109 | 110 | 111 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 112 | // XXH64 113 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 114 | SEXP xxhash_con_xxh64(SEXP con_, SEXP as_raw_) { 115 | 116 | XXH64_state_t *state = XXH64_createState(); 117 | XXH64_reset(state, 0); 118 | 119 | while (1) { 120 | SEXP data_ = PROTECT(read_connection(con_)); 121 | if (XXH64_update(state, RAW(data_), (size_t)Rf_xlength(data_)) == XXH_ERROR) { 122 | error("xxhash_con_xxh64(): Couldn't update state"); 123 | } 124 | UNPROTECT(1); 125 | if (Rf_xlength(data_) < BUFSIZE) break; 126 | } 127 | 128 | XXH64_hash_t hash = XXH64_digest(state); 129 | XXH64_freeState(state); 130 | 131 | return xxh64_hash_to_robj(hash, as_raw_); 132 | } 133 | 134 | 135 | 136 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 137 | // Serialize an R object 138 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 139 | SEXP xxhash_con_(SEXP con_, SEXP algo_, SEXP as_raw_) { 140 | 141 | const char *algo = CHAR(asChar((algo_))); 142 | SEXP res_ = R_NilValue; 143 | 144 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 145 | // Set up the state 146 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 147 | if (strcmp(algo, "xxh128") == 0) { 148 | res_ = PROTECT(xxhash_con_xxh128(con_, as_raw_)); 149 | } else if (strcmp(algo, "xxh3") == 0) { 150 | res_ = PROTECT(xxhash_con_xxh3(con_, as_raw_)); 151 | } else if (strcmp(algo, "xxh32") == 0) { 152 | res_ = PROTECT(xxhash_con_xxh32(con_, as_raw_)); 153 | } else if (strcmp(algo, "xxh64") == 0) { 154 | res_ = PROTECT(xxhash_con_xxh64(con_, as_raw_)); 155 | } else { 156 | error("xxhash_con_(): Unknown algo '%s'", algo); 157 | } 158 | 159 | UNPROTECT(1); 160 | return res_; 161 | } 162 | 163 | -------------------------------------------------------------------------------- /src/R-xxhash-serialize.c: -------------------------------------------------------------------------------- 1 | 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | #define XXH_STATIC_LINKING_ONLY /* access advanced declarations */ 8 | 9 | #include "xxhash.h" 10 | #include "R-xxhash-utils.h" 11 | 12 | typedef struct { 13 | bool in_header; 14 | int n; 15 | int enc_size; 16 | void *xxstate; 17 | } ser_state_t; 18 | 19 | 20 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 21 | // Hash a byte 22 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 23 | void hash_byte(R_outpstream_t stream, int c) { 24 | error("xxhash::hash_byte(): Single byte hashing should never be called during binary serialisation"); 25 | } 26 | 27 | 28 | 29 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 30 | // Hash multiple bytes 31 | // 32 | // Similar to rlang::hash(), we ignore the first 18+n bytes which are just 33 | // the version-specific header for the data. 34 | // The 'n' represents the length of the string used to specify the 35 | // native encodeing. This is often a 5 byte string "UTF-8" 36 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 37 | void xxh3_128bits_hash_bytes(R_outpstream_t stream, void *src, int n) { 38 | ser_state_t *ser_state = (ser_state_t *)stream->data; 39 | 40 | if (ser_state->in_header) { 41 | ser_state->n += n; 42 | if (ser_state->n == 18) { 43 | memcpy(&ser_state->enc_size, src, sizeof(int)); 44 | } 45 | if (ser_state->n == 18 + ser_state->enc_size) { 46 | ser_state->in_header = false; 47 | } 48 | return; 49 | } 50 | 51 | XXH3_state_t *xxstate = (XXH3_state_t *)ser_state->xxstate; 52 | if (XXH3_128bits_update(xxstate, src, (size_t)n) == XXH_ERROR) { 53 | error("xxh3_128bits_hash_bytes(): Error updating state"); 54 | } 55 | } 56 | 57 | 58 | void xxh3_64bits_hash_bytes(R_outpstream_t stream, void *src, int n) { 59 | ser_state_t *ser_state = (ser_state_t *)stream->data; 60 | 61 | if (ser_state->in_header) { 62 | ser_state->n += n; 63 | if (ser_state->n == 18) { 64 | memcpy(&ser_state->enc_size, src, sizeof(int)); 65 | } 66 | if (ser_state->n == 18 + ser_state->enc_size) { 67 | ser_state->in_header = false; 68 | } 69 | return; 70 | } 71 | 72 | XXH3_state_t *xxstate = (XXH3_state_t *)ser_state->xxstate; 73 | if (XXH3_64bits_update(xxstate, src, (size_t)n) == XXH_ERROR) { 74 | error("xxh3_64bits_hash_bytes(): Error updating state"); 75 | } 76 | } 77 | 78 | 79 | void xxh32_hash_bytes(R_outpstream_t stream, void *src, int n) { 80 | ser_state_t *ser_state = (ser_state_t *)stream->data; 81 | 82 | if (ser_state->in_header) { 83 | ser_state->n += n; 84 | if (ser_state->n == 18) { 85 | memcpy(&ser_state->enc_size, src, sizeof(int)); 86 | } 87 | if (ser_state->n == 18 + ser_state->enc_size) { 88 | ser_state->in_header = false; 89 | } 90 | return; 91 | } 92 | 93 | XXH32_state_t *xxstate = (XXH32_state_t *)ser_state->xxstate; 94 | if (XXH32_update(xxstate, src, (size_t)n) == XXH_ERROR) { 95 | error("xxh32_hash_bytes(): Error updating state"); 96 | } 97 | } 98 | 99 | 100 | void xxh64_hash_bytes(R_outpstream_t stream, void *src, int n) { 101 | ser_state_t *ser_state = (ser_state_t *)stream->data; 102 | 103 | if (ser_state->in_header) { 104 | ser_state->n += n; 105 | if (ser_state->n == 18) { 106 | memcpy(&ser_state->enc_size, src, sizeof(int)); 107 | } 108 | if (ser_state->n == 18 + ser_state->enc_size) { 109 | ser_state->in_header = false; 110 | } 111 | return; 112 | } 113 | 114 | XXH64_state_t *xxstate = (XXH64_state_t *)ser_state->xxstate; 115 | if (XXH64_update(xxstate, src, (size_t)n) == XXH_ERROR) { 116 | error("xxh64_hash_bytes(): Error updating state"); 117 | } 118 | } 119 | 120 | 121 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 122 | // Serialize an R object 123 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 124 | SEXP xxhash_(SEXP robj_, SEXP algo_, SEXP as_raw_) { 125 | 126 | const char *algo = CHAR(asChar((algo_))); 127 | 128 | void *hash_bytes; 129 | XXH_errorcode err; 130 | ser_state_t ser_state = { 131 | .in_header = true, 132 | .n = 0, 133 | .xxstate = NULL 134 | }; 135 | 136 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 137 | // Set up the state 138 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 139 | if (strcmp(algo, "xxh128") == 0) { 140 | XXH3_state_t *xxstate = XXH3_createState(); 141 | ser_state.xxstate = (void *)xxstate; 142 | err = XXH3_128bits_reset(xxstate); 143 | hash_bytes = (void *)xxh3_128bits_hash_bytes; 144 | } else if (strcmp(algo, "xxh3") == 0) { 145 | XXH3_state_t *xxstate = XXH3_createState(); 146 | ser_state.xxstate = (void *)xxstate; 147 | err = XXH3_64bits_reset(xxstate); 148 | hash_bytes = (void *)xxh3_64bits_hash_bytes; 149 | } else if (strcmp(algo, "xxh32") == 0) { 150 | XXH32_state_t *xxstate = XXH32_createState(); 151 | ser_state.xxstate = (void *)xxstate; 152 | err = XXH32_reset(xxstate, 0); 153 | hash_bytes = (void *)xxh32_hash_bytes; 154 | } else if (strcmp(algo, "xxh64") == 0) { 155 | XXH64_state_t *xxstate = XXH64_createState(); 156 | ser_state.xxstate = (void *)xxstate; 157 | err = XXH64_reset(xxstate, 0); 158 | hash_bytes = (void *)xxh64_hash_bytes; 159 | } else { 160 | error("xxhash_(): Unknown algo '%s'\n", algo); 161 | } 162 | 163 | if (err == XXH_ERROR) { 164 | error("xxhash_(): Error initialising hashing state for '%s'", algo); 165 | } 166 | 167 | 168 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 169 | // Create the output stream structure 170 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 171 | struct R_outpstream_st output_stream; 172 | 173 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 174 | // Initialise the output stream structure 175 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 176 | R_InitOutPStream( 177 | &output_stream, // The stream object which wraps everything 178 | (R_pstream_data_t) &ser_state, // The "location" to write to 179 | R_pstream_binary_format, // Store as binary 180 | 3, // Version = 3 for R >3.5.0 See `?base::serialize` 181 | hash_byte, // Function to write single byte to buffer 182 | (void (*)(R_outpstream_t, void *, int))hash_bytes, // Function for writing multiple bytes to buffer 183 | NULL, // Func for special handling of reference data. 184 | R_NilValue // Data related to reference data handling 185 | ); 186 | 187 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 188 | // Serialize the object into the output_stream 189 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 190 | R_Serialize(robj_, &output_stream); 191 | 192 | SEXP res_ = R_NilValue; 193 | 194 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 195 | // Produce the final hash value 196 | //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 197 | if (strcmp(algo, "xxh128") == 0) { 198 | XXH128_hash_t const hash = XXH3_128bits_digest(ser_state.xxstate); 199 | XXH3_freeState(ser_state.xxstate); 200 | res_ = PROTECT(xxh128_hash_to_robj(hash, as_raw_)); 201 | } else if (strcmp(algo, "xxh3") == 0) { 202 | XXH64_hash_t const hash = XXH3_64bits_digest(ser_state.xxstate); 203 | XXH3_freeState(ser_state.xxstate); 204 | res_ = PROTECT(xxh64_hash_to_robj(hash, as_raw_)); 205 | } else if (strcmp(algo, "xxh32") == 0) { 206 | XXH32_hash_t const hash = XXH32_digest(ser_state.xxstate); 207 | XXH32_freeState(ser_state.xxstate); 208 | res_ = PROTECT(xxh32_hash_to_robj(hash, as_raw_)); 209 | } else if (strcmp(algo, "xxh64") == 0) { 210 | XXH64_hash_t const hash = XXH64_digest(ser_state.xxstate); 211 | XXH64_freeState(ser_state.xxstate); 212 | res_ = PROTECT(xxh64_hash_to_robj(hash, as_raw_)); 213 | } 214 | 215 | UNPROTECT(1); 216 | return res_; 217 | } 218 | --------------------------------------------------------------------------------