├── .Rbuildignore ├── .github ├── .gitignore └── workflows │ ├── R-CMD-check.yaml │ └── pkgdown.yaml ├── .gitignore ├── DESCRIPTION ├── LICENSE ├── LICENSE.md ├── NAMESPACE ├── NEWS.md ├── R ├── hub_download.R ├── hub_info.R └── hub_snapshot.R ├── README.md ├── _pkgdown.yml ├── cran-comments.md ├── hfhub.Rproj ├── inst └── po │ └── fr │ └── LC_MESSAGES │ └── R-hfhub.mo ├── man ├── WEIGHTS_NAME.Rd ├── hub_download.Rd ├── hub_repo_info.Rd └── hub_snapshot.Rd ├── po ├── R-fr.po └── R-hfhub.pot └── tests ├── testthat.R └── testthat ├── _snaps └── hub_snapshot.md ├── helper-skips.R ├── test-hub_download.R ├── test-hub_info.R ├── test-hub_snapshot.R └── test-message-translations.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^hfhub\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^LICENSE\.md$ 4 | ^\.github$ 5 | ^cran-comments\.md$ 6 | ^_pkgdown\.yml$ 7 | ^docs$ 8 | ^pkgdown$ 9 | ^CRAN-SUBMISSION$ 10 | -------------------------------------------------------------------------------- /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | 9 | name: R-CMD-check 10 | 11 | jobs: 12 | R-CMD-check: 13 | runs-on: ${{ matrix.config.os }} 14 | 15 | name: ${{ matrix.config.os }} (${{ matrix.config.r }}) 16 | 17 | strategy: 18 | fail-fast: false 19 | matrix: 20 | config: 21 | - {os: macos-latest, r: 'release'} 22 | - {os: windows-latest, r: 'release'} 23 | - {os: ubuntu-latest, r: 'devel', http-user-agent: 'release'} 24 | - {os: ubuntu-latest, r: 'release'} 25 | - {os: ubuntu-latest, r: 'oldrel-1'} 26 | 27 | env: 28 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 29 | R_KEEP_PKG_SOURCE: yes 30 | HUGGINGFACE_HUB_TOKEN: ${{ secrets.HUGGINGFACE_HUB_TOKEN }} 31 | 32 | steps: 33 | - uses: actions/checkout@v3 34 | 35 | - uses: r-lib/actions/setup-pandoc@v2 36 | 37 | - uses: r-lib/actions/setup-r@v2 38 | with: 39 | r-version: ${{ matrix.config.r }} 40 | http-user-agent: ${{ matrix.config.http-user-agent }} 41 | use-public-rspm: true 42 | 43 | - uses: r-lib/actions/setup-r-dependencies@v2 44 | with: 45 | extra-packages: any::rcmdcheck 46 | needs: check 47 | 48 | - uses: r-lib/actions/check-r-package@v2 49 | with: 50 | upload-snapshots: true 51 | env: 52 | HUGGINGFACE_HUB_CACHE: ${{ runner.temp }}/cache/ 53 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .Rdata 4 | .httr-oauth 5 | .DS_Store 6 | docs 7 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: hfhub 2 | Title: Hugging Face Hub Interface 3 | Version: 0.1.1.9000 4 | Authors@R: c( 5 | person("Daniel", "Falbel", , "daniel@posit.co", role = c("aut", "cre")), 6 | person("Regouby", "Christophe", , "christophe.regouby@free.fr", c("ctb")), 7 | person(family = "Posit", role = c("cph")) 8 | ) 9 | Description: Provides functionality to download and cache 
files from 'Hugging Face Hub' . 10 | Uses the same caching structure so files can be shared between different client libraries. 11 | License: MIT + file LICENSE 12 | Encoding: UTF-8 13 | Roxygen: list(markdown = TRUE) 14 | RoxygenNote: 7.2.3 15 | Imports: 16 | httr, 17 | filelock, 18 | fs, 19 | cli, 20 | withr, 21 | curl, 22 | glue, 23 | urltools, 24 | rlang 25 | Suggests: 26 | testthat (>= 3.0.0), 27 | jsonlite 28 | Config/testthat/edition: 3 29 | URL: https://mlverse.github.io/hfhub/ 30 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | YEAR: 2023 2 | COPYRIGHT HOLDER: hfhub authors 3 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | # MIT License 2 | 3 | Copyright (c) 2023 hfhub authors 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(WEIGHTS_INDEX_NAME) 4 | export(WEIGHTS_NAME) 5 | export(hub_dataset_info) 6 | export(hub_download) 7 | export(hub_repo_info) 8 | export(hub_snapshot) 9 | importFrom(rlang,"%||%") 10 | -------------------------------------------------------------------------------- /NEWS.md: -------------------------------------------------------------------------------- 1 | # hfhub (development version) 2 | 3 | * Added FR translation of the R messages. (#8 @cregouby) 4 | 5 | # hfhub 0.1.1 6 | 7 | * Added a `NEWS.md` file to track changes to the package. 8 | * Added `hub_snapshot` to allow downloading an entire repository at once (#2). 9 | * Added support for authentication using `HUGGING_FACE_HUB_TOKEN`. (#5) 10 | -------------------------------------------------------------------------------- /R/hub_download.R: -------------------------------------------------------------------------------- 1 | #' Downloads files from HuggingFace repositories 2 | #' 3 | #' @param repo_id The repository identifier, eg `"bert-base-uncased"` or `"deepset/sentence_bert"`. 4 | #' @param filename Filename to download from the repository. Example `"config.json"`. 5 | #' @param revision Revision (branch, tag or commitid) to download the file from. 6 | #' @param repo_type The type of the repository. Currently only `"model"` is supported. 7 | #' @param local_files_only Only use cached files? 8 | #' @param force_download For re-downloading of files that are cached. 9 | #' @param ... 
currenytly unused. 10 | #' 11 | #' @returns The file path of the downloaded or cached file. The snapshot path is returned 12 | #' as an attribute. 13 | #' @examples 14 | #' try({ 15 | #' withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tempdir()), { 16 | #' path <- hub_download("gpt2", "config.json") 17 | #' print(path) 18 | #' str(jsonlite::fromJSON(path)) 19 | #' }) 20 | #' }) 21 | #' 22 | #' @export 23 | hub_download <- function(repo_id, filename, ..., revision = "main", repo_type = "model", local_files_only = FALSE, force_download = FALSE) { 24 | cache_dir <- HUGGINGFACE_HUB_CACHE() 25 | storage_folder <- fs::path(cache_dir, repo_folder_name(repo_id, repo_type)) 26 | 27 | # revision is a commit hash and file exists in the cache, quicly return it. 28 | if (grepl(REGEX_COMMIT_HASH(), revision)) { 29 | pointer_path <- get_pointer_path(storage_folder, revision, filename) 30 | if (fs::file_exists(pointer_path)) { 31 | return(pointer_path) 32 | } 33 | } 34 | 35 | url <- hub_url(repo_id, filename, revision = revision, repo_type = repo_type) 36 | 37 | etag <- NULL 38 | commit_hash <- NULL 39 | expected_size <- NULL 40 | 41 | if (!local_files_only) { 42 | tryCatch({ 43 | metadata <- get_file_metadata(url) 44 | 45 | commit_hash <- metadata$commit_hash 46 | if (is.null(commit_hash)) { 47 | cli::cli_abort(gettext("Distant resource does not seem to be on huggingface.co (missing commit header).")) 48 | } 49 | 50 | etag <- metadata$etag 51 | if (is.null(etag)) { 52 | cli::cli_abort(gettext("Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility.")) 53 | } 54 | 55 | # Expected (uncompressed) size 56 | expected_size <- metadata$size 57 | 58 | # In case of a redirect, save an extra redirect on the request.get call, 59 | # and ensure we download the exact atomic version even if it changed 60 | # between the HEAD and the GET (unlikely, but hey). 61 | # Useful for lfs blobs that are stored on a CDN. 
62 | if (metadata$location != url) { 63 | url <- metadata$location 64 | } 65 | }) 66 | } 67 | 68 | # etag is NULL == we don't have a connection or we passed local_files_only. 69 | # try to get the last downloaded one from the specified revision. 70 | # If the specified revision is a commit hash, look inside "snapshots". 71 | # If the specified revision is a branch or tag, look inside "refs". 72 | if (is.null(etag)) { 73 | # Try to get "commit_hash" from "revision" 74 | commit_hash <- NULL 75 | if (grepl(REGEX_COMMIT_HASH(), revision)) { 76 | commit_hash <- revision 77 | } else { 78 | ref_path <- fs::path(storage_folder, "refs", revision) 79 | if (fs::file_exists(ref_path)) { 80 | commit_hash <- readLines(ref_path) 81 | } 82 | } 83 | 84 | # Return pointer file if exists 85 | if (!is.null(commit_hash)) { 86 | pointer_path <- get_pointer_path(storage_folder, commit_hash, filename) 87 | if (fs::file_exists(pointer_path)) { 88 | return(pointer_path) 89 | } 90 | } 91 | 92 | if (local_files_only) { 93 | cli::cli_abort(gettext( 94 | "Cannot find the requested files in the disk cache and", 95 | " outgoing traffic has been disabled. To enable hf.co look-ups", 96 | " and downloads online, set 'local_files_only' to False." 97 | )) 98 | } else { 99 | cli::cli_abort(gettext( 100 | "Connection error, and we cannot find the requested files in", 101 | " the disk cache. Please try again or make sure your Internet", 102 | " connection is on." 
103 | )) 104 | } 105 | } 106 | 107 | if (is.null(etag)) cli::cli_abort(gettext("etag must have been retrieved from server")) 108 | if (is.null(commit_hash)) cli::cli_abort(gettext("commit_hash must have been retrieved from server")) 109 | 110 | blob_path <- fs::path(storage_folder, "blobs", etag) 111 | pointer_path <- get_pointer_path(storage_folder, commit_hash, filename) 112 | 113 | fs::dir_create(fs::path_dir(blob_path)) 114 | fs::dir_create(fs::path_dir(pointer_path)) 115 | 116 | # if passed revision is not identical to commit_hash 117 | # then revision has to be a branch name or tag name. 118 | # In that case store a ref. 119 | # we write an alias between revision and commit-hash 120 | if (revision != commit_hash) { 121 | ref_path <- fs::path(storage_folder, "refs", revision) 122 | fs::dir_create(fs::path_dir(ref_path)) 123 | fs::file_create(ref_path) 124 | writeLines(commit_hash, ref_path) 125 | } 126 | 127 | if (fs::file_exists(pointer_path) && !force_download) { 128 | return(pointer_path) 129 | } 130 | 131 | if (fs::file_exists(blob_path) && !force_download) { 132 | fs::link_create(blob_path, pointer_path) 133 | return(pointer_path) 134 | } 135 | 136 | withr::with_tempfile("tmp", { 137 | lock <- filelock::lock(paste0(blob_path, ".lock")) 138 | on.exit({filelock::unlock(lock)}) 139 | tryCatch({ 140 | bar_id <- cli::cli_progress_bar( 141 | name = filename, 142 | total = if (is.numeric(expected_size)) expected_size else NA, 143 | type = "download", 144 | ) 145 | progress <- function(down, up) { 146 | if (down[1] != 0) { 147 | cli::cli_progress_update(total = down[1], set = down[2], id = bar_id) 148 | } 149 | TRUE 150 | } 151 | handle <- curl::new_handle(noprogress = FALSE, progressfunction = progress) 152 | curl::handle_setheaders(handle, .list = hub_headers()) 153 | curl::curl_download(url, tmp, handle = handle, quiet = FALSE) 154 | cli::cli_progress_done(id = bar_id) 155 | }, error = function(err) { 156 | cli::cli_abort(gettext("Error downloading from {.url 
{url}}"), parent = err) 157 | }) 158 | fs::file_move(tmp, blob_path) 159 | 160 | # fs::link_create doesn't work for linking files on windows. 161 | try(fs::file_delete(pointer_path), silent = TRUE) # delete the link to avoid warnings 162 | file.symlink(blob_path, pointer_path) 163 | }) 164 | 165 | pointer_path 166 | } 167 | 168 | hub_url <- function(repo_id, filename, ..., revision = "main", repo_type = "model") { 169 | if (repo_type == "model") { 170 | glue::glue("https://huggingface.co/{repo_id}/resolve/{revision}/{filename}") 171 | } else { 172 | glue::glue("https://huggingface.co/{repo_type}s/{repo_id}/resolve/{revision}/{filename}") 173 | } 174 | } 175 | 176 | get_pointer_path <- function(storage_folder, revision, relative_filename) { 177 | snapshot_path <- fs::path(storage_folder, "snapshots", revision) 178 | pointer_path <- fs::path(snapshot_path, relative_filename) 179 | attr(pointer_path, "snapshot_path") <- snapshot_path 180 | pointer_path 181 | } 182 | 183 | repo_folder_name <- function(repo_id, repo_type = "model") { 184 | repo_id <- gsub(pattern = "/", x = repo_id, replacement = REPO_ID_SEPARATOR()) 185 | glue::glue("{repo_type}s{REPO_ID_SEPARATOR()}{repo_id}") 186 | } 187 | 188 | hub_headers <- function() { 189 | headers <- c("user-agent" = "hfhub/0.0.1") 190 | 191 | token <- Sys.getenv("HUGGING_FACE_HUB_TOKEN", unset = "") 192 | if (!nzchar(token)) 193 | token <- Sys.getenv("HUGGINGFACE_HUB_TOKEN", unset = "") 194 | 195 | if (nzchar(token)) { 196 | headers["authorization"] <- paste0("Bearer ", token) 197 | } 198 | 199 | headers 200 | } 201 | 202 | #' @importFrom rlang %||% 203 | get_file_metadata <- function(url) { 204 | 205 | headers <- hub_headers() 206 | headers["Accept-Encoding"] <- "identity" 207 | 208 | req <- reqst(httr::HEAD, 209 | url = url, 210 | httr::config(followlocation = FALSE), 211 | httr::add_headers(.headers = headers), 212 | follow_relative_redirects = TRUE 213 | ) 214 | list( 215 | location = grab_from_headers(req, "location") 
%||% req$url, 216 | commit_hash = grab_from_headers(req, "x-repo-commit"), 217 | etag = normalize_etag(grab_from_headers(req, c(HUGGINGFACE_HEADER_X_LINKED_ETAG(), "etag"))), 218 | size = as.numeric(grab_from_headers(req, "content-length")) # numeric, not integer: sizes above 2^31 - 1 bytes overflow as.integer() to NA 219 | ) 220 | } 221 | 222 | grab_from_headers <- function(req, nms) { 223 | headers <- req$all_headers 224 | for (nm in nms) { 225 | nm <- tolower(nm) 226 | 227 | for(h in headers) { 228 | header <- h$headers 229 | if (!is.null(names(header))) names(header) <- tolower(names(header)) # lowercase this response's header names so the lookup below is case-insensitive 230 | 231 | if (!is.null(header[[nm]])) 232 | return(header[[nm]]) 233 | } 234 | } 235 | NULL 236 | } 237 | 238 | normalize_etag <- function(etag) { 239 | if (is.null(etag)) return(NULL) 240 | etag <- gsub(pattern = '"', x = etag, replacement = "") 241 | etag <- gsub(pattern = "W/", x = etag, replacement = "") 242 | etag 243 | } 244 | 245 | REPO_ID_SEPARATOR <- function() { 246 | "--" 247 | } 248 | HUGGINGFACE_HUB_CACHE <- function() { 249 | # we use the same cache structure as the Python library - which is useful for 250 | # numerous reasons. Thus we don't use R's tools for cache handling such as 251 | # rappdirs or R_user_dir. 252 | path <- Sys.getenv("HUGGINGFACE_HUB_CACHE", "~/.cache/huggingface/hub") 253 | fs::path_expand(path) 254 | } 255 | REGEX_COMMIT_HASH <- function() { 256 | "^[0-9a-f]{40}$" 257 | } 258 | 259 | #' Weight file names in HUB 260 | #' 261 | #' @describeIn WEIGHTS_NAME Name of weights file 262 | #' 263 | #' @returns A string with the default file names for indexes in the Hugging Face Hub. 
264 | #' @examples 265 | #' WEIGHTS_NAME() 266 | #' WEIGHTS_INDEX_NAME() 267 | #' @export 268 | WEIGHTS_NAME <- function() "pytorch_model.bin" 269 | #' @export 270 | #' @describeIn WEIGHTS_NAME Name of weights index file 271 | WEIGHTS_INDEX_NAME <- function() "pytorch_model.bin.index.json" 272 | 273 | HUGGINGFACE_HEADER_X_LINKED_ETAG <- function() "X-Linked-Etag" 274 | 275 | reqst <- function(method, url, ..., follow_relative_redirects = FALSE) { 276 | if (follow_relative_redirects) { 277 | r <- reqst(method, url, ..., follow_relative_redirects = FALSE) 278 | if (r$status_code >= 300 && r$status_code <= 399) { 279 | redirect_url <- urltools::url_parse(httr::headers(r)$location) 280 | if (is.na(redirect_url$domain)) { 281 | p <- urltools::url_parse(url) 282 | p$path <- redirect_url$path 283 | url <- urltools::url_compose(p) 284 | return(reqst(method, url, ..., follow_relative_redirects = TRUE)) 285 | } 286 | } 287 | } 288 | method(url, ...) 289 | } 290 | 291 | utils::globalVariables("tmp") 292 | 293 | -------------------------------------------------------------------------------- /R/hub_info.R: -------------------------------------------------------------------------------- 1 | #' Queries information about Hub repositories 2 | #' 3 | #' @inheritParams hub_download 4 | #' @param files_metadata Obtain files metadata information when querying repository information. 
5 | #' @export 6 | hub_repo_info <- function(repo_id, ..., repo_type = NULL, revision = NULL, files_metadata = FALSE) { 7 | if (is.null(repo_type) || repo_type == "model") { # a NULL repo_type is treated like "model" 8 | path <- glue::glue("https://huggingface.co/api/models/{repo_id}") 9 | } else { 10 | path <- glue::glue("https://huggingface.co/api/{repo_type}s/{repo_id}") 11 | } 12 | 13 | if (!is.null(revision)) { 14 | path <- glue::glue("{path}/revision/{revision}") 15 | } 16 | 17 | params <- list() 18 | if (files_metadata) { 19 | params$blobs <- TRUE 20 | } 21 | 22 | headers <- hub_headers() 23 | 24 | results <- httr::GET( 25 | path, 26 | query = params, 27 | httr::add_headers(.headers = headers) 28 | ) 29 | 30 | httr::content(results) # returns the parsed response body; the HTTP status is not checked here 31 | } 32 | 33 | #' @describeIn hub_repo_info Query information from a Hub Dataset 34 | #' @export 35 | hub_dataset_info <- function(repo_id, ..., revision = NULL, files_metadata = FALSE) { 36 | hub_repo_info( 37 | repo_id, 38 | revision = revision, 39 | repo_type = "dataset", 40 | files_metadata = files_metadata 41 | ) 42 | } 43 | 44 | -------------------------------------------------------------------------------- /R/hub_snapshot.R: -------------------------------------------------------------------------------- 1 | #' Snapshot the entire repository 2 | #' 3 | #' Downloads and stores all files from a Hugging Face Hub repository. 4 | #' @inheritParams hub_download 5 | #' @param allow_patterns A character vector containing patterns that are used to 6 | #' filter allowed files to snapshot. 7 | #' @param ignore_patterns A character vector containing patterns to reject files 8 | #' from being downloaded. 
9 | #' 10 | #' @export 11 | hub_snapshot <- function(repo_id, ..., revision = "main", repo_type = "model", 12 | local_files_only = FALSE, force_download = FALSE, 13 | allow_patterns = NULL, ignore_patterns = NULL) { 14 | info <- hub_repo_info(repo_id, repo_type = repo_type, revision = revision) # resolve the requested revision rather than always the default branch 15 | all_files <- vapply(info$siblings, function(x) x$rfilename, character(1)) # type-stable: always a character vector, even when there are no siblings 16 | 17 | allowed_files <- all_files 18 | if (!is.null(allow_patterns)) { 19 | allowed_files <- lapply(allow_patterns, function(x) { 20 | all_files[grepl(x, all_files)] # match one pattern at a time; grepl() only honours a single pattern 21 | }) 22 | allowed_files <- unique(unlist(allowed_files)) 23 | } 24 | 25 | files <- allowed_files 26 | if (!is.null(ignore_patterns)) { 27 | for (pattern in ignore_patterns) { 28 | files <- files[!grepl(pattern, files)] 29 | } 30 | } 31 | 32 | id <- cli::cli_progress_bar( 33 | name = "Downloading files", 34 | type = "tasks", 35 | total = length(files), 36 | clear = FALSE 37 | ) 38 | 39 | i <- 0 40 | cli::cli_progress_step("Snapshotting files {i}/{length(files)}") 41 | for (i in seq_along(files)) { 42 | d <- hub_download( 43 | repo_id = repo_id, 44 | filename = files[i], 45 | revision = info$sha, 46 | repo_type = repo_type, 47 | local_files_only = local_files_only, 48 | force_download = force_download 49 | ) 50 | } 51 | 52 | attr(d, "snapshot_path") 53 | } 54 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | # hfhub 3 | 4 | 5 | [![R-CMD-check](https://github.com/mlverse/hfhub/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/mlverse/hfhub/actions/workflows/R-CMD-check.yaml) 6 | 7 | 8 | hfhub is a minimal port of [huggingface_hub](https://github.com/huggingface/huggingface_hub) that allows downloading files from Hugging Face Hub and caching them with the same structure used in the original implementation. 
9 | 10 | ## Installation 11 | 12 | `hfhub` can be installed from CRAN with: 13 | 14 | ``` 15 | install.packages("hfhub") 16 | ``` 17 | 18 | You can install the development version of hfhub like so: 19 | 20 | ``` r 21 | remotes::install_github("mlverse/hfhub") 22 | ``` 23 | 24 | ## Example 25 | 26 | `hub_download` is the main exported function in the package and can be used to 27 | download and cache a file from any Hugging Face Hub repository. It returns a 28 | path to the file. 29 | 30 | ``` r 31 | library(hfhub) 32 | path <- hub_download("gpt2", "config.json") 33 | str(jsonlite::fromJSON(path)) 34 | ``` 35 | 36 | ## Authentication 37 | 38 | You can set the `HUGGING_FACE_HUB_TOKEN` environment variable with the value 39 | of a token obtained in the Access Token section of the Hugging Face account settings page. 40 | This will allow you to download private files from Hugging Face Hub. 41 | -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: https://mlverse.github.io/hfhub/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | Re-submission making requested changes. 
2 | -------------------------------------------------------------------------------- /hfhub.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | LineEndingConversion: Posix 18 | 19 | BuildType: Package 20 | PackageUseDevtools: Yes 21 | PackageInstallArgs: --no-multiarch --with-keep.source 22 | PackageRoxygenize: rd,collate,namespace 23 | -------------------------------------------------------------------------------- /inst/po/fr/LC_MESSAGES/R-hfhub.mo: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mlverse/hfhub/acf8584d50c9f2531d54c3942f4a872db56bdfb1/inst/po/fr/LC_MESSAGES/R-hfhub.mo -------------------------------------------------------------------------------- /man/WEIGHTS_NAME.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hub_download.R 3 | \name{WEIGHTS_NAME} 4 | \alias{WEIGHTS_NAME} 5 | \alias{WEIGHTS_INDEX_NAME} 6 | \title{Weight file names in HUB} 7 | \usage{ 8 | WEIGHTS_NAME() 9 | 10 | WEIGHTS_INDEX_NAME() 11 | } 12 | \value{ 13 | A string with the default file names for indexes in the Hugging Face Hub. 
14 | } 15 | \description{ 16 | Weight file names in HUB 17 | } 18 | \section{Functions}{ 19 | \itemize{ 20 | \item \code{WEIGHTS_NAME()}: Name of weights file 21 | 22 | \item \code{WEIGHTS_INDEX_NAME()}: Name of weights index file 23 | 24 | }} 25 | \examples{ 26 | WEIGHTS_NAME() 27 | WEIGHTS_INDEX_NAME() 28 | } 29 | -------------------------------------------------------------------------------- /man/hub_download.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hub_download.R 3 | \name{hub_download} 4 | \alias{hub_download} 5 | \title{Downloads files from HuggingFace repositories} 6 | \usage{ 7 | hub_download( 8 | repo_id, 9 | filename, 10 | ..., 11 | revision = "main", 12 | repo_type = "model", 13 | local_files_only = FALSE, 14 | force_download = FALSE 15 | ) 16 | } 17 | \arguments{ 18 | \item{repo_id}{The repository identifier, eg \code{"bert-base-uncased"} or \code{"deepset/sentence_bert"}.} 19 | 20 | \item{filename}{Filename to download from the repository. Example \code{"config.json"}.} 21 | 22 | \item{...}{currenytly unused.} 23 | 24 | \item{revision}{Revision (branch, tag or commitid) to download the file from.} 25 | 26 | \item{repo_type}{The type of the repository. Currently only \code{"model"} is supported.} 27 | 28 | \item{local_files_only}{Only use cached files?} 29 | 30 | \item{force_download}{For re-downloading of files that are cached.} 31 | } 32 | \value{ 33 | The file path of the downloaded or cached file. The snapshot path is returned 34 | as an attribute. 
35 | } 36 | \description{ 37 | Downloads files from HuggingFace repositories 38 | } 39 | \examples{ 40 | try({ 41 | withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tempdir()), { 42 | path <- hub_download("gpt2", "config.json") 43 | print(path) 44 | str(jsonlite::fromJSON(path)) 45 | }) 46 | }) 47 | 48 | } 49 | -------------------------------------------------------------------------------- /man/hub_repo_info.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hub_info.R 3 | \name{hub_repo_info} 4 | \alias{hub_repo_info} 5 | \alias{hub_dataset_info} 6 | \title{Queries information about Hub repositories} 7 | \usage{ 8 | hub_repo_info( 9 | repo_id, 10 | ..., 11 | repo_type = NULL, 12 | revision = NULL, 13 | files_metadata = FALSE 14 | ) 15 | 16 | hub_dataset_info(repo_id, ..., revision = NULL, files_metadata = FALSE) 17 | } 18 | \arguments{ 19 | \item{repo_id}{The repository identifier, eg \code{"bert-base-uncased"} or \code{"deepset/sentence_bert"}.} 20 | 21 | \item{...}{currenytly unused.} 22 | 23 | \item{repo_type}{The type of the repository. 
Currently only \code{"model"} is supported.} 24 | 25 | \item{revision}{Revision (branch, tag or commitid) to download the file from.} 26 | 27 | \item{files_metadata}{Obtain files metadata information when querying repository information.} 28 | } 29 | \description{ 30 | Queries information about Hub repositories 31 | } 32 | \section{Functions}{ 33 | \itemize{ 34 | \item \code{hub_dataset_info()}: Query information from a Hub Dataset 35 | 36 | }} 37 | -------------------------------------------------------------------------------- /man/hub_snapshot.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hub_snapshot.R 3 | \name{hub_snapshot} 4 | \alias{hub_snapshot} 5 | \title{Snapshot the entire repository} 6 | \usage{ 7 | hub_snapshot( 8 | repo_id, 9 | ..., 10 | revision = "main", 11 | repo_type = "model", 12 | local_files_only = FALSE, 13 | force_download = FALSE, 14 | allow_patterns = NULL, 15 | ignore_patterns = NULL 16 | ) 17 | } 18 | \arguments{ 19 | \item{repo_id}{The repository identifier, eg \code{"bert-base-uncased"} or \code{"deepset/sentence_bert"}.} 20 | 21 | \item{...}{currenytly unused.} 22 | 23 | \item{revision}{Revision (branch, tag or commitid) to download the file from.} 24 | 25 | \item{repo_type}{The type of the repository. Currently only \code{"model"} is supported.} 26 | 27 | \item{local_files_only}{Only use cached files?} 28 | 29 | \item{force_download}{For re-downloading of files that are cached.} 30 | 31 | \item{allow_patterns}{A character vector containing patters that are used to 32 | filter allowed files to snapshot.} 33 | 34 | \item{ignore_patterns}{A character vector contaitning patterns to reject files 35 | from being downloaded.} 36 | } 37 | \description{ 38 | Downloads and stores all files from a Hugging Face Hub repository. 
39 | } 40 | -------------------------------------------------------------------------------- /po/R-fr.po: -------------------------------------------------------------------------------- 1 | msgid "" 2 | msgstr "" 3 | "Project-Id-Version: hfhub 0.1.1.9000\n" 4 | "POT-Creation-Date: 2024-09-04 19:28+0200\n" 5 | "PO-Revision-Date: 2024-09-04 19:49+0200\n" 6 | "Last-Translator: \n" 7 | "Language-Team: \n" 8 | "Language: fr\n" 9 | "MIME-Version: 1.0\n" 10 | "Content-Type: text/plain; charset=UTF-8\n" 11 | "Content-Transfer-Encoding: 8bit\n" 12 | "X-Generator: Poedit 3.4.3\n" 13 | 14 | #: hub_download.R:47 15 | msgid "" 16 | "Distant resource does not seem to be on huggingface.co (missing commit " 17 | "header)." 18 | msgstr "" 19 | "La ressource ne semble pas être disponible sur huggingface.co (début de " 20 | "commit manquant)." 21 | 22 | #: hub_download.R:52 23 | msgid "" 24 | "Distant resource does not have an ETag, we won't be able to reliably ensure " 25 | "reproducibility." 26 | msgstr "" 27 | "La ressource distante n'a pas d'ETag, il n'y aura donc pas de garantie de " 28 | "reproductibilité." 29 | 30 | #: hub_download.R:94 31 | msgid "Cannot find the requested files in the disk cache and" 32 | msgstr "" 33 | "Impossible de trouver les fichiers demandés dans le cache du disque et" 34 | 35 | #: hub_download.R:95 36 | msgid "outgoing traffic has been disabled. To enable hf.co look-ups" 37 | msgstr "le trafic sortant a été désactivé. Pour activer les requêtes hf.co" 38 | 39 | #: hub_download.R:96 40 | msgid "and downloads online, set 'local_files_only' to False." 41 | msgstr "" 42 | "et les téléchargements en ligne, définissez 'local_files_only = FALSE'." 43 | 44 | #: hub_download.R:100 45 | msgid "Connection error, and we cannot find the requested files in" 46 | msgstr "" 47 | "Erreur de connexion, et impossible de trouver les fichiers demandés dans" 48 | 49 | #: hub_download.R:101 50 | msgid "the disk cache. 
Please try again or make sure your Internet" 51 | msgstr "" 52 | "le cache du disque. Veuillez réessayer ou vérifiez que votre connexion " 53 | "Internet" 54 | 55 | #: hub_download.R:102 56 | msgid "connection is on." 57 | msgstr "est activée." 58 | 59 | #: hub_download.R:107 60 | msgid "etag must have been retrieved from server" 61 | msgstr "l'ETag doit être téléchargé depuis le serveur." 62 | 63 | #: hub_download.R:108 64 | msgid "commit_hash must have been retrieved from server" 65 | msgstr "'commit_hash' doit être téléchargé du serveur." 66 | 67 | #: hub_download.R:156 68 | msgid "Error downloading from {.url {url}}" 69 | msgstr "Erreur de téléchargement depuis {.url {url}}" 70 | 71 | #: hub_download.R:170 72 | msgid "https://huggingface.co/{repo_id}/resolve/{revision}/{filename}" 73 | msgstr "https://huggingface.co/{repo_id}/resolve/{revision}/{filename}" 74 | 75 | #: hub_download.R:172 76 | msgid "" 77 | "https://huggingface.co/{repo_type}s/{repo_id}/resolve/{revision}/{filename}" 78 | msgstr "" 79 | "https://huggingface.co/{repo_type}s/{repo_id}/resolve/{revision}/{filename}" 80 | 81 | #: hub_download.R:185 82 | msgid "{repo_type}s{REPO_ID_SEPARATOR()}{repo_id}" 83 | msgstr "{repo_type}s{REPO_ID_SEPARATOR()}{repo_id}" 84 | 85 | #: hub_info.R:8 86 | msgid "https://huggingface.co/api/models/{repo_id}" 87 | msgstr "https://huggingface.co/api/models/{repo_id}" 88 | 89 | #: hub_info.R:10 90 | msgid "https://huggingface.co/api/{repo_type}s/{repo_id}" 91 | msgstr "https://huggingface.co/api/{repo_type}s/{repo_id}" 92 | 93 | #: hub_info.R:14 94 | msgid "{path}/revision/{revision}" 95 | msgstr "{path}/revision/{revision}" 96 | -------------------------------------------------------------------------------- /po/R-hfhub.pot: -------------------------------------------------------------------------------- 1 | msgid "" 2 | msgstr "" 3 | "Project-Id-Version: hfhub 0.1.1.9000\n" 4 | "POT-Creation-Date: 2024-09-04 19:28+0200\n" 5 | "PO-Revision-Date: YEAR-MO-DA 
HO:MI+ZONE\n" 6 | "Last-Translator: FULL NAME \n" 7 | "Language-Team: LANGUAGE \n" 8 | "Language: \n" 9 | "MIME-Version: 1.0\n" 10 | "Content-Type: text/plain; charset=UTF-8\n" 11 | "Content-Transfer-Encoding: 8bit\n" 12 | 13 | #: hub_download.R:47 14 | msgid "" 15 | "Distant resource does not seem to be on huggingface.co (missing commit " 16 | "header)." 17 | msgstr "" 18 | 19 | #: hub_download.R:52 20 | msgid "" 21 | "Distant resource does not have an ETag, we won't be able to reliably ensure " 22 | "reproducibility." 23 | msgstr "" 24 | 25 | #: hub_download.R:94 26 | msgid "Cannot find the requested files in the disk cache and" 27 | msgstr "" 28 | 29 | #: hub_download.R:95 30 | msgid "outgoing traffic has been disabled. To enable hf.co look-ups" 31 | msgstr "" 32 | 33 | #: hub_download.R:96 34 | msgid "and downloads online, set 'local_files_only' to False." 35 | msgstr "" 36 | 37 | #: hub_download.R:100 38 | msgid "Connection error, and we cannot find the requested files in" 39 | msgstr "" 40 | 41 | #: hub_download.R:101 42 | msgid "the disk cache. Please try again or make sure your Internet" 43 | msgstr "" 44 | 45 | #: hub_download.R:102 46 | msgid "connection is on." 
47 | msgstr "" 48 | 49 | #: hub_download.R:107 50 | msgid "etag must have been retrieved from server" 51 | msgstr "" 52 | 53 | #: hub_download.R:108 54 | msgid "commit_hash must have been retrieved from server" 55 | msgstr "" 56 | 57 | #: hub_download.R:156 58 | msgid "Error downloading from {.url {url}}" 59 | msgstr "" 60 | 61 | #: hub_download.R:170 62 | msgid "https://huggingface.co/{repo_id}/resolve/{revision}/{filename}" 63 | msgstr "" 64 | 65 | #: hub_download.R:172 66 | msgid "" 67 | "https://huggingface.co/{repo_type}s/{repo_id}/resolve/{revision}/{filename}" 68 | msgstr "" 69 | 70 | #: hub_download.R:185 71 | msgid "{repo_type}s{REPO_ID_SEPARATOR()}{repo_id}" 72 | msgstr "" 73 | 74 | #: hub_info.R:8 75 | msgid "https://huggingface.co/api/models/{repo_id}" 76 | msgstr "" 77 | 78 | #: hub_info.R:10 79 | msgid "https://huggingface.co/api/{repo_type}s/{repo_id}" 80 | msgstr "" 81 | 82 | #: hub_info.R:14 83 | msgid "{path}/revision/{revision}" 84 | msgstr "" 85 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | # This file is part of the standard setup for testthat. 2 | # It is recommended that you do not modify it. 3 | # 4 | # Where should you do additional test configuration? 
5 | # Learn more about the roles of various files in: 6 | # * https://r-pkgs.org/tests.html 7 | # * https://testthat.r-lib.org/reference/test_package.html#special-files 8 | 9 | library(testthat) 10 | library(hfhub) 11 | 12 | test_check("hfhub") 13 | -------------------------------------------------------------------------------- /tests/testthat/_snaps/hub_snapshot.md: -------------------------------------------------------------------------------- 1 | # snapshot 2 | 3 | Code 4 | p <- hub_snapshot("dfalbel/cran-packages", repo_type = "dataset", 5 | allow_patterns = "\\.R") 6 | Message 7 | i Snapshotting files 0/4 8 | v Snapshotting files 4/4 [0ms] 9 | 10 | 11 | -------------------------------------------------------------------------------- /tests/testthat/helper-skips.R: -------------------------------------------------------------------------------- 1 | skip_if_no_token <- function() { 2 | token <- Sys.getenv("HUGGINGFACE_HUB_TOKEN", "") 3 | if (token == "") { 4 | token <- Sys.getenv("HUGGING_FACE_HUB_TOKEN", "") 5 | } 6 | 7 | if (token == "") 8 | skip("No auth token set.") 9 | } 10 | -------------------------------------------------------------------------------- /tests/testthat/test-hub_download.R: -------------------------------------------------------------------------------- 1 | skip_on_cran() 2 | 3 | test_that("hub_download", { 4 | file <- hub_download("gpt2", filename = "config.json") 5 | 6 | expect_equal( 7 | jsonlite::fromJSON(file)$architectures, 8 | "GPT2LMHeadModel" 9 | ) 10 | 11 | file <- hub_download("gpt2", filename = "config.json", force_download = TRUE) 12 | expect_equal( 13 | jsonlite::fromJSON(file)$architectures, 14 | "GPT2LMHeadModel" 15 | ) 16 | 17 | file <- hub_download("gpt2", filename = "config.json", local_files_only = TRUE) 18 | expect_equal( 19 | jsonlite::fromJSON(file)$architectures, 20 | "GPT2LMHeadModel" 21 | ) 22 | 23 | tmp <- tempfile() 24 | dir.create(tmp) 25 | withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tmp), { 26 | file <- 
hub_download("gpt2", filename = "config.json") 27 | }) 28 | expect_equal(list.files(tmp), "models--gpt2") 29 | }) 30 | 31 | test_that("can download from private repo", { 32 | 33 | skip_if_no_token() 34 | 35 | expect_error(regexp = NA, { 36 | hub_download( 37 | repo_id = "dfalbel/test-hfhub", 38 | filename = ".gitattributes", 39 | force_download = TRUE 40 | ) 41 | }) 42 | 43 | expect_error(regexp = NA, { 44 | hub_download( 45 | repo_id = "dfalbel/test-hfhub", 46 | filename = "hello.safetensors", 47 | force_download = TRUE 48 | ) 49 | }) 50 | 51 | }) 52 | -------------------------------------------------------------------------------- /tests/testthat/test-hub_info.R: -------------------------------------------------------------------------------- 1 | skip_on_cran() 2 | 3 | test_that("dataset info", { 4 | info <- hub_dataset_info("dfalbel/cran-packages") 5 | expect_equal(info$author, "dfalbel") 6 | expect_true(length(info$siblings) >= 13) 7 | }) 8 | 9 | test_that("can get info for private repositories", { 10 | skip_if_no_token() 11 | 12 | info <- hub_dataset_info("dfalbel/test-hfhub-dataset") 13 | expect_equal(info$author, "dfalbel") 14 | }) 15 | -------------------------------------------------------------------------------- /tests/testthat/test-hub_snapshot.R: -------------------------------------------------------------------------------- 1 | skip_on_cran() 2 | 3 | test_that("snapshot", { 4 | expect_snapshot({ 5 | p <- hub_snapshot("dfalbel/cran-packages", repo_type = "dataset", allow_patterns = "\\.R") 6 | }, 7 | transform = function(x) { 8 | sub("\\[[0-9\\.]+[a-z]+\\]", "[0ms]", x = x) 9 | }) 10 | 11 | expect_true(length(fs::dir_ls(p)) >= 4) 12 | }) 13 | 14 | test_that("can snapshot private repositories", { 15 | 16 | skip_if_no_token() 17 | 18 | expect_error(regexp=NA, { 19 | hub_snapshot("dfalbel/test-hfhub", repo_type = "model", force_download = TRUE) 20 | }) 21 | 22 | }) 23 | --------------------------------------------------------------------------------
# tests/testthat/test-message-translations.R

# Checks that the error raised by `hub_download()` is reported in French when
# the session language is switched to "fr". The expected text is the start of
# the French `msgstr` for "Distant resource does not seem to be on
# huggingface.co (missing commit header)." in po/R-fr.po.
test_that("R-level cli_abort messages are correctly translated in FR", {
  # Skipped when no Hugging Face token is configured in the environment.
  skip_if_no_token()

  # Use the session temporary directory as the Hub cache for this call only;
  # `with_envvar()` restores the previous value afterwards.
  withr::with_envvar(c(HUGGINGFACE_HUB_CACHE = tempdir()), {
    withr::with_language(
      lang = "fr",
      # The expectation is intentionally NOT wrapped in `try()`: testthat
      # raises a failed expectation as an error condition, so an enclosing
      # `try()` would catch it and the test could never report a failure.
      expect_error(
        hub_download(
          # NOTE(review): presumably a repo id that does not resolve on the
          # Hub (sibling tests use "dfalbel/test-hfhub") -- confirm.
          repo_id = "dfalbel/test-hfh",
          filename = ".gitattributes",
          force_download = TRUE
        ),
        regexp = "La ressource ne semble pas être disponible sur"
      )
    )
  })
})