├── .github ├── .gitignore └── workflows │ ├── pkgdown.yaml │ └── R-CMD-check.yaml ├── R ├── ucidata.R ├── car_eval_docs.R ├── autompg_docs.R ├── bridges_docs.R ├── glass_docs.R ├── hepatitis_docs.R ├── wine_docs.R ├── bcw_original_docs.R ├── abalone_docs.R ├── forest_fires_docs.R ├── adult_docs.R ├── heart_disease_processed_docs.R ├── autoimports_docs.R └── bike_sharing_daily_docs.R ├── NAMESPACE ├── data ├── wine.rda ├── adult.rda ├── glass.rda ├── abalone.rda ├── autompg.rda ├── bridges.rda ├── car_eval.rda ├── hepatitis.rda ├── autoimports.rda ├── bcw_original.rda ├── forest_fires.rda ├── heart_disease_ch.rda ├── heart_disease_cl.rda ├── heart_disease_hu.rda ├── heart_disease_va.rda └── bike_sharing_daily.rda ├── .gitignore ├── _pkgdown.yml ├── .Rbuildignore ├── data-raw ├── forest_fires_build.R ├── autompg_build.R ├── car_eval_build.R ├── abalone_build.R ├── adult_build.R ├── bridges_build.R ├── wine_build.R ├── autoimports_build.R ├── glass_build.R ├── hepatitis_build.R ├── bcw_original_build.R ├── bike_sharing_daily_build.R └── heart_disease_build.R ├── ucidata.Rproj ├── DESCRIPTION ├── cran-comments.md ├── man ├── ucidata-package.Rd ├── car_eval.Rd ├── autompg.Rd ├── glass.Rd ├── bridges.Rd ├── wine.Rd ├── bcw_original.Rd ├── hepatitis.Rd ├── abalone.Rd ├── forest_fires.Rd ├── adult.Rd ├── heart_disease.Rd ├── autoimports.Rd └── bike_sharing_daily.Rd ├── NEWS.md ├── README.md └── README.Rmd /.github/.gitignore: -------------------------------------------------------------------------------- 1 | *.html 2 | -------------------------------------------------------------------------------- /R/ucidata.R: -------------------------------------------------------------------------------- 1 | #' @keywords internal 2 | "_PACKAGE" 3 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | 
-------------------------------------------------------------------------------- /data/wine.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/wine.rda -------------------------------------------------------------------------------- /data/adult.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/adult.rda -------------------------------------------------------------------------------- /data/glass.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/glass.rda -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .DS_Store 6 | docs 7 | -------------------------------------------------------------------------------- /data/abalone.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/abalone.rda -------------------------------------------------------------------------------- /data/autompg.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/autompg.rda -------------------------------------------------------------------------------- /data/bridges.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/bridges.rda -------------------------------------------------------------------------------- /data/car_eval.rda: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/car_eval.rda -------------------------------------------------------------------------------- /data/hepatitis.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/hepatitis.rda -------------------------------------------------------------------------------- /data/autoimports.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/autoimports.rda -------------------------------------------------------------------------------- /data/bcw_original.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/bcw_original.rda -------------------------------------------------------------------------------- /data/forest_fires.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/forest_fires.rda -------------------------------------------------------------------------------- /_pkgdown.yml: -------------------------------------------------------------------------------- 1 | url: http://r-pkg.thecoatlessprofessor.com/ucidata/ 2 | template: 3 | bootstrap: 5 4 | 5 | -------------------------------------------------------------------------------- /data/heart_disease_ch.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/heart_disease_ch.rda -------------------------------------------------------------------------------- /data/heart_disease_cl.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/heart_disease_cl.rda 
-------------------------------------------------------------------------------- /data/heart_disease_hu.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/heart_disease_hu.rda -------------------------------------------------------------------------------- /data/heart_disease_va.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/heart_disease_va.rda -------------------------------------------------------------------------------- /data/bike_sharing_daily.rda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/coatless-rpkg/ucidata/HEAD/data/bike_sharing_daily.rda -------------------------------------------------------------------------------- /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^data-raw$ 4 | ^README\.Rmd$ 5 | ^README-.*\.png$ 6 | ^\.travis\.yml$ 7 | ^cran-comments\.md$ 8 | ^\.github$ 9 | ^_pkgdown\.yml$ 10 | ^docs$ 11 | ^pkgdown$ 12 | -------------------------------------------------------------------------------- /data-raw/forest_fires_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## Forest Fire Data https://archive.ics.uci.edu/ml/datasets/Forest+Fires 3 | 4 | url_forest_fires = "https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv" 5 | 6 | forest_fires = read.csv(url_forest_fires, header = TRUE) 7 | 8 | usethis::use_data(forest_fires, overwrite = TRUE) 9 | -------------------------------------------------------------------------------- /ucidata.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | 
AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | AutoAppendNewline: Yes 16 | StripTrailingWhitespace: Yes 17 | 18 | BuildType: Package 19 | PackageUseDevtools: Yes 20 | PackageInstallArgs: --no-multiarch --with-keep.source 21 | PackageRoxygenize: rd,collate,namespace 22 | -------------------------------------------------------------------------------- /data-raw/autompg_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## autompg Data https://archive.ics.uci.edu/ml/datasets/auto+mpg 3 | 4 | # NOTE(review): per the UCI description, missing horsepower values are coded 5 | # as "?"; read them as NA so the column stays numeric instead of character. 6 | autompg = read.table( 7 | "http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data", 8 | quote = "\"", 9 | comment.char = "", 10 | na.strings = "?", 11 | stringsAsFactors = FALSE, 12 | header = FALSE) 13 | 14 | colnames(autompg) = c("mpg", "cylinders", "displacement", "horsepower", 15 | "weight", "acceleration", "model_year", "origin", "car_name") 16 | 17 | usethis::use_data(autompg, overwrite = TRUE) 18 | -------------------------------------------------------------------------------- /data-raw/car_eval_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## Car Evaluation Data https://archive.ics.uci.edu/ml/datasets/Car+Evaluation 3 | 4 | url_car_eval = "https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data" 5 | 6 | car_eval = read.csv(url_car_eval, header = FALSE) 7 | 8 | colnames(car_eval) = c("buying", 9 | "maint", 10 | "doors", 11 | "persons", 12 | "lug_boot", 13 | "safety", 14 | "class_value") 15 | 16 | usethis::use_data(car_eval, overwrite = TRUE) 17 | -------------------------------------------------------------------------------- /data-raw/abalone_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## Abalone Data 
https://archive.ics.uci.edu/ml/datasets/Abalone 3 | 4 | url_abalone = "http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data" 5 | 6 | abalone = read.csv(url_abalone, header = FALSE) 7 | 8 | colnames(abalone) = c("sex", 9 | "length", 10 | "diameter", 11 | "height", 12 | "whole_weight", 13 | "shucked_weight", 14 | "viscera_weight", 15 | "shell_weight", 16 | "rings") 17 | 18 | # Save dataset 19 | usethis::use_data(abalone, overwrite = TRUE) 20 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: ucidata 2 | Title: Collection of Datasets from the UC Irvine Machine Learning Repository 3 | Version: 0.0.3 4 | Authors@R: person("James", "Balamuta", email = "balamut2@illinois.edu", role = c("aut", "cre")) 5 | Description: Select datasets from the UC Irvine 6 | Machine Learning Repository that conform to being reasonable in 7 | size while also allowing for a wide variety of visualizations and models 8 | to be formed. 
9 | Depends: R (>= 4.1.0) 10 | License: GPL (>= 2) 11 | URL: https://github.com/coatless-rpkg/ucidata, http://r-pkg.thecoatlessprofessor.com/ucidata/ 12 | BugReports: https://github.com/coatless-rpkg/ucidata/issues 13 | Encoding: UTF-8 14 | LazyData: true 15 | Roxygen: list(markdown = TRUE) 16 | RoxygenNote: 7.2.3 17 | -------------------------------------------------------------------------------- /cran-comments.md: -------------------------------------------------------------------------------- 1 | ## Test environments 2 | 3 | - local OS X install, R 3.4.3 4 | - ubuntu 12.04 (on travis-ci), R 3.4.3 5 | - win-builder (devel and release) 6 | 7 | ## R CMD check results 8 | 9 | 0 errors | 0 warnings | 1 note 10 | 11 | Possibly mis-spelled words in DESCRIPTION: 12 | UC (2:40, 6:66) 13 | 14 | Found the following (possibly) invalid URLs: 15 | URL: http://www.r-pkg.org/pkg/ucidata (moved to https://www.r-pkg.org:443/pkg/ucidata) 16 | From: README.md 17 | Status: 404 18 | Message: Not Found 19 | 20 | - This is a new release of a data package. As a result, the `r-pkg.org` link 21 | included in the readme has yet to be created. Once approved, this link will 22 | become active. 23 | - `UC` is the acronym for the University of California network of schools. 24 | 25 | ## Reverse dependencies 26 | 27 | This is a new release, so there are no reverse dependencies. 
28 | -------------------------------------------------------------------------------- /man/ucidata-package.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/ucidata.R 3 | \docType{package} 4 | \name{ucidata-package} 5 | \alias{ucidata} 6 | \alias{ucidata-package} 7 | \title{ucidata: Collection of Datasets from the UC Irvine Machine Learning Repository} 8 | \description{ 9 | Select datasets from the UC Irvine Machine Learning Repository that conform to being reasonable in size while also allowing for a wide variety of visualizations and models to be formed. 10 | } 11 | \seealso{ 12 | Useful links: 13 | \itemize{ 14 | \item \url{https://github.com/coatless-rpkg/ucidata} 15 | \item \url{http://r-pkg.thecoatlessprofessor.com/ucidata/} 16 | \item Report bugs at \url{https://github.com/coatless-rpkg/ucidata/issues} 17 | } 18 | 19 | } 20 | \author{ 21 | \strong{Maintainer}: James Balamuta \email{balamut2@illinois.edu} 22 | 23 | } 24 | \keyword{internal} 25 | -------------------------------------------------------------------------------- /data-raw/adult_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | # Adult data https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data 3 | 4 | adult = read.csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', 5 | na.strings = "?", fill = FALSE, strip.white = TRUE, header = FALSE) 6 | 7 | colnames(adult) = c('age', 8 | 'workclass', 9 | 'fnlwgt', 10 | 'education', 11 | 'education_num', 12 | 'marital_status', 13 | 'occupation', 14 | 'relationship', 15 | 'race', 16 | 'sex', 17 | 'capital_gain', 18 | 'capital_loss', 19 | 'hours_per_week', 20 | 'native_country', 21 | 'income') 22 | 23 | usethis::use_data(adult, overwrite = TRUE) 24 | -------------------------------------------------------------------------------- 
/data-raw/bridges_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | # Pittsburgh Bridges Data https://archive.ics.uci.edu/ml/datasets/Pittsburgh+Bridges 3 | 4 | url_bridges = "https://archive.ics.uci.edu/ml/machine-learning-databases/bridges/bridges.data.version1" 5 | 6 | bridges = read.csv(url_bridges, 7 | header = FALSE, na.strings = "?") 8 | 9 | # Columns taken verbatim from ML page 10 | # Regex search with: [0-9]{1,2}\. (.*) / .* / .* / .* 11 | # Replacement: "\1", 12 | var_names = c( 13 | "IDENTIF", 14 | "RIVER", 15 | "LOCATION", 16 | "ERECTED", 17 | "PURPOSE", 18 | "LENGTH", 19 | "LANES", 20 | "CLEAR-G", 21 | "T-OR-D", 22 | "MATERIAL", 23 | "SPAN", 24 | "REL-L", 25 | "TYPE" 26 | ) 27 | 28 | # Label columns 29 | colnames(bridges) = gsub("-", "_", tolower(var_names)) 30 | 31 | # Switch from numeric to factor: 32 | bridges = within(bridges, { 33 | lanes = factor(lanes) 34 | }) 35 | 36 | usethis::use_data(bridges, overwrite = TRUE) 37 | 38 | -------------------------------------------------------------------------------- /data-raw/wine_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## Wine Data https://archive.ics.uci.edu/ml/datasets/wine 3 | 4 | # Location of Data Sets 5 | red_wine_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv" 6 | white_wine_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv" 7 | 8 | # Note the .csv uses `;` as the separator, not `,` 9 | red_wine_data = read.csv(red_wine_url, sep = ";", header = TRUE) 10 | white_wine_data = read.csv(white_wine_url, sep = ";", header = TRUE) 11 | 12 | # Load in Red vs. 
White Data 13 | red_wine_data$color = "Red" 14 | white_wine_data$color = "White" 15 | 16 | # Merge the two data sets together 17 | wine = rbind(red_wine_data, white_wine_data) 18 | 19 | # Convert color into a factor 20 | wine$color = as.factor(wine$color) 21 | 22 | # Remove periods 23 | colnames(wine) = gsub("\\.", "_", colnames(wine)) 24 | 25 | usethis::use_data(wine, overwrite = TRUE) 26 | -------------------------------------------------------------------------------- /R/car_eval_docs.R: -------------------------------------------------------------------------------- 1 | #' Car Evaluation Data Set 2 | #' 3 | #' Car Evaluation Database was derived from a simple hierarchical decision model 4 | #' originally developed for the demonstration of DEX. 5 | #' 6 | #' @format A data frame with 1728 observations on the following 7 variables. 7 | #' - `buying` 8 | #' - vhigh, high, med, low. 9 | #' - `maint` 10 | #' - vhigh, high, med, low. 11 | #' - `doors` 12 | #' - 2, 3, 4, 5more. 13 | #' - `persons` 14 | #' - 2, 4, more. 15 | #' - `lug_boot` 16 | #' - small, med, big. 17 | #' - `safety` 18 | #' - low, med, high. 19 | #' - `class_value` 20 | #' - unacc, acc, good, vgood 21 | #' @source 22 | #' Marko Bohanec (marko.bohanec '@' ijs.si) 23 | #' Blaz Zupan (blaz.zupan '@' ijs.si) 24 | #' @references 25 | #' M. Bohanec, V. Rajkovic: Expert system for decision making. Sistemica 1(1), pp. 145-157, 1990. 26 | #' <https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data> 27 | #' <https://archive.ics.uci.edu/ml/datasets/Car+Evaluation> 28 | "car_eval" 29 | -------------------------------------------------------------------------------- /R/autompg_docs.R: -------------------------------------------------------------------------------- 1 | #' Autompg Data Set 2 | #' 3 | #' This dataset is a slightly modified version of the dataset provided in 4 | #' the StatLib library. In line with the use by Ross Quinlan (1993) in 5 | #' predicting the attribute "mpg", 8 of the original instances were removed 6 | #' because they had unknown values for the "mpg" attribute. 
 7 | #' @format A data frame with 398 observations on the following 9 variables. 8 | #' - `mpg`: continuous 9 | #' - `cylinders`: multi-valued discrete 10 | #' - `displacement`: continuous 11 | #' - `horsepower`: continuous 12 | #' - `weight`: continuous 13 | #' - `acceleration`: continuous 14 | #' - `model_year`: multi-valued discrete 15 | #' - `origin`: multi-valued discrete 16 | #' - `car_name`: string (unique for each instance) 17 | #' @source This dataset was taken from the StatLib library which is 18 | #' maintained at Carnegie Mellon University. The dataset was 19 | #' used in the 1983 American Statistical Association Exposition. 20 | #' @references 21 | #' <https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names> 22 | #' <https://archive.ics.uci.edu/ml/datasets/auto+mpg> 23 | "autompg" 24 | -------------------------------------------------------------------------------- /data-raw/autoimports_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | # Automobile (Imports) data https://archive.ics.uci.edu/ml/datasets/Automobile 3 | 4 | url_autoimports = "http://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data" 5 | 6 | autoimports = read.csv(url_autoimports, 7 | header = FALSE, na.strings = "?") 8 | 9 | # Label columns 10 | # Columns taken verbatim from ML page 11 | # Regex search with: [0-9]{1,2}\. 
(.*):.* 12 | # Replacement: "\1", 13 | 14 | var_names = c( 15 | "symboling", 16 | "normalized-losses", 17 | "make", 18 | "fuel-type", 19 | "aspiration", 20 | "num-of-doors", 21 | "body-style", 22 | "drive-wheels", 23 | "engine-location", 24 | "wheel-base", 25 | "length", 26 | "width", 27 | "height", 28 | "curb-weight", 29 | "engine-type", 30 | "num-of-cylinders", 31 | "engine-size", 32 | "fuel-system", 33 | "bore", 34 | "stroke", 35 | "compression-ratio", 36 | "horsepower", 37 | "peak-rpm", 38 | "city-mpg", 39 | "highway-mpg", 40 | "price" 41 | ) 42 | 43 | var_names_safe = gsub("-", "_", var_names) 44 | 45 | colnames(autoimports) = var_names_safe 46 | 47 | usethis::use_data(autoimports, overwrite = TRUE) 48 | 49 | 50 | 51 | -------------------------------------------------------------------------------- /data-raw/glass_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | ## Glass Data https://archive.ics.uci.edu/ml/datasets/Glass+Identification 3 | 4 | url_glass = "https://archive.ics.uci.edu/ml/machine-learning-databases/glass/glass.data" 5 | 6 | glass = read.csv(url_glass, header = FALSE) 7 | 8 | # Columns taken verbatim from ML page 9 | # Regex search with: [0-9]{1,2}\. 
(.*) / .* / .* / .* 10 | # Replacement: "\1", 11 | var_names = c("ID", 12 | "RI", 13 | "Na", 14 | "Mg", 15 | "Al", 16 | "Si", 17 | "K", 18 | "Ca", 19 | "Ba", 20 | "Fe", 21 | "Type") 22 | 23 | # Label column names 24 | colnames(glass) = var_names 25 | 26 | glass = within(glass, { 27 | Type = factor(Type, labels = c( 28 | "building_windows_float_processed", 29 | "building_windows_non_float_processed", 30 | "vehicle_windows_float_processed", 31 | # "vehicle_windows_non_float_processed", # none in dataset 32 | "containers", 33 | "tableware", 34 | "headlamps" 35 | )) 36 | }) 37 | 38 | # Save dataset 39 | usethis::use_data(glass, overwrite = TRUE) 40 | -------------------------------------------------------------------------------- /R/bridges_docs.R: -------------------------------------------------------------------------------- 1 | #' Pittsburgh Bridges Data Set 2 | #' 3 | #' Data containing examples of Pittsburgh bridges and the relevant surrounding 4 | #' area. 5 | #' 6 | #' @format A data frame with 108 observations on the following 13 variables. 7 | #' - `identif` 8 | #' - identifier of the examples 9 | #' - `river` 10 | #' - A, M, O, Y 11 | #' - `location` 12 | #' - Location of Bridge 13 | #' - `erected` 14 | #' - Year built 15 | #' - `purpose` 16 | #' - WALK, AQUEDUCT, RR, HIGHWAY 17 | #' - `length` 18 | #' - 804 - 4558 19 | #' - `lanes` 20 | #' - 1, 2, 4, 6 21 | #' - `clear_g` 22 | #' - N, G 23 | #' - `t_or_d` 24 | #' - THROUGH, DECK 25 | #' - `material` 26 | #' - WOOD, IRON, STEEL 27 | #' - `span` 28 | #' - SHORT, MEDIUM, LONG 29 | #' - `rel_l` 30 | #' - S, S-F, F 31 | #' - `type` 32 | #' - WOOD, SUSPEN, SIMPLE-T, ARCH, CANTILEV, CONT-T 33 | #' @details 34 | #' This data set is non-discretized, meaning the numeric properties 35 | #' were left intact. 36 | #' @source 37 | #' Yoram Reich & Steven J. 
Fenves 38 | #' Department of Civil Engineering 39 | #' and 40 | #' Engineering Design Research Center 41 | #' Carnegie Mellon University 42 | #' Pittsburgh, PA 15213 43 | #' @references 44 | #' 45 | #' 46 | "bridges" 47 | -------------------------------------------------------------------------------- /man/car_eval.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/car_eval_docs.R 3 | \docType{data} 4 | \name{car_eval} 5 | \alias{car_eval} 6 | \title{Car Evaluation Data Set} 7 | \format{ 8 | A data frame with 1728 observations on the following 7 variables. 9 | \itemize{ 10 | \item \code{buying} 11 | \itemize{ 12 | \item vhigh, high, med, low. 13 | } 14 | \item \code{maint} 15 | \itemize{ 16 | \item vhigh, high, med, low. 17 | } 18 | \item \code{doors} 19 | \itemize{ 20 | \item 2, 3, 4, 5more. 21 | } 22 | \item \code{persons} 23 | \itemize{ 24 | \item 2, 4, more. 25 | } 26 | \item \code{lug_boot} 27 | \itemize{ 28 | \item small, med, big. 29 | } 30 | \item \code{safety} 31 | \itemize{ 32 | \item low, med, high. 33 | } 34 | \item \code{class_value} 35 | \itemize{ 36 | \item unacc, acc, good, vgood 37 | } 38 | } 39 | } 40 | \source{ 41 | Marko Bohanec (marko.bohanec '@' ijs.si) 42 | Blaz Zupan (blaz.zupan '@' ijs.si) 43 | } 44 | \usage{ 45 | car_eval 46 | } 47 | \description{ 48 | Car Evaluation Database was derived from a simple hierarchical decision model 49 | originally developed for the demonstration of DEX. 50 | } 51 | \references{ 52 | M. Bohanec, V. Rajkovic: Expert system for decision making. Sistemica 1(1), pp. 145-157, 1990.) 
 53 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/car/car.data} 54 | \url{https://archive.ics.uci.edu/ml/datasets/Car+Evaluation} 55 | } 56 | \keyword{datasets} 57 | -------------------------------------------------------------------------------- /R/glass_docs.R: -------------------------------------------------------------------------------- 1 | #' Glass Identification Data Set 2 | #' 3 | #' The study of classification of types of glass was motivated by criminological 4 | #' investigation. At the scene of the crime, the glass left can be used as 5 | #' evidence...if it is correctly identified! 6 | #' 7 | #' @format A data frame with 214 observations on the following 11 variables. 8 | #' - `ID`: 1 to 214 9 | #' - `RI`: refractive index 10 | #' - `Na`: Sodium (unit measurement: weight percent in corresponding oxide, as are attributes 4-10) 11 | #' - `Mg`: Magnesium 12 | #' - `Al`: Aluminum 13 | #' - `Si`: Silicon 14 | #' - `K`: Potassium 15 | #' - `Ca`: Calcium 16 | #' - `Ba`: Barium 17 | #' - `Fe`: Iron 18 | #' - `Type`: Class attribute 19 | #' - 1: building_windows_float_processed 20 | #' - 2: building_windows_non_float_processed 21 | #' - 3: vehicle_windows_float_processed 22 | #' - 4: vehicle_windows_non_float_processed (none in this database) 23 | #' - 5: containers 24 | #' - 6: tableware 25 | #' - 7: headlamps 26 | #' @source 27 | #' 28 | #' B. 
German 29 | #' Central Research Establishment 30 | #' Home Office Forensic Science Service 31 | #' Aldermaston, Reading, Berkshire RG7 4PN 32 | #' 33 | #' Vina Spiehler, Ph.D., DABFT 34 | #' Diagnostic Products Corporation 35 | #' (213) 776-0180 (ext 3014) 36 | #' 37 | #' @references 38 | #' 39 | #' 40 | "glass" 41 | -------------------------------------------------------------------------------- /man/autompg.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autompg_docs.R 3 | \docType{data} 4 | \name{autompg} 5 | \alias{autompg} 6 | \title{Autompg Data Set} 7 | \format{ 8 | A data frame with 398 observations on the following 9 variables. 9 | \itemize{ 10 | \item \code{mpg}: continuous 11 | \item \code{cylinders}: multi-valued discrete 12 | \item \code{displacement}: continuous 13 | \item \code{horsepower}: continuous 14 | \item \code{weight}: continuous 15 | \item \code{acceleration}: continuous 16 | \item \code{model_year}: multi-valued discrete 17 | \item \code{origin}: multi-valued discrete 18 | \item \code{car_name}: string (unique for each instance) 19 | } 20 | } 21 | \source{ 22 | This dataset was taken from the StatLib library which is 23 | maintained at Carnegie Mellon University. The dataset was 24 | used in the 1983 American Statistical Association Exposition. 25 | } 26 | \usage{ 27 | autompg 28 | } 29 | \description{ 30 | This dataset is a slightly modified version of the dataset provided in 31 | the StatLib library. In line with the use by Ross Quinlan (1993) in 32 | predicting the attribute "mpg", 8 of the original instances were removed 33 | because they had unknown values for the "mpg" attribute. 
34 | } 35 | \references{ 36 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.names} 37 | \url{https://archive.ics.uci.edu/ml/datasets/auto+mpg} 38 | } 39 | \keyword{datasets} 40 | -------------------------------------------------------------------------------- /data-raw/hepatitis_build.R: -------------------------------------------------------------------------------- 1 | ### UCI Irvine 2 | # Hepatitis Data http://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data 3 | 4 | url_hepatitis = "http://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data" 5 | 6 | hepatitis = read.csv( 7 | url_hepatitis, 8 | header = FALSE, na.strings = "?" 9 | ) 10 | 11 | # Columns taken verbatim from ML page 12 | # Regex search with: [0-9]{1,2}\. (.*):.* 13 | # Replacement: "\1", 14 | var_names = c( 15 | "Class", 16 | "AGE", 17 | "SEX", 18 | "STEROID", 19 | "ANTIVIRALS", 20 | "FATIGUE", 21 | "MALAISE", 22 | "ANOREXIA", 23 | "LIVER BIG", 24 | "LIVER FIRM", 25 | "SPLEEN PALPABLE", 26 | "SPIDERS", 27 | "ASCITES", 28 | "VARICES", 29 | "BILIRUBIN", 30 | "ALK PHOSPHATE", 31 | "SGOT", 32 | "ALBUMIN", 33 | "PROTIME", 34 | "HISTOLOGY" 35 | ) 36 | 37 | var_names_safe = gsub("[[:space:]]", "_", var_names) 38 | 39 | # Label columns 40 | colnames(hepatitis) = tolower(var_names_safe) 41 | 42 | # Make into a dichotomous variable marked by a factor 43 | hepatitis[, c(4:14, 20)] = lapply(hepatitis[, c(4:14, 20)], factor, labels = c("No", "Yes")) 44 | 45 | # Switch to being factor based 46 | hepatitis = within(hepatitis,{ 47 | class = factor(class, labels = c("Die", "Live")) 48 | sex = factor(sex, labels = c("Male", "Female")) 49 | }) 50 | 51 | usethis::use_data(hepatitis, overwrite = TRUE) 52 | 53 | ## output colnames 54 | cat(paste0(colnames(hepatitis),"\n"), sep="") 55 | -------------------------------------------------------------------------------- /.github/workflows/pkgdown.yaml: 
-------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? Start at https://github.com/r-lib/actions#where-to-find-help 3 | on: 4 | push: 5 | branches: [main, master] 6 | pull_request: 7 | branches: [main, master] 8 | release: 9 | types: [published] 10 | workflow_dispatch: 11 | 12 | name: pkgdown 13 | 14 | jobs: 15 | pkgdown: 16 | runs-on: ubuntu-latest 17 | # Only restrict concurrency for non-PR jobs 18 | concurrency: 19 | group: pkgdown-${{ github.event_name != 'pull_request' || github.run_id }} 20 | env: 21 | GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} 22 | permissions: 23 | contents: write 24 | steps: 25 | - uses: actions/checkout@v3 26 | 27 | - uses: r-lib/actions/setup-pandoc@v2 28 | 29 | - uses: r-lib/actions/setup-r@v2 30 | with: 31 | use-public-rspm: true 32 | 33 | - uses: r-lib/actions/setup-r-dependencies@v2 34 | with: 35 | extra-packages: any::pkgdown, local::. 36 | needs: website 37 | 38 | - name: Build site 39 | run: pkgdown::build_site_github_pages(new_process = FALSE, install = FALSE) 40 | shell: Rscript {0} 41 | 42 | - name: Deploy to GitHub pages 🚀 43 | if: github.event_name != 'pull_request' 44 | uses: JamesIves/github-pages-deploy-action@v4.4.1 45 | with: 46 | clean: false 47 | branch: gh-pages 48 | folder: docs 49 | -------------------------------------------------------------------------------- /R/hepatitis_docs.R: -------------------------------------------------------------------------------- 1 | #' Hepatitis Data Set 2 | #' 3 | #' This data set contains information on folks that suffer from hepatitis. 4 | #' 5 | #' @format A data frame with 155 observations on the following 20 variables. 
6 | #' - `class` 7 | #' - Die or Live 8 | #' - `age` 9 | #' - Integer 10 | #' - `sex` 11 | #' - Male, Female 12 | #' - `steroid` 13 | #' - No, Yes 14 | #' - `antivirals` 15 | #' - No, Yes 16 | #' - `fatigue` 17 | #' - No, Yes 18 | #' - `malaise` 19 | #' - No, Yes 20 | #' - `anorexia` 21 | #' - No, Yes 22 | #' - `liver_big` 23 | #' - No, Yes 24 | #' - `liver_firm` 25 | #' - No, Yes 26 | #' - `spleen_palpable` 27 | #' - No, Yes 28 | #' - `spiders` 29 | #' - No, Yes 30 | #' - `ascites` 31 | #' - No, Yes 32 | #' - `varices` 33 | #' - No, Yes 34 | #' - `bilirubin` 35 | #' - Numeric 36 | #' - This can also be treated as a factor 37 | #' - `alk_phosphate` 38 | #' - Integer 39 | #' - `sgot` 40 | #' - Integer 41 | #' - `albumin` 42 | #' - Numeric 43 | #' - `protime` 44 | #' - Integer 45 | #' - `histology` 46 | #' - No, Yes 47 | #' @source 48 | #' G.Gong (Carnegie-Mellon University) via 49 | #' Bojan Cestnik 50 | #' Jozef Stefan Institute 51 | #' Jamova 39 52 | #' 61000 Ljubljana 53 | #' Yugoslavia (tel.: (38)(+61) 214-399 ext.287) 54 | #' 55 | #' @references 56 | #' 57 | #' 58 | "hepatitis" 59 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | # Workflow derived from https://github.com/r-lib/actions/tree/v2/examples 2 | # Need help debugging build failures? 
### UCI Irvine
## Breast Cancer Wisconsin (Original) Data https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)

url_breast_cancer = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data"

# Read the raw file straight from the repository.
breast_cancer_wis_data = read.csv(url_breast_cancer,
                                  header = FALSE,           # No header
                                  na.strings = "?",         # NA strings are `?` in the data (~16)
                                  stringsAsFactors = FALSE)

# Column names, in the order the columns appear in the raw file.
colnames(breast_cancer_wis_data) = c("sample_code_number",
                                     "clump_thickness",
                                     "uniformity_of_cell_size",
                                     "uniformity_of_cell_shape",
                                     "marginal_adhesion",
                                     "single_epithelial_cell_size",
                                     "bare_nuclei",
                                     "bland_chromatin",
                                     "normal_nucleoli",
                                     "mitoses",
                                     "class")

# Recode the class column as a labelled factor.
# The result of factor() must be assigned back to `class`: a bare
# `factor(class, ...)` expression inside within() is evaluated and then
# discarded, leaving the column untouched.
# Levels are spelled out (2 = benign, 4 = malignant) so the label mapping does
# not depend on which codes happen to be present in the download.
breast_cancer_wis_data = within(breast_cancer_wis_data, {
  class = factor(class, levels = c(2, 4), labels = c("benign", "malignant"))
})

bcw_original = breast_cancer_wis_data

# Drop the intermediate object so only the exported data set remains.
rm(list = "breast_cancer_wis_data")

# Write the bcw_original dataset
usethis::use_data(bcw_original, overwrite = TRUE)
#' Wine Data Set
#'
#' This data set is the combination of two datasets that were created, using red and white wine samples.
#' The inputs include objective tests (e.g. pH values) and the output is based on sensory data
#' (median of at least 3 evaluations made by wine experts). Each expert graded the wine quality
#' between 0 (very bad) and 10 (very excellent). Several data mining methods were applied to model
#' these datasets under a regression approach. The support vector machine model achieved the
#' best results. Several metrics were computed: MAD, confusion matrix for a fixed error tolerance (T),
#' etc. Also, we plot the relative importances of the input variables (as measured by a sensitivity
#' analysis procedure).
#' @format A data frame with 6497 observations (1599 Red and 4898 White) on the following 13 variables.
#' - fixed acidity
#' - volatile acidity
#' - citric acid
#' - residual sugar
#' - chlorides
#' - free sulfur dioxide
#' - total sulfur dioxide
#' - density
#' - pH
#' - sulphates
#' - alcohol
#' - quality
#'   - Score between 0 and 10 based on sensory data (expert evaluations)
#' - color
#'   - `"White"` or `"Red"`
#' @source P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
#' Modeling wine preferences by data mining from physicochemical properties.
#' In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236.
#' @references
#' <https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names>
#' <https://archive.ics.uci.edu/ml/datasets/Wine+Quality>
"wine"
#' Breast Cancer Wisconsin (Original) Data Set
#'
#' Samples arrive periodically as Dr. Wolberg reports his clinical cases.
#' The database therefore reflects this chronological grouping of the data.
#'
#' @format A data frame with 699 observations on the following 11 variables.
#' - `sample_code_number`: id number
#' - `clump_thickness`: 1 - 10
#' - `uniformity_of_cell_size`: 1 - 10
#' - `uniformity_of_cell_shape`: 1 - 10
#' - `marginal_adhesion`: 1 - 10
#' - `single_epithelial_cell_size`: 1 - 10
#' - `bare_nuclei`: 1 - 10
#' - `bland_chromatin`: 1 - 10
#' - `normal_nucleoli`: 1 - 10
#' - `mitoses`: 1 - 10
#' - `class`: 2 for benign, 4 for malignant
#'
#' @source
#' Dr. William H. Wolberg - Physician
#' University of Wisconsin Hospitals
#' Madison, Wisconsin, USA
#'
#' @references
#' <https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data>
#'
#' @details
#' This grouping information appears immediately below, having been removed from the data itself:
#'
#' \tabular{rrr}{
#' Group \tab Instances \tab Date of Collection\cr
#' 1 \tab 367 \tab January 1989\cr
#' 2 \tab 70 \tab October 1989\cr
#' 3 \tab 31 \tab February 1990\cr
#' 4 \tab 17 \tab April 1990\cr
#' 5 \tab 48 \tab August 1990\cr
#' 6 \tab 49 \tab Updated January 1991\cr
#' 7 \tab 31 \tab June 1991\cr
#' 8 \tab 86 \tab November 1991\cr
#' Total \tab 699 points \tab 15 July 1992
#' }
#'
#' Note that the results summarized in the "Past Usage" section of the
#' original UCI documentation refer to a dataset of size 369, while Group 1
#' has only 367 instances. This is because it originally contained 369
#' instances; 2 were removed.
"bcw_original"
9 | \itemize{ 10 | \item \code{identif} 11 | \itemize{ 12 | \item identifier of the examples 13 | } 14 | \item \code{river} 15 | \itemize{ 16 | \item A, M, O, Y 17 | } 18 | \item \code{location} 19 | \itemize{ 20 | \item Location of Bridge 21 | } 22 | \item \code{erected} 23 | \itemize{ 24 | \item Year built 25 | } 26 | \item \code{purpose} 27 | \itemize{ 28 | \item WALK, AQUEDUCT, RR, HIGHWAY 29 | } 30 | \item \code{length} 31 | \itemize{ 32 | \item 804 - 4558 33 | } 34 | \item \code{lanes} 35 | \itemize{ 36 | \item 1, 2, 4, 6 37 | } 38 | \item \code{clear_g} 39 | \itemize{ 40 | \item N, G 41 | } 42 | \item \code{t_or_d} 43 | \itemize{ 44 | \item THROUGH, DECK 45 | } 46 | \item \code{material} 47 | \itemize{ 48 | \item WOOD, IRON, STEEL 49 | } 50 | \item \code{span} 51 | \itemize{ 52 | \item SHORT, MEDIUM, LONG 53 | } 54 | \item \code{rel_l} 55 | \itemize{ 56 | \item S, S-F, F 57 | } 58 | \item \code{type} 59 | \itemize{ 60 | \item WOOD, SUSPEN, SIMPLE-T, ARCH, CANTILEV, CONT-T 61 | } 62 | } 63 | } 64 | \source{ 65 | Yoram Reich & Steven J. Fenves 66 | Department of Civil Engineering 67 | and 68 | Engineering Design Research Center 69 | Carnegie Mellon University 70 | Pittsburgh, PA 15213 71 | } 72 | \usage{ 73 | bridges 74 | } 75 | \description{ 76 | Data containing examples of Pittsburgh bridges and the relevant surrounding 77 | area. 78 | } 79 | \details{ 80 | This data set is non-discretized, meaning the numeric properties 81 | were left intact. 
82 | } 83 | \references{ 84 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/bridges/bridges.data.version1} 85 | \url{https://archive.ics.uci.edu/ml/datasets/Pittsburgh+Bridges} 86 | } 87 | \keyword{datasets} 88 | -------------------------------------------------------------------------------- /man/wine.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/wine_docs.R 3 | \docType{data} 4 | \name{wine} 5 | \alias{wine} 6 | \title{Wine Data Set} 7 | \format{ 8 | A data frame with 6497 observations (1599 Red and 4898 White) on the following 12 variables. 9 | \itemize{ 10 | \item fixed acidity 11 | \item volatile acidity 12 | \item citric acid 13 | \item residual sugar 14 | \item chlorides 15 | \item free sulfur dioxide 16 | \item total sulfur dioxide 17 | \item density 18 | \item pH 19 | \item sulphates 20 | \item alcohol 21 | \item quality 22 | \itemize{ 23 | \item Score between 0 and 10 based on sensor reading 24 | } 25 | \item color 26 | \itemize{ 27 | \item \code{"White"} or \code{"Red"} 28 | } 29 | } 30 | } 31 | \source{ 32 | P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. 33 | Modeling wine preferences by data mining from physicochemical properties. 34 | In Decision Support Systems, Elsevier, 47(4):547-553. ISSN: 0167-9236. 35 | } 36 | \usage{ 37 | wine 38 | } 39 | \description{ 40 | This data set is the combination of two datasets that were created, using red and white wine samples. 41 | The inputs include objective tests (e.g. PH values) and the output is based on sensory data 42 | (median of at least 3 evaluations made by wine experts). Each expert graded the wine quality 43 | between 0 (very bad) and 10 (very excellent). Several data mining methods were applied to model 44 | these datasets under a regression approach. The support vector machine model achieved the 45 | best results. 
Several metrics were computed: MAD, confusion matrix for a fixed error tolerance (T), 46 | etc. Also, we plot the relative importances of the input variables (as measured by a sensitivity 47 | analysis procedure). 48 | } 49 | \references{ 50 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality.names} 51 | \url{https://archive.ics.uci.edu/ml/datasets/Wine+Quality} 52 | } 53 | \keyword{datasets} 54 | -------------------------------------------------------------------------------- /R/abalone_docs.R: -------------------------------------------------------------------------------- 1 | #' Abalone Data Set 2 | #' 3 | #' Predicting the age of abalone from physical measurements. The age of abalone 4 | #' is determined by cutting the shell through the cone, staining it, and 5 | #' counting the number of rings through a microscope -- a boring and 6 | #' time-consuming task. Other measurements, which are easier to obtain, are 7 | #' used to predict the age. Further information, such as weather patterns and 8 | #' location (hence food availability) may be required to solve the problem. 9 | #' 10 | #' @format A data frame with 4177 observations on the following 9 variables. 
#' Forest Fires Data Set
#'
#' The aim is to predict the burned area of forest fires, in the northeast
#' region of Portugal, by using meteorological and other data.
#'
#' @format A data frame with 517 observations on the following 13 variables.
#' - `X`
#'   - x-axis spatial coordinate within the Montesinho park map: 1 to 9
#' - `Y`
#'   - y-axis spatial coordinate within the Montesinho park map: 2 to 9
#' - `month`
#'   - month of the year: "jan" to "dec"
#' - `day`
#'   - day of the week: "mon" to "sun"
#' - `FFMC`
#'   - FFMC index from the FWI system: 18.7 to 96.20
#' - `DMC`
#'   - DMC index from the FWI system: 1.1 to 291.3
#' - `DC`
#'   - DC index from the FWI system: 7.9 to 860.6
#' - `ISI`
#'   - ISI index from the FWI system: 0.0 to 56.10
#' - `temp`
#'   - temperature in Celsius degrees: 2.2 to 33.30
#' - `RH`
#'   - relative humidity in %: 15.0 to 100
#' - `wind`
#'   - wind speed in km/h: 0.40 to 9.40
#' - `rain`
#'   - outside rain in mm/m2 : 0.0 to 6.4
#' - `area`
#'   - the burned area of the forest (in ha): 0.00 to 1090.84
#'
#' @source
#' Paulo Cortez, pcortez '@' dsi.uminho.pt, Department of Information Systems, University of Minho, Portugal.
#' Aníbal Morais, araimorais '@' gmail.com, Department of Information Systems, University of Minho, Portugal.
#' @references
#' [P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado Eds., New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9](http://www.dsi.uminho.pt/~pcortez/fires.pdf)
#' <https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv>
#' <https://archive.ics.uci.edu/ml/datasets/Forest+Fires>
"forest_fires"
33 | } 34 | \details{ 35 | This grouping information appears immediately below, having been removed from the data itself: 36 | 37 | \tabular{rrr}{ 38 | Group \tab Instances \tab Date of Collection\cr 39 | 1 \tab 367 \tab January 1989\cr 40 | 2 \tab 70 \tab October 1989\cr 41 | 3 \tab 31 \tab February 1990\cr 42 | 4 \tab 17 \tab April 1990\cr 43 | 5 \tab 48 \tab August 1990\cr 44 | 6 \tab 49 \tab Updated January 1991\cr 45 | 7 \tab 31 \tab June 1991\cr 46 | 8 \tab 86 \tab November 1991\cr 47 | Total \tab 699 points \tab 15 July 1992 48 | } 49 | 50 | Note that the results summarized above in Past Usage refer to a dataset of 51 | size 369, while Group 1 has only 367 instances. This is because it 52 | originally contained 369 instances; 2 were removed. 53 | } 54 | \references{ 55 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/breast-cancer-wisconsin.data} 56 | } 57 | \keyword{datasets} 58 | -------------------------------------------------------------------------------- /man/hepatitis.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/hepatitis_docs.R 3 | \docType{data} 4 | \name{hepatitis} 5 | \alias{hepatitis} 6 | \title{Hepatitis Data Set} 7 | \format{ 8 | A data frame with 155 observations on the following 20 variables. 
9 | \itemize{ 10 | \item \code{class} 11 | \itemize{ 12 | \item Die or Live 13 | } 14 | \item \code{age} 15 | \itemize{ 16 | \item Integer 17 | } 18 | \item \code{sex} 19 | \itemize{ 20 | \item Male, Female 21 | } 22 | \item \code{steroid} 23 | \itemize{ 24 | \item No, Yes 25 | } 26 | \item \code{antivirals} 27 | \itemize{ 28 | \item No, Yes 29 | } 30 | \item \code{fatigue} 31 | \itemize{ 32 | \item No, Yes 33 | } 34 | \item \code{malaise} 35 | \itemize{ 36 | \item No, Yes 37 | } 38 | \item \code{anorexia} 39 | \itemize{ 40 | \item No, Yes 41 | } 42 | \item \code{liver_big} 43 | \itemize{ 44 | \item No, Yes 45 | } 46 | \item \code{liver_firm} 47 | \itemize{ 48 | \item No, Yes 49 | } 50 | \item \code{spleen_palpable} 51 | \itemize{ 52 | \item No, Yes 53 | } 54 | \item \code{spiders} 55 | \itemize{ 56 | \item No, Yes 57 | } 58 | \item \code{ascites} 59 | \itemize{ 60 | \item No, Yes 61 | } 62 | \item \code{varices} 63 | \itemize{ 64 | \item No, Yes 65 | } 66 | \item \code{bilirubin} 67 | \itemize{ 68 | \item Numeric 69 | \item This can also be treated as a factor 70 | } 71 | \item \code{alk_phosphate} 72 | \itemize{ 73 | \item Integer 74 | } 75 | \item \code{sgot} 76 | \itemize{ 77 | \item Integer 78 | } 79 | \item \code{albumin} 80 | \itemize{ 81 | \item Numeric 82 | } 83 | \item \code{protime} 84 | \itemize{ 85 | \item Integer 86 | } 87 | \item \code{histology} 88 | \itemize{ 89 | \item No, Yes 90 | } 91 | } 92 | } 93 | \source{ 94 | G.Gong (Carnegie-Mellon University) via 95 | Bojan Cestnik 96 | Jozef Stefan Institute 97 | Jamova 39 98 | 61000 Ljubljana 99 | Yugoslavia (tel.: (38)(+61) 214-399 ext.287) 100 | } 101 | \usage{ 102 | hepatitis 103 | } 104 | \description{ 105 | This data set contains information on folks that suffer from hepatitis. 
106 | } 107 | \references{ 108 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/hepatitis/hepatitis.data} 109 | \url{https://archive.ics.uci.edu/ml/datasets/hepatitis} 110 | } 111 | \keyword{datasets} 112 | -------------------------------------------------------------------------------- /man/abalone.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/abalone_docs.R 3 | \docType{data} 4 | \name{abalone} 5 | \alias{abalone} 6 | \title{Abalone Data Set} 7 | \format{ 8 | A data frame with 4177 observations on the following 9 variables. 9 | \itemize{ 10 | \item \code{sex}: Factor 11 | \itemize{ 12 | \item \code{M} (Male), \code{F} (Female), and \code{I} (Infant) 13 | } 14 | \item \code{length}: Numeric 15 | \itemize{ 16 | \item Longest shell measurement (mm) 17 | } 18 | \item \code{diameter}: Numeric 19 | \itemize{ 20 | \item Perpendicular to length (mm) 21 | } 22 | \item \code{height}: Numeric 23 | \itemize{ 24 | \item With meat in shell (mm) 25 | } 26 | \item \code{whole_weight}: Numeric 27 | \itemize{ 28 | \item Whole abalone weight (grams) 29 | } 30 | \item \code{shucked_weight}: Numeric 31 | \itemize{ 32 | \item Weight of meat (grams) 33 | } 34 | \item \code{viscera_weight}: Numeric 35 | \itemize{ 36 | \item Gut weight after bleeding (grams) 37 | } 38 | \item \code{shell_weight}: Numeric 39 | \itemize{ 40 | \item Shell weight after being dried (grams) 41 | } 42 | \item \code{rings}: Integer 43 | \itemize{ 44 | \item Adding 1.5 gives the age in years 45 | } 46 | } 47 | } 48 | \source{ 49 | Marine Resources Division 50 | Marine Research Laboratories - Taroona 51 | Department of Primary Industry and Fisheries, Tasmania 52 | GPO Box 619F, Hobart, Tasmania 7001, Australia 53 | (contact: Warwick Nash +61 02 277277, wnash '@' dpi.tas.gov.au) 54 | } 55 | \usage{ 56 | abalone 57 | } 58 | \description{ 59 | Predicting the age of abalone from 
### UCI Irvine
## Bike Sharing (Daily) Data https://archive.ics.uci.edu/ml/datasets/Bike+Sharing+Dataset

# Fetch the zipped archive into data-raw/
download.file("http://archive.ics.uci.edu/ml/machine-learning-databases/00275/Bike-Sharing-Dataset.zip",
              "data-raw/Bike-Sharing-Dataset.zip")

# Read day.csv directly out of the archive (the file carries a header row).
# Column classes are given positionally, in file order.
bike_sharing_daily = read.csv(
  unz("data-raw/Bike-Sharing-Dataset.zip", "day.csv"),
  header = TRUE,
  colClasses = c(
    "character",        # instant
    "Date",             # dteday
    rep("factor", 7),   # season, yr, mnth, holiday, weekday, workingday, weathersit
    rep("numeric", 4),  # temp, atemp, hum, windspeed
    rep("integer", 3)   # casual, registered, cnt
  )
)

# Replace the factor levels with readable labels.
# Each assignment relabels the existing levels in their current order.
levels(bike_sharing_daily$season) = c("Winter", "Spring", "Summer", "Fall")
levels(bike_sharing_daily$yr) = c(2011, 2012)

# Months: convert to an ordered factor on 1..12 first, then relabel, so the
# levels run Jan..Dec.
bike_sharing_daily$mnth = ordered(bike_sharing_daily$mnth, 1:12)
levels(bike_sharing_daily$mnth) = month.abb

levels(bike_sharing_daily$holiday) = c("No", "Yes")
levels(bike_sharing_daily$weekday) = c("Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat")
levels(bike_sharing_daily$workingday) = c("No", "Yes")
levels(bike_sharing_daily$weathersit) = c(
  "Clear, Few clouds, Partly cloudy, Partly cloudy",
  "Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist",
  "Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds",
  "Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog"
)

## Disabled: add de-normalized versions of the scaled variables
# bike_sharing_daily = within(bike_sharing_daily, {
#   actual_temp_celsius = denormalize_temp(temp, -8, 39) # Not sure if accurate
#   actual_atemp_celsius = denormalize_temp(atemp, -16, 50) # Not sure if accurate
#   actual_hum = hum * 100
#   actual_windspeed = windspeed * 67
# })

# Write the bike_sharing_daily dataset
usethis::use_data(bike_sharing_daily, overwrite = TRUE)

# Remove the zip + csv after read in.
### UCI Irvine
## Heart Disease Data https://archive.ics.uci.edu/ml/datasets/Heart+Disease

# Named entries correspond to suffix of exported data set
# (e.g. "cl" -> heart_disease_cl); values are the location part of the
# processed.<location>.data file name.
heart_disease_locs = c(
  "cl" = "cleveland",
  "hu" = "hungarian",
  "ch" = "switzerland",
  "va" = "va"
)

# Data names
heart_disease_names = paste0("heart_disease_", names(heart_disease_locs))

# Download one processed heart-disease file.
#
# loc: location part of the file name (one of `heart_disease_locs`).
# url: URL prefix up to and including "processed.".
# Returns a data.frame with default V1..V14 column names; `?` is read as NA.
read_heart_disease_data = function(loc, url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.") {
  read.csv(
    paste0(url, loc, ".data"),
    header = FALSE,
    sep = ",",
    na.strings = "?"
  )
}

# Name the 14 columns and convert the coded categorical columns to factors.
#
# data: data.frame as returned by read_heart_disease_data().
# Returns the same data.frame with named columns and labelled factors.
#
# Levels are given explicitly for every factor. With `labels` alone, factor()
# derives the levels from the values present in that particular file, so a
# location missing one code would either fail (label count mismatch) or attach
# labels to the wrong codes.
cast_heart_disease_data = function(data) {
  names(data) = c(
    "age",
    "sex",
    "cp",
    "trestbps",
    "chol",
    "fbs",
    "restecg",
    "thalach",
    "exang",
    "oldpeak",
    "slope",
    "ca",
    "thal",
    "num"
  )

  data = within(data, {
    sex = factor(sex, levels = c(0, 1), labels = c("Female", "Male"))
    cp = factor(
      cp,
      levels = 1:4,
      labels = c(
        "typical angina",
        "atypical angina",
        "non-anginal pain",
        "asymptomatic"
      )
    )
    restecg = factor(
      restecg,
      levels = 0:2,
      labels = c(
        "normal",
        "ST-T wave abnormality",
        "probable/definite hypertrophy"
      )
    )
    exang = factor(exang, levels = c(0, 1), labels = c("No", "Yes"))
    slope = factor(slope,
                   levels = 1:3,
                   labels = c("upsloping",
                              "flat",
                              "downsloping"))
    thal = factor(thal,
                  levels = c(3, 6, 7),
                  labels = c("normal",
                             "fixed defect",
                             "reversable defect"))
  })

  data
}


# Download and cast all four location files.
heart_disease_data = lapply(heart_disease_locs, read_heart_disease_data)
heart_disease_data = lapply(heart_disease_data, cast_heart_disease_data)

names(heart_disease_data) = heart_disease_names

# Convert to global environment
# See https://stackoverflow.com/questions/30516325/converting-a-list-of-data-frames-into-individual-data-frames-in-r
list2env(heart_disease_data, envir = .GlobalEnv)

# Poor man's devtools::use_data
# One .rda per data set, each holding a single object named after the data set.
for (ds_name in heart_disease_names) {
  save(list = ds_name,
       file = paste0("data/", ds_name, ".rda"))
}
\code{wind} 51 | \itemize{ 52 | \item wind speed in km/h: 0.40 to 9.40 53 | } 54 | \item \code{rain} 55 | \itemize{ 56 | \item outside rain in mm/m2 : 0.0 to 6.4 57 | } 58 | \item \code{area} 59 | \itemize{ 60 | \item the burned area of the forest (in ha): 0.00 to 1090.84 61 | } 62 | } 63 | } 64 | \source{ 65 | Paulo Cortez, pcortez '@' dsi.uminho.pt, Department of Information Systems, University of Minho, Portugal. 66 | Aníbal Morais, araimorais '@' gmail.com, Department of Information Systems, University of Minho, Portugal. 67 | } 68 | \usage{ 69 | forest_fires 70 | } 71 | \description{ 72 | The aim is to predict the burned area of forest fires, in the northeast 73 | region of Portugal, by using meteorological and other data 74 | } 75 | \references{ 76 | \href{http://www.dsi.uminho.pt/~pcortez/fires.pdf}{ P. Cortez and A. Morais. A Data Mining Approach to Predict Forest Fires using Meteorological Data. In J. Neves, M. F. Santos and J. Machado Eds., New Trends in Artificial Intelligence, Proceedings of the 13th EPIA 2007 - Portuguese Conference on Artificial Intelligence, December, Guimarães, Portugal, pp. 512-523, 2007. APPIA, ISBN-13 978-989-95618-0-9} 77 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv} 78 | \url{https://archive.ics.uci.edu/ml/datasets/Forest+Fires} 79 | } 80 | \keyword{datasets} 81 | -------------------------------------------------------------------------------- /R/adult_docs.R: -------------------------------------------------------------------------------- 1 | #' Adult Data Set 2 | #' 3 | #' Extraction was done by Barry Becker from the 1994 Census database. 4 | #' A set of reasonably clean records was extracted using the following 5 | #' conditions: ((AAGE > 16) && (AGI > 100) && (AFNLWGT > 1) && (HRSWK > 0)) 6 | #' 7 | #' @format A data frame with 32561 observations on the following 15 variables. 
8 | #' - `age`: Integer 9 | #' - Number of years alive 10 | #' - `workclass`: Factor 11 | #' - Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, 12 | #' State-gov, Without-pay, Never-worked. 13 | #' - `fnlwgt`: Numeric 14 | #' - The variable represents the Final Weight, which is more so a sampling weight. 15 | #' See the names file listed in references for more details. 16 | #' - `education`: Factor 17 | #' - Highest level of education attained 18 | #' - Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, 19 | #' Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 20 | #' 5th-6th, Preschool. 21 | #' - `education_num`: Numeric 22 | #' - Number of years of education 23 | #' - `marital_status`: Factor 24 | #' - Married-civ-spouse, Divorced, Never-married, Separated, Widowed, 25 | #' Married-spouse-absent, Married-AF-spouse 26 | #' - `occupation`: Factor 27 | #' - Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, 28 | #' Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, 29 | #' Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, 30 | #' Armed-Forces. 31 | #' - `relationship`: Factor 32 | #' - Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 33 | #' - `race`: Factor 34 | #' - White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 
35 | #' - `sex`: Factor 36 | #' - Female, Male 37 | #' - `capital_gain`: Integer 38 | #' - Income from investment sources, apart from wages/salary 39 | #' - `capital_loss`: Integer 40 | #' - Losses from investment sources, apart from wages/salary 41 | #' - `hours_per_week`: Integer 42 | #' - Number of hours worked per week 43 | #' - `native_country`: Factor 44 | #' - Country of origin 45 | #' - United-States, Cambodia, England, Puerto-Rico, Canada, Germany, 46 | #' Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, 47 | #' Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, 48 | #' Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, 49 | #' Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, 50 | #' Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 51 | #' - `income`: Factor 52 | #' - Whether the income is greater than $50,000 or not. 53 | #' - <=50K, >50K 54 | #' @details 55 | #' Prediction task is to determine whether a person makes over 50K a year. 56 | #' @references 57 | #' \url{https://archive.ics.uci.edu/ml/machine-learning-databases/adult/} 58 | #' 59 | #' \url{http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names} 60 | #' 61 | #' \url{https://archive.ics.uci.edu/ml/datasets/adult} 62 | #' @source 63 | #' Ronny Kohavi and Barry Becker 64 | #' Data Mining and Visualization 65 | #' Silicon Graphics. 66 | #' e-mail: ronnyk '@' live.com for questions. 67 | "adult" 68 | -------------------------------------------------------------------------------- /R/heart_disease_processed_docs.R: -------------------------------------------------------------------------------- 1 | #' Heart Disease Processed Data Sets 2 | #' 3 | #' Detecting the presence of heart disease in patients. 4 | #' 5 | #' @format Four `data.frames` with a varying number of observations that contain 6 | #' the following 14 variables. 
7 | #' - `age`: age in years 8 | #' - `sex`: sex (1 = male; 0 = female) 9 | #' - `cp`: chest pain type 10 | #' - Value 1: typical angina 11 | #' - Value 2: atypical angina 12 | #' - Value 3: non-anginal pain 13 | #' - Value 4: asymptomatic 14 | #' - `trestbps`: resting blood pressure (in mm Hg on admission to the hospital) 15 | #' - `chol`: serum cholesterol in mg/dl 16 | #' - `fbs`: fasting blood sugar > 120 mg/dl (1 = true; 0 = false) 17 | #' - `restecg`: resting electrocardiographic results 18 | #' - Value 0: normal 19 | #' - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 20 | #' - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 21 | #' - `thalach`: maximum heart rate achieved 22 | #' - `exang`: exercise induced angina (1 = yes; 0 = no) 23 | #' - `oldpeak`: ST depression induced by exercise relative to rest 24 | #' - `slope`: the slope of the peak exercise ST segment 25 | #' - Value 1: upsloping 26 | #' - Value 2: flat 27 | #' - Value 3: downsloping 28 | #' - `ca`: number of major vessels (0-3) colored by fluoroscopy 29 | #' - `thal`: See below 30 | #' - Value 3: normal 31 | #' - Value 6: fixed defect 32 | #' - Value 7: reversable defect 33 | #' - `num`: diagnosis of heart disease (angiographic disease status) 34 | #' - Value 0: < 50% diameter narrowing 35 | #' - Value 1: > 50% diameter narrowing 36 | #' 37 | #' @rdname heart_disease 38 | #' @source 39 | #' 1. Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 40 | #' 2. University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 41 | #' 3. University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 42 | #' 4. V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D. 43 | #' 44 | #' @references 45 | #' 46 | #' \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data} 47 | #' \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data} 48 | #' \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data} 49 | #' \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data} 50 | #' \url{https://archive.ics.uci.edu/ml/datasets/Heart+Disease} 51 | #' 52 | #' @details 53 | #' The data was collected from the four following locations: 54 | #' 55 | #' 1. 
`heart_disease_cl`: Cleveland Clinic Foundation 56 | #' 2. `heart_disease_hu`: Hungarian Institute of Cardiology, Budapest 57 | #' 3. `heart_disease_va`: V.A. Medical Center, Long Beach, CA 58 | #' 4. `heart_disease_ch`: University Hospital, Zurich, Switzerland 59 | #' 60 | #' \tabular{rrr}{ 61 | #' Database \tab Instances \cr 62 | #' Cleveland \tab 303 \cr 63 | #' Hungarian \tab 294 \cr 64 | #' Switzerland \tab 123 \cr 65 | #' Long Beach VA \tab 200 \cr 66 | #' } 67 | #' 68 | "heart_disease_cl" 69 | 70 | #' @rdname heart_disease 71 | "heart_disease_hu" 72 | 73 | #' @rdname heart_disease 74 | "heart_disease_va" 75 | 76 | #' @rdname heart_disease 77 | "heart_disease_ch" 78 | -------------------------------------------------------------------------------- /R/autoimports_docs.R: -------------------------------------------------------------------------------- 1 | #' Autoimports Dataset 2 | #' 3 | #' This data set consists of three types of entities: 4 | #' (a) the specification of an auto in terms of various characteristics, 5 | #' (b) its assigned insurance risk rating, 6 | #' (c) its normalized losses in use as compared to other cars. 7 | #' The second rating corresponds to the degree to which the auto is more risky 8 | #' than its price indicates. Cars are initially assigned a risk factor symbol 9 | #' associated with its price. Then, if it is more risky (or less), this symbol 10 | #' is adjusted by moving it up (or down) the scale. Actuarians call this 11 | #' process "symboling". A value of +3 indicates that the auto is risky, -3 12 | #' that it is probably pretty safe. The third factor is the relative average 13 | #' loss payment per insured vehicle year. This value is normalized for all autos 14 | #' within a particular size classification (two-door small, station wagons, 15 | #' sports/speciality, etc...), and represents the average loss per car per year. 16 | #' 17 | #' @format A data frame with 205 observations on the following 26 variables. 
18 | #' - `symboling`: 19 | #' - -3, -2, -1, 0, 1, 2, 3. 20 | #' - `normalized_losses`: 21 | #' - continuous from 65 to 256. 22 | #' - `make`: 23 | #' - alfa-romero, audi, bmw, chevrolet, dodge, honda, isuzu, 24 | #' jaguar, mazda, mercedes-benz, mercury, mitsubishi, nissan, 25 | #' peugot, plymouth, porsche, renault, saab, subaru, toyota, volkswagen, 26 | #' volvo 27 | #' - `fuel_type`: 28 | #' - diesel, gas. 29 | #' - `aspiration`: 30 | #' - std, turbo. 31 | #' - `num_of_doors`: 32 | #' - four, two. 33 | #' - `body_style`: 34 | #' - hardtop, wagon, sedan, hatchback, convertible. 35 | #' - `drive_wheels`: 36 | #' - 4wd, fwd, rwd. 37 | #' - `engine_location`: 38 | #' - front, rear. 39 | #' - `wheel_base`: 40 | #' - continuous from 86.6 to 120.9. 41 | #' - `length`: 42 | #' - continuous from 141.1 to 208.1. 43 | #' - `width`: 44 | #' - continuous from 60.3 to 72.3. 45 | #' - `height`: 46 | #' - continuous from 47.8 to 59.8. 47 | #' - `curb_weight`: 48 | #' - continuous from 1488 to 4066. 49 | #' - `engine_type`: 50 | #' - dohc, dohcv, l, ohc, ohcf, ohcv, rotor. 51 | #' - `num_of_cylinders`: 52 | #' - eight, five, four, six, three, twelve, two. 53 | #' - `engine_size`: 54 | #' - continuous from 61 to 326. 55 | #' - `fuel_system`: 56 | #' - 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi. 57 | #' - `bore`: 58 | #' - continuous from 2.54 to 3.94. 59 | #' - `stroke`: 60 | #' - continuous from 2.07 to 4.17. 61 | #' - `compression_ratio`: 62 | #' - continuous from 7 to 23. 63 | #' - `horsepower`: 64 | #' - continuous from 48 to 288. 65 | #' - `peak_rpm`: 66 | #' - continuous from 4150 to 6600. 67 | #' - `city_mpg`: 68 | #' - continuous from 13 to 49. 69 | #' - `highway_mpg`: 70 | #' - continuous from 16 to 54. 71 | #' - `price`: 72 | #' - continuous from 5118 to 45400. 73 | #' 74 | #' @source 75 | #' 76 | #' Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu) 77 | #' 78 | #' 1985 Model Import Car and Truck Specifications, 1985 Ward's Automotive Yearbook. 
79 | #' 80 | #' Personal Auto Manuals, Insurance Services Office, 160 Water Street, New York, NY 10038 81 | #' 82 | #' Insurance Collision Report, Insurance Institute for Highway Safety, Watergate 600, Washington, DC 20037 83 | #' 84 | #' @references 85 | #' 86 | #' 87 | "autoimports" 88 | -------------------------------------------------------------------------------- /man/adult.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/adult_docs.R 3 | \docType{data} 4 | \name{adult} 5 | \alias{adult} 6 | \title{Adult Data Set} 7 | \format{ 8 | A data frame with 32561 observations on the following 15 variables. 9 | \itemize{ 10 | \item \code{age}: Integer 11 | \itemize{ 12 | \item Number of years alive 13 | } 14 | \item \code{workclass}: Factor 15 | \itemize{ 16 | \item Private, Self-emp-not-inc, Self-emp-inc, Federal-gov, Local-gov, 17 | State-gov, Without-pay, Never-worked. 18 | } 19 | \item \code{fnlwgt}: Numeric 20 | \itemize{ 21 | \item The variable represents the Final Weight, which is more so a sampling weight. 22 | See the names file listed in references for more details. 23 | } 24 | \item \code{education}: Factor 25 | \itemize{ 26 | \item Highest level of education attained 27 | \item Bachelors, Some-college, 11th, HS-grad, Prof-school, Assoc-acdm, 28 | Assoc-voc, 9th, 7th-8th, 12th, Masters, 1st-4th, 10th, Doctorate, 29 | 5th-6th, Preschool. 
30 | } 31 | \item \code{education_num}: Numeric 32 | \itemize{ 33 | \item Number of years of education 34 | } 35 | \item \code{marital_status}: Factor 36 | \itemize{ 37 | \item Married-civ-spouse, Divorced, Never-married, Separated, Widowed, 38 | Married-spouse-absent, Married-AF-spouse 39 | } 40 | \item \code{occupation}: Factor 41 | \itemize{ 42 | \item Tech-support, Craft-repair, Other-service, Sales, Exec-managerial, 43 | Prof-specialty, Handlers-cleaners, Machine-op-inspct, Adm-clerical, 44 | Farming-fishing, Transport-moving, Priv-house-serv, Protective-serv, 45 | Armed-Forces. 46 | } 47 | \item \code{relationship}: Factor 48 | \itemize{ 49 | \item Wife, Own-child, Husband, Not-in-family, Other-relative, Unmarried. 50 | } 51 | \item \code{race}: Factor 52 | \itemize{ 53 | \item White, Asian-Pac-Islander, Amer-Indian-Eskimo, Other, Black. 54 | } 55 | \item \code{sex}: Factor 56 | \itemize{ 57 | \item Female, Male 58 | } 59 | \item \code{capital_gain}: Integer 60 | \itemize{ 61 | \item Income from investment sources, apart from wages/salary 62 | } 63 | \item \code{capital_loss}: Integer 64 | \itemize{ 65 | \item Losses from investment sources, apart from wages/salary 66 | } 67 | \item \code{hours_per_week}: Integer 68 | \itemize{ 69 | \item Amount of hours worked per week 70 | } 71 | \item \code{native_country}: Factor 72 | \itemize{ 73 | \item Country of origin 74 | \item United-States, Cambodia, England, Puerto-Rico, Canada, Germany, 75 | Outlying-US(Guam-USVI-etc), India, Japan, Greece, South, China, Cuba, 76 | Iran, Honduras, Philippines, Italy, Poland, Jamaica, Vietnam, Mexico, 77 | Portugal, Ireland, France, Dominican-Republic, Laos, Ecuador, Taiwan, 78 | Haiti, Columbia, Hungary, Guatemala, Nicaragua, Scotland, Thailand, 79 | Yugoslavia, El-Salvador, Trinadad&Tobago, Peru, Hong, Holand-Netherlands. 80 | } 81 | \item \code{income}: Factor 82 | \itemize{ 83 | \item Whether the income greater than $50,000 or not. 
84 | \item <=50K, >50K 85 | } 86 | } 87 | } 88 | \source{ 89 | Ronny Kohavi and Barry Becker 90 | Data Mining and Visualization 91 | Silicon Graphics. 92 | e-mail: ronnyk '@' live.com for questions. 93 | } 94 | \usage{ 95 | adult 96 | } 97 | \description{ 98 | Extraction was done by Barry Becker from the 1994 Census database. 99 | A set of reasonably clean records was extracted using the following 100 | conditions: ((AAGE > 16) && (AGI > 100) && (AFNLWGT > 1) && (HRSWK > 0)) 101 | } 102 | \details{ 103 | Prediction task is to determine whether a person makes over 50K a year. 104 | } 105 | \references{ 106 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/adult/} 107 | 108 | \url{http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.names} 109 | 110 | \url{https://archive.ics.uci.edu/ml/datasets/adult} 111 | } 112 | \keyword{datasets} 113 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # `ucidata` - Data Sets from UC Irvine’s ML Library 5 | 6 | 7 | 8 | [![R-CMD-check](https://github.com/coatless-rpkg/ucidata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/coatless-rpkg/ucidata/actions/workflows/R-CMD-check.yaml) 9 | 10 | 11 | The following is an *R* data package that features certain data sets 12 | from the [Machine Learning Library at UC 13 | Irvine](https://archive.ics.uci.edu/ml/). These data sets have been 14 | cleaned up and provide documentation via *R*’s help system. 15 | 16 | > \[!NOTE\] 17 | > 18 | > Want to easily access data sets not included in this package? 19 | > 20 | > Check out the 21 | > [`{ucimlrepo}`](https://github.com/coatless-rpkg/ucimlrepo) R package! 22 | > The package provides an interface to download and automatically load 23 | > data sets from the UC Irvine Machine Learning Repository. 
24 | 25 | ## Installation 26 | 27 | You can install `ucidata` from GitHub with: 28 | 29 | ``` r 30 | # install.packages("remotes") 31 | remotes::install_github("coatless-rpkg/ucidata") 32 | ``` 33 | 34 | ## Using data in the package 35 | 36 | There are two ways to access the data contained within this package. 37 | 38 | The first is to load the package itself and type the name of a data set. 39 | This approach takes advantage of *R*’s lazy loading mechanism, which 40 | avoids loading the data until it is used in an *R* session. For details on 41 | how lazy loading works, please see [Section 1.17: Lazy 42 | Loading](https://cran.r-project.org/doc/manuals/r-release/R-ints.html#Lazy-loading) 43 | of the [R 44 | Internals](https://cran.r-project.org/doc/manuals/r-release/R-ints.html) 45 | manual. 46 | 47 | ``` r 48 | # Load the `ucidata` package 49 | library("ucidata") 50 | 51 | # See the first 6 observations of the `autompg` dataset 52 | head(autompg) 53 | 54 | # View the help documentation for `autompg` 55 | ?autompg 56 | ``` 57 | 58 | The second approach is to use the `data()` command to load data on the 59 | fly without attaching the package, and then type the name of a data set. 
60 | 61 | ``` r 62 | # Loading `autompg` without a `library(ucidata)` call 63 | data("autompg", package = "ucidata") 64 | 65 | # See the first 6 observations of the `autompg` dataset 66 | head(autompg) 67 | 68 | # View the help documentation for `autompg` (package is not attached, so qualify the topic) 69 | ?ucidata::autompg 70 | ``` 71 | 72 | ## Included Data Sets 73 | 74 | The following data sets are included in the `ucidata` package: 75 | 76 | - [`abalone`](https://archive.ics.uci.edu/ml/datasets/abalone) 77 | - [`adult`](https://archive.ics.uci.edu/ml/datasets/adult) 78 | - [`autoimports`](https://archive.ics.uci.edu/ml/datasets/Automobile) 79 | - [`autompg`](https://archive.ics.uci.edu/ml/datasets/auto+mpg) 80 | - Breast Cancer Wisconsin: 81 | - [`bcw_original` (Breast Cancer Wisconsin 82 | Original)](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)) 83 | - Heart Disease 84 | - [`heart_disease_cl`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 85 | - [`heart_disease_hu`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 86 | - [`heart_disease_va`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 87 | - [`heart_disease_ch`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 88 | - [`bike_sharing_daily`](https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset) 89 | - [`bridges`](https://archive.ics.uci.edu/ml/datasets/Pittsburgh+Bridges) 90 | - [`car_eval`](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation) 91 | - [`forest_fires`](https://archive.ics.uci.edu/ml/datasets/Forest+Fires) 92 | - [`glass`](https://archive.ics.uci.edu/ml/datasets/Glass+Identification) 93 | - [`hepatitis`](https://archive.ics.uci.edu/ml/datasets/hepatitis) 94 | - [`wine`](https://archive.ics.uci.edu/ml/datasets/wine) 95 | 96 | ## Build Scripts 97 | 98 | Want to see how each data set was imported? Check out the 99 | [`data-raw`](https://github.com/coatless-rpkg/ucidata/tree/master/data-raw) 100 | folder! 
101 | -------------------------------------------------------------------------------- /man/heart_disease.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/heart_disease_processed_docs.R 3 | \docType{data} 4 | \name{heart_disease_cl} 5 | \alias{heart_disease_cl} 6 | \alias{heart_disease_hu} 7 | \alias{heart_disease_va} 8 | \alias{heart_disease_ch} 9 | \title{Heart Disease Processed Data Sets} 10 | \format{ 11 | Four \code{data.frames} with a varying number of observations that contain 12 | the following 14 variables. 13 | \itemize{ 14 | \item \code{age}: age in years 15 | \item \code{sex}: sex (1 = male; 0 = female) 16 | \item \code{cp}: chest pain type 17 | \itemize{ 18 | \item Value 1: typical angina 19 | \item Value 2: atypical angina 20 | \item Value 3: non-anginal pain 21 | \item Value 4: asymptomatic 22 | } 23 | \item \code{trestbps}: resting blood pressure (in mm Hg on admission to the hospital) 24 | \item \code{chol}: serum cholestoral in mg/dl 25 | \item \code{fbs}: fasting blood sugar > 120 mg/dl (1 = true; 0 = false) 26 | \item \code{restecg}: resting electrocardiographic results 27 | \itemize{ 28 | \item Value 0: normal 29 | \item Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV) 30 | \item Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria 31 | } 32 | \item \code{thalach}: maximum heart rate achieved 33 | \item \code{exang}: exercise induced angina (1 = yes; 0 = no) 34 | \item \code{oldpeak}: ST depression induced by exercise relative to rest 35 | \item \code{slope}: the slope of the peak exercise ST segment 36 | \itemize{ 37 | \item Value 1: upsloping 38 | \item Value 2: flat 39 | \item Value 3: downsloping 40 | } 41 | \item \code{ca}: number of major vessels (0-3) colored by flourosopy 42 | \item \code{thal}: See below 43 | \itemize{ 44 | 
\item Value 3: normal 45 | \item Value 6: fixed defect 46 | \item Value 7: reversable defect 47 | } 48 | \item \code{num}: diagnosis of heart disease (angiographic disease status) 49 | \itemize{ 50 | \item Value 0: < 50\% diameter narrowing 51 | \item Value 1: > 50\% diameter narrowing 52 | } 53 | } 54 | 55 | An object of class \code{data.frame} with 294 rows and 14 columns. 56 | 57 | An object of class \code{data.frame} with 200 rows and 14 columns. 58 | 59 | An object of class \code{data.frame} with 123 rows and 14 columns. 60 | } 61 | \source{ 62 | \enumerate{ 63 | \item Hungarian Institute of Cardiology. Budapest: Andras Janosi, M.D. 64 | \item University Hospital, Zurich, Switzerland: William Steinbrunn, M.D. 65 | \item University Hospital, Basel, Switzerland: Matthias Pfisterer, M.D. 66 | \item V.A. Medical Center, Long Beach and Cleveland Clinic Foundation: Robert Detrano, M.D., Ph.D. 67 | } 68 | } 69 | \usage{ 70 | heart_disease_cl 71 | 72 | heart_disease_hu 73 | 74 | heart_disease_va 75 | 76 | heart_disease_ch 77 | } 78 | \description{ 79 | Detecting the presence of heart disease in patients. 80 | } 81 | \details{ 82 | The data was collected from the four following locations: 83 | \enumerate{ 84 | \item \code{heart_disease_cl}: Cleveland Clinic Foundation 85 | \item \code{heart_disease_hu}: Hungarian Institute of Cardiology, Budapest 86 | \item \code{heart_disease_va}: V.A. 
Medical Center, Long Beach, CA 87 | \item \code{heart_disease_ch}: University Hospital, Zurich, Switzerland 88 | } 89 | 90 | \tabular{rrr}{ 91 | Database \tab Instances \cr 92 | Cleveland \tab 303 \cr 93 | Hungarian \tab 294 \cr 94 | Switzerland \tab 123 \cr 95 | Long Beach VA \tab 200 \cr 96 | } 97 | } 98 | \references{ 99 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data} 100 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.hungarian.data} 101 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.switzerland.data} 102 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.va.data} 103 | \url{https://archive.ics.uci.edu/ml/datasets/Heart+Disease} 104 | } 105 | \keyword{datasets} 106 | -------------------------------------------------------------------------------- /README.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | output: github_document 3 | --- 4 | 5 | 6 | 7 | ```{r, echo = FALSE} 8 | knitr::opts_chunk$set( 9 | collapse = TRUE, 10 | comment = "#>", 11 | fig.path = "README-" 12 | ) 13 | ``` 14 | 15 | # `ucidata` - Data Sets from UC Irvine's ML Library 16 | 17 | 18 | [![R-CMD-check](https://github.com/coatless-rpkg/ucidata/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/coatless-rpkg/ucidata/actions/workflows/R-CMD-check.yaml) 19 | 20 | 21 | The following is an _R_ data package that features certain data sets from 22 | the [Machine Learning Library at UC Irvine](https://archive.ics.uci.edu/ml/). 23 | These data sets have been cleaned up and provide documentation via _R_'s help system. 24 | 25 | > [!NOTE] 26 | > 27 | > Want to easily access data sets not included in this package? 28 | > 29 | > Check out the [`{ucimlrepo}`](https://github.com/coatless-rpkg/ucimlrepo) R package! 
30 | > The package provides an interface to download and automatically load data 31 | > sets from the UC Irvine Machine Learning Repository. 32 | 33 | ## Installation 34 | 35 | You can install `ucidata` from GitHub with: 36 | 37 | ```{r gh-installation, eval = FALSE} 38 | # install.packages("remotes") 39 | remotes::install_github("coatless-rpkg/ucidata") 40 | ``` 41 | 42 | ## Using data in the package 43 | 44 | There are two ways to access the data contained within this package. 45 | 46 | The first is to load the package itself and type the name of a data set. 47 | This approach takes advantage of _R_'s lazy loading mechanism, which avoids 48 | loading the data until it is used in an _R_ session. For details on 49 | how lazy loading works, please see [Section 1.17: Lazy Loading](https://cran.r-project.org/doc/manuals/r-release/R-ints.html#Lazy-loading) 50 | of the [R Internals](https://cran.r-project.org/doc/manuals/r-release/R-ints.html) 51 | manual. 52 | 53 | ```{r use-data-package, eval = FALSE} 54 | # Load the `ucidata` package 55 | library("ucidata") 56 | 57 | # See the first 6 observations of the `autompg` dataset 58 | head(autompg) 59 | 60 | # View the help documentation for `autompg` 61 | ?autompg 62 | ``` 63 | 64 | The second approach is to use the `data()` command to load data on the 65 | fly without attaching the package, and then type the name of a data set. 
66 | 67 | ```{r use-data-call, eval = FALSE} 68 | # Loading `autompg` without a `library(ucidata)` call 69 | data("autompg", package = "ucidata") 70 | 71 | # See the first 6 observations of the `autompg` dataset 72 | head(autompg) 73 | 74 | # View the help documentation for `autompg` (package is not attached, so qualify the topic) 75 | ?ucidata::autompg 76 | ``` 77 | 78 | ## Included Data Sets 79 | 80 | The following data sets are included in the `ucidata` package: 81 | 82 | - [`abalone`](https://archive.ics.uci.edu/ml/datasets/abalone) 83 | - [`adult`](https://archive.ics.uci.edu/ml/datasets/adult) 84 | - [`autoimports`](https://archive.ics.uci.edu/ml/datasets/Automobile) 85 | - [`autompg`](https://archive.ics.uci.edu/ml/datasets/auto+mpg) 86 | - Breast Cancer Wisconsin: 87 | - [`bcw_original` (Breast Cancer Wisconsin Original)](https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original)) 88 | - Heart Disease 89 | - [`heart_disease_cl`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 90 | - [`heart_disease_hu`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 91 | - [`heart_disease_va`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 92 | - [`heart_disease_ch`](https://archive.ics.uci.edu/ml/datasets/Heart+Disease) 93 | - [`bike_sharing_daily`](https://archive.ics.uci.edu/ml/datasets/bike+sharing+dataset) 94 | - [`bridges`](https://archive.ics.uci.edu/ml/datasets/Pittsburgh+Bridges) 95 | - [`car_eval`](https://archive.ics.uci.edu/ml/datasets/Car+Evaluation) 96 | - [`forest_fires`](https://archive.ics.uci.edu/ml/datasets/Forest+Fires) 97 | - [`glass`](https://archive.ics.uci.edu/ml/datasets/Glass+Identification) 98 | - [`hepatitis`](https://archive.ics.uci.edu/ml/datasets/hepatitis) 99 | - [`wine`](https://archive.ics.uci.edu/ml/datasets/wine) 100 | 101 | ## Build Scripts 102 | 103 | Want to see how each data set was imported? Check out the [`data-raw`](https://github.com/coatless-rpkg/ucidata/tree/master/data-raw) folder! 
104 | -------------------------------------------------------------------------------- /man/autoimports.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/autoimports_docs.R 3 | \docType{data} 4 | \name{autoimports} 5 | \alias{autoimports} 6 | \title{Autoimports Dataset} 7 | \format{ 8 | A data frame with 205 observations on the following 26 variables. 9 | \itemize{ 10 | \item \code{symboling}: 11 | \itemize{ 12 | \item -3, -2, -1, 0, 1, 2, 3. 13 | } 14 | \item \code{normalized_losses}: 15 | \itemize{ 16 | \item continuous from 65 to 256. 17 | } 18 | \item \code{make}: 19 | \itemize{ 20 | \item alfa-romero, audi, bmw, chevrolet, dodge, honda, isuzu, 21 | jaguar, mazda, mercedes-benz, mercury, mitsubishi, nissan, 22 | peugot, plymouth, porsche, renault, saab, subaru, toyota, volkswagen, 23 | volvo 24 | } 25 | \item \code{fuel_type}: 26 | \itemize{ 27 | \item diesel, gas. 28 | } 29 | \item \code{aspiration}: 30 | \itemize{ 31 | \item std, turbo. 32 | } 33 | \item \code{num_of_doors}: 34 | \itemize{ 35 | \item four, two. 36 | } 37 | \item \code{body_style}: 38 | \itemize{ 39 | \item hardtop, wagon, sedan, hatchback, convertible. 40 | } 41 | \item \code{drive_wheels}: 42 | \itemize{ 43 | \item 4wd, fwd, rwd. 44 | } 45 | \item \code{engine_location}: 46 | \itemize{ 47 | \item front, rear. 48 | } 49 | \item \code{wheel_base}: 50 | \itemize{ 51 | \item continuous from 86.6 120.9. 52 | } 53 | \item \code{length}: 54 | \itemize{ 55 | \item continuous from 141.1 to 208.1. 56 | } 57 | \item \code{width}: 58 | \itemize{ 59 | \item continuous from 60.3 to 72.3. 60 | } 61 | \item \code{height}: 62 | \itemize{ 63 | \item continuous from 47.8 to 59.8. 64 | } 65 | \item \code{curb_weight}: 66 | \itemize{ 67 | \item continuous from 1488 to 4066. 68 | } 69 | \item \code{engine_type}: 70 | \itemize{ 71 | \item dohc, dohcv, l, ohc, ohcf, ohcv, rotor. 
72 | } 73 | \item \code{num_of_cylinders}: 74 | \itemize{ 75 | \item eight, five, four, six, three, twelve, two. 76 | } 77 | \item \code{engine_size}: 78 | \itemize{ 79 | \item continuous from 61 to 326. 80 | } 81 | \item \code{fuel_system}: 82 | \itemize{ 83 | \item 1bbl, 2bbl, 4bbl, idi, mfi, mpfi, spdi, spfi. 84 | } 85 | \item \code{bore}: 86 | \itemize{ 87 | \item continuous from 2.54 to 3.94. 88 | } 89 | \item \code{stroke}: 90 | \itemize{ 91 | \item continuous from 2.07 to 4.17. 92 | } 93 | \item \code{compression_ratio}: 94 | \itemize{ 95 | \item continuous from 7 to 23. 96 | } 97 | \item \code{horsepower}: 98 | \itemize{ 99 | \item continuous from 48 to 288. 100 | } 101 | \item \code{peak_rpm}: 102 | \itemize{ 103 | \item continuous from 4150 to 6600. 104 | } 105 | \item \code{city_mpg}: 106 | \itemize{ 107 | \item continuous from 13 to 49. 108 | } 109 | \item \code{highway_mpg}: 110 | \itemize{ 111 | \item continuous from 16 to 54. 112 | } 113 | \item \code{price}: 114 | \itemize{ 115 | \item continuous from 5118 to 45400. 116 | } 117 | } 118 | } 119 | \source{ 120 | Donor: Jeffrey C. Schlimmer (Jeffrey.Schlimmer@a.gp.cs.cmu.edu) 121 | 122 | 1985 Model Import Car and Truck Specifications, 1985 Ward's Automotive Yearbook. 123 | 124 | Personal Auto Manuals, Insurance Services Office, 160 Water Street, New York, NY 10038 125 | 126 | Insurance Collision Report, Insurance Institute for Highway Safety, Watergate 600, Washington, DC 20037 127 | } 128 | \usage{ 129 | autoimports 130 | } 131 | \description{ 132 | This data set consists of three types of entities: 133 | (a) the specification of an auto in terms of various characteristics, 134 | (b) its assigned insurance risk rating, 135 | (c) its normalized losses in use as compared to other cars. 136 | The second rating corresponds to the degree to which the auto is more risky 137 | than its price indicates. Cars are initially assigned a risk factor symbol 138 | associated with its price. 
Then, if it is more risky (or less), this symbol 139 | is adjusted by moving it up (or down) the scale. Actuaries call this 140 | process "symboling". A value of +3 indicates that the auto is risky, -3 141 | that it is probably pretty safe. The third factor is the relative average 142 | loss payment per insured vehicle year. This value is normalized for all autos 143 | within a particular size classification (two-door small, station wagons, 144 | sports/speciality, etc...), and represents the average loss per car per year. 145 | } 146 | \references{ 147 | \url{https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data} 148 | \url{https://archive.ics.uci.edu/ml/datasets/Automobile} 149 | } 150 | \keyword{datasets} 151 | -------------------------------------------------------------------------------- /R/bike_sharing_daily_docs.R: -------------------------------------------------------------------------------- 1 | #' Bike Sharing (Daily) Data Set 2 | #' 3 | #' Bike sharing systems are a new generation of traditional bike rentals where 4 | #' the whole process from membership, rental and return has become automatic. 5 | #' Through these systems, a user is able to easily rent a bike from a particular 6 | #' position and return it at another position. Currently, there are 7 | #' over 500 bike-sharing programs around the world, which are composed of over 8 | #' 500 thousand bicycles. Today, there exists great interest in these systems 9 | #' due to their important role in traffic, environmental and health issues. 10 | #' 11 | #' Apart from interesting real world applications of bike sharing systems, the 12 | #' characteristics of data being generated by these systems make them attractive 13 | #' for research. As opposed to other transport services such as bus or subway, 14 | #' the duration of travel, departure and arrival position are explicitly recorded 15 | #' in these systems.
This feature turns a bike sharing system into a virtual 16 | #' sensor network that can be used for sensing mobility in the city. Hence, it 17 | #' is expected that most of the important events in the city could be detected via 18 | #' monitoring these data. 19 | #' 20 | #' @format A data frame with 731 observations on the following 16 variables. 21 | #' - `instant`: Record index 22 | #' - `dteday`: Date 23 | #' - `season`: 24 | #' - 1: Spring 25 | #' - 2: Summer 26 | #' - 3: Fall 27 | #' - 4: Winter 28 | #' - `yr`: 29 | #' - 0: 2011 30 | #' - 1: 2012 31 | #' - `mnth`: 32 | #' - 1: Jan 33 | #' - 2: Feb 34 | #' - 3: Mar 35 | #' - 4: Apr 36 | #' - 5: May 37 | #' - 6: Jun 38 | #' - 7: Jul 39 | #' - 8: Aug 40 | #' - 9: Sep 41 | #' - 10: Oct 42 | #' - 11: Nov 43 | #' - 12: Dec 44 | #' - `hr`: 45 | #' - 0: 12 AM 46 | #' - 1: 1 AM 47 | #' - 2: 2 AM 48 | #' - 3: 3 AM 49 | #' - 4: 4 AM 50 | #' - 5: 5 AM 51 | #' - 6: 6 AM 52 | #' - 7: 7 AM 53 | #' - 8: 8 AM 54 | #' - 9: 9 AM 55 | #' - 10: 10 AM 56 | #' - 11: 11 AM 57 | #' - 12: 12 PM 58 | #' - 13: 1 PM 59 | #' - 14: 2 PM 60 | #' - 15: 3 PM 61 | #' - 16: 4 PM 62 | #' - 17: 5 PM 63 | #' - 18: 6 PM 64 | #' - 19: 7 PM 65 | #' - 20: 8 PM 66 | #' - 21: 9 PM 67 | #' - 22: 10 PM 68 | #' - 23: 11 PM 69 | #' - `holiday`: 70 | #' - Whether the day is a holiday or not according to the [Human Resources page of DC](http://dchr.dc.gov/page/holiday-schedule).
71 | #' - 0: No 72 | #' - 1: Yes 73 | #' - `weekday`: 74 | #' - The day of the week 75 | #' - 0: Sunday 76 | #' - 1: Monday 77 | #' - 2: Tuesday 78 | #' - 3: Wednesday 79 | #' - 4: Thursday 80 | #' - 5: Friday 81 | #' - 6: Saturday 82 | #' - `workingday`: 83 | #' - Whether the day is a workday (Monday - Friday) 84 | #' - 0: No 85 | #' - 1: Yes 86 | #' - `weathersit`: 87 | #' - 1: Clear, Few clouds, Partly cloudy 88 | #' - 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 89 | #' - 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 90 | #' - 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 91 | #' - `temp`: 92 | #' - Normalized temperature in Celsius. 93 | #' - The values are derived via \eqn{\frac{(t-t_{min})}{(t_{max}-t_{min})}}{(t-t[min])/(t[max]-t[min])}, t_min=-8, t_max=+39 94 | #' - `atemp`: 95 | #' - Normalized feeling temperature in Celsius. 96 | #' - The values are derived via \eqn{\frac{(t-t_{min})}{(t_{max}-t_{min})}}{(t-t[min])/(t[max]-t[min])}, t_min=-16, t_max=+50 97 | #' - `hum`: 98 | #' - Normalized humidity. 99 | #' - The values are divided by 100 (max) 100 | #' - `windspeed`: 101 | #' - Normalized wind speed. 102 | #' - The values are divided by 67 (max) 103 | #' - `casual`: 104 | #' - Count of casual users 105 | #' - `registered`: 106 | #' - Count of registered users 107 | #' - `cnt`: 108 | #' - Count of total rental bikes including both casual and registered 109 | #' @source 110 | #' Hadi Fanaee-T 111 | #' 112 | #' Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto 113 | #' 114 | #' INESC Porto, Campus da FEUP 115 | #' 116 | #' Rua Dr.
Roberto Frias, 378 117 | #' 118 | #' 4200 - 465 Porto, Portugal 119 | #' @references 120 | #' Original Source: <http://capitalbikeshare.com/system-data> 121 | #' 122 | #' Weather Information: <http://www.freemeteo.com> 123 | #' 124 | #' Holiday Schedule: <http://dchr.dc.gov/page/holiday-schedule> 125 | "bike_sharing_daily" 126 | -------------------------------------------------------------------------------- /man/bike_sharing_daily.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/bike_sharing_daily_docs.R 3 | \docType{data} 4 | \name{bike_sharing_daily} 5 | \alias{bike_sharing_daily} 6 | \title{Bike Sharing (Daily) Data Set} 7 | \format{ 8 | A data frame with 731 observations on the following 16 variables. 9 | \itemize{ 10 | \item \code{instant}: Record index 11 | \item \code{dteday}: Date 12 | \item \code{season}: 13 | \itemize{ 14 | \item 1: Spring 15 | \item 2: Summer 16 | \item 3: Fall 17 | \item 4: Winter 18 | } 19 | \item \code{yr}: 20 | \itemize{ 21 | \item 0: 2011 22 | \item 1: 2012 23 | } 24 | \item \code{mnth}: 25 | \itemize{ 26 | \item 1: Jan 27 | \item 2: Feb 28 | \item 3: Mar 29 | \item 4: Apr 30 | \item 5: May 31 | \item 6: Jun 32 | \item 7: Jul 33 | \item 8: Aug 34 | \item 9: Sep 35 | \item 10: Oct 36 | \item 11: Nov 37 | \item 12: Dec 38 | } 39 | \item \code{hr}: 40 | \itemize{ 41 | \item 0: 12 AM 42 | \item 1: 1 AM 43 | \item 2: 2 AM 44 | \item 3: 3 AM 45 | \item 4: 4 AM 46 | \item 5: 5 AM 47 | \item 6: 6 AM 48 | \item 7: 7 AM 49 | \item 8: 8 AM 50 | \item 9: 9 AM 51 | \item 10: 10 AM 52 | \item 11: 11 AM 53 | \item 12: 12 PM 54 | \item 13: 1 PM 55 | \item 14: 2 PM 56 | \item 15: 3 PM 57 | \item 16: 4 PM 58 | \item 17: 5 PM 59 | \item 18: 6 PM 60 | \item 19: 7 PM 61 | \item 20: 8 PM 62 | \item 21: 9 PM 63 | \item 22: 10 PM 64 | \item 23: 11 PM 65 | } 66 | \item \code{holiday}: 67 | \itemize{ 68 | \item Whether the day is a holiday or not according to the \href{http://dchr.dc.gov/page/holiday-schedule}{Human Resources page of DC}.
69 | \item 0: No 70 | \item 1: Yes 71 | } 72 | \item \code{weekday}: 73 | \itemize{ 74 | \item The day of the week 75 | \item 0: Sunday 76 | \item 1: Monday 77 | \item 2: Tuesday 78 | \item 3: Wednesday 79 | \item 4: Thursday 80 | \item 5: Friday 81 | \item 6: Saturday 82 | } 83 | \item \code{workingday}: 84 | \itemize{ 85 | \item Whether the day is a workday (Monday - Friday) 86 | \item 0: No 87 | \item 1: Yes 88 | } 89 | \item \code{weathersit}: 90 | \itemize{ 91 | \item 1: Clear, Few clouds, Partly cloudy 92 | \item 2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 93 | \item 3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 94 | \item 4: Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog 95 | } 96 | \item \code{temp}: 97 | \itemize{ 98 | \item Normalized temperature in Celsius. 99 | \item The values are derived via \eqn{\frac{(t-t_{min})}{(t_{max}-t_{min})}}{(t-t[min])/(t[max]-t[min])}, t_min=-8, t_max=+39 100 | } 101 | \item \code{atemp}: 102 | \itemize{ 103 | \item Normalized feeling temperature in Celsius. 104 | \item The values are derived via \eqn{\frac{(t-t_{min})}{(t_{max}-t_{min})}}{(t-t[min])/(t[max]-t[min])}, t_min=-16, t_max=+50 105 | } 106 | \item \code{hum}: 107 | \itemize{ 108 | \item Normalized humidity. 109 | \item The values are divided by 100 (max) 110 | } 111 | \item \code{windspeed}: 112 | \itemize{ 113 | \item Normalized wind speed.
114 | \item The values are divided by 67 (max) 115 | } 116 | \item \code{casual}: 117 | \itemize{ 118 | \item Count of casual users 119 | } 120 | \item \code{registered}: 121 | \itemize{ 122 | \item Count of registered users 123 | } 124 | \item \code{cnt}: 125 | \itemize{ 126 | \item Count of total rental bikes including both casual and registered 127 | } 128 | } 129 | } 130 | \source{ 131 | Hadi Fanaee-T 132 | 133 | Laboratory of Artificial Intelligence and Decision Support (LIAAD), University of Porto 134 | 135 | INESC Porto, Campus da FEUP 136 | 137 | Rua Dr. Roberto Frias, 378 138 | 139 | 4200 - 465 Porto, Portugal 140 | } 141 | \usage{ 142 | bike_sharing_daily 143 | } 144 | \description{ 145 | Bike sharing systems are a new generation of traditional bike rentals where 146 | the whole process from membership, rental and return has become automatic. 147 | Through these systems, a user is able to easily rent a bike from a particular 148 | position and return it at another position. Currently, there are 149 | over 500 bike-sharing programs around the world, which are composed of over 150 | 500 thousand bicycles. Today, there exists great interest in these systems 151 | due to their important role in traffic, environmental and health issues. 152 | } 153 | \details{ 154 | Apart from interesting real world applications of bike sharing systems, the 155 | characteristics of data being generated by these systems make them attractive 156 | for research. As opposed to other transport services such as bus or subway, 157 | the duration of travel, departure and arrival position are explicitly recorded 158 | in these systems. This feature turns a bike sharing system into a virtual 159 | sensor network that can be used for sensing mobility in the city. Hence, it 160 | is expected that most of the important events in the city could be detected via 161 | monitoring these data.
162 | } 163 | \references{ 164 | Original Source: \url{http://capitalbikeshare.com/system-data} 165 | 166 | Weather Information: \url{http://www.freemeteo.com} 167 | 168 | Holiday Schedule: \url{http://dchr.dc.gov/page/holiday-schedule} 169 | } 170 | \keyword{datasets} 171 | --------------------------------------------------------------------------------