├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── ChangeLog ├── DESCRIPTION ├── NAMESPACE ├── R ├── RcppExports.R ├── functions.R ├── init.R └── rhelpers.R ├── README.md ├── cleanup ├── configure ├── demo ├── 00Index ├── rvw_bin.R ├── rvw_df.R ├── rvw_lda.R └── rvw_overview.R ├── docker ├── ci │ └── Dockerfile └── run │ └── Dockerfile ├── inst └── extdata │ ├── binary_train.vw │ ├── binary_valid.vw │ ├── lda_data.vw │ ├── multiclass_train.vw │ ├── multiclass_valid.vw │ └── ref_print.out ├── man ├── add_option.Rd ├── df2vw.Rd ├── print.vw.Rd ├── rvwgsoc-package.Rd ├── vwaudit.Rd ├── vwparams.Rd ├── vwsetup.Rd ├── vwtest.Rd └── vwtrain.Rd ├── rvw.Rproj ├── src ├── Makevars.in ├── RcppExports.cpp ├── extra │ ├── array_parameters.h │ ├── array_parameters_dense.h │ ├── error_reporting.h │ ├── example_predict.h │ ├── hash.h │ ├── no_label.h │ └── parser_helper.h ├── helpers.cpp ├── helpers.h ├── md5.c ├── md5.h └── rvw.cpp ├── tests ├── testthat.R └── testthat │ ├── test-cmdline.R │ ├── test-err.R │ ├── test-parser.R │ ├── test-utils.R │ └── test-vwsetup.R ├── tools └── r_configure.R └── vignettes └── introduction.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | docker 4 | .travis.yml 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | src/Makevars 9 | test_dir/* 10 | inst/doc 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Run Travis CI for R via Docker 2 | # 3 | # Made by Dirk Eddelbuettel in August 2018 and released under GPL (>=2) 4 | 5 | os: linux 6 | dist: trusty 7 | sudo: required 8 | services: docker 9 | 10 | 
env: 11 | global: 12 | - DOCKER_OPTS="--rm -ti -v $(pwd):/mnt -w /mnt" 13 | DOCKER_CNTR="rvowpalwabbit/ci" 14 | R_BLD_CHK_OPTS="--no-build-vignettes --no-manual" 15 | 16 | before_install: 17 | - docker pull ${DOCKER_CNTR} 18 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} r -p -e 'sessionInfo()' 19 | 20 | install: 21 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} R CMD build ${R_BLD_CHK_OPTS} . 22 | 23 | script: 24 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} R CMD check ${R_BLD_CHK_OPTS} rvw*.tar.gz 25 | 26 | after_failure: 27 | - ./run.sh dump_logs 28 | 29 | notifications: 30 | email: 31 | on_success: change 32 | on_failure: change 33 | 34 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2018-10-13 Dirk Eddelbuettel 2 | 3 | * docker/ci/Dockerfile: Install mltools (and r-cran-matrix) 4 | 5 | 2018-09-12 Dirk Eddelbuettel 6 | 7 | * README.md: Added brief Docker documentation 8 | 9 | * .travis.yml: Update container name 10 | 11 | 2018-09-11 Dirk Eddelbuettel 12 | 13 | * docker/run/Dockerfile: Added for deployment 14 | 15 | 2018-09-10 Dirk Eddelbuettel 16 | 17 | * .travis.yml: Updated container reference 18 | 19 | 2018-09-09 Dirk Eddelbuettel 20 | 21 | * docker/ci/Dockerfile: Add Dockerfile for Travis 22 | * .travis.yml: Use Docker container in tests 23 | 24 | ---- a lot omitted here, history from rvw-legacy below --- 25 | 26 | 2017-03-19 Dirk Eddelbuettel 27 | 28 | * DESCRIPTION (Version, Date): Roll minor version 29 | 30 | * R/dt2vw.R (dt2vw): Make a copy of the dataset; 31 | test for NA values in input data set 32 | 33 | 2017-03-12 Selim Raboudi 34 | 35 | * R/dt2vw.R (dt2vw): Allow for escaped variable names 36 | 37 | 2016-09-02 Dirk Eddelbuettel 38 | 39 | * demo/vw_example_4.R: Added Rborist 40 | 41 | 2016-08-30 Dirk Eddelbuettel 42 | 43 | * demo/vw_example_4.R: Added ranger 44 | 45 | 2016-08-29 Dirk Eddelbuettel 46 | 47 | * README.md: 
Added 48 | 49 | 2016-08-23 Dirk Eddelbuettel 50 | 51 | * demo/vw_example_4.R: Rewritten/extended, now with ctree and gbm 52 | 53 | * demo/vw_example_5.R: Plot predicted vs actual for regression example 54 | 55 | * R/vw.R (vw): Set an AUC fallback value 56 | 57 | 2016-08-22 Dirk Eddelbuettel 58 | 59 | * DESCRIPTION (License): Use dual license with GPL (>= 2) for my code, 60 | and BSD 3-clause for existing code 61 | 62 | * NAMESPACE: All importing now via importFrom() 63 | 64 | * demo/vw_example_4.R: Add library(pROC), add a legend to ROC plot 65 | * demo/00Index: Add vw_example4 66 | 67 | * R/plot.R (plotDensity): add utils::globalVariables() for R CMD check 68 | 69 | * R/dt2vw.R (dt2vw): If dependent variable is numeric, do not 70 | check factor levels 71 | 72 | * demo/vw_example_5.R: New regression example 73 | 74 | 2016-08-21 Dirk Eddelbuettel 75 | 76 | * DESCRIPTION (Suggests): Added caret (confusion matrix), ggplot2 77 | and earth (etitanic data) 78 | 79 | * R/vw.R (vw): Steps towards richer return objects, more arguments 80 | documented, all temp files now in current directory, new option to 81 | keep files 82 | * man/vw.Rd: Updated accordingly 83 | 84 | * R/plot.R: New simple density plot function 85 | * man/plotDensity.Rd: Documentation 86 | 87 | * demo/vw_example_4.R: Now plots ROC curve with (up to) three models 88 | 89 | 2016-08-20 Dirk Eddelbuettel 90 | 91 | * R/init.R (.getVW,.getPerf): Helper accessors for vw and perf 92 | binaries 93 | 94 | * R/vw.R (vw): Use vw and perf binaries stored in package 95 | environment, output simplification 96 | 97 | 2016-08-19 Dirk Eddelbuettel 98 | 99 | * R/vw.R (vw): Test for vw and perf binaries 100 | 101 | 2016-08-18 Dirk Eddelbuettel 102 | 103 | * R/init.R (.onAttach): Startup code to look for 'vw' and 'perf' 104 | 105 | * R/vw.R: Reindented and other whitespace changes, now uses '<-' 106 | assignments and TRUE/FALSE not T/F, removed @export/@import 107 | * R/dt2vw.R: Ditto 108 | 109 | 2016-08-17 Dirk Eddelbuettel 110 
| 111 | * DESCRIPTION (License): Changed to BSD_3_clause which is what 112 | vowpal wabbit itself uses 113 | * LICENSE: Ditto 114 | 115 | * demo/vw_example.R: Updated (old) example, no longer uses system, 116 | files in R's temp directory 117 | 118 | * demo/vw_example_2.R: Updated (old) example_2 119 | 120 | * inst/examples/bostonHousing/: New (simple) regression example 121 | 122 | 2016-08-16 Dirk Eddelbuettel 123 | 124 | * DESCRIPTION (Version): 0.1.1 now passes R CMD check 125 | 126 | * DESCRIPTION: Rewritten / adapted to current standards 127 | * NAMESPACE: Updated 128 | * LICENSE: Added per BSD_2_clause requirements 129 | * R/vw.R: Roxygen documentation update, minimal changes 130 | * R/dt2vw.R: Ditto 131 | * man/vw.Rd: Updated 132 | * man/dt2vw.Rd: Ditto 133 | 134 | * DESCRIPTION (Version): 0.1.0 committed 'as is' as initial commit 135 | 136 | 137 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rvw 2 | Type: Package 3 | Title: R Interface for Vowpal Wabbit 4 | Version: 0.6.0 5 | Date: 2018-08-20 6 | Author: Ivan Pavlov, Dirk Eddelbuettel, James J Balamuta 7 | Maintainer: Ivan Pavlov 8 | Description: R interface for Vowpal Wabbit using 'Rcpp' and the 'Vowpal Wabbit' library 'libvw' for the Google Summer of Code 2018. 
9 | License: GPL (>= 2) 10 | Imports: Rcpp (>= 0.12.16), tools, RApiSerialize, data.table 11 | LinkingTo: Rcpp, RApiSerialize 12 | Suggests: testthat, yaml, utils, 13 | knitr, 14 | rmarkdown, 15 | mltools, 16 | magrittr 17 | SystemRequirements: C++11, Vowpal Wabbit library 18 | RoxygenNote: 6.1.0 19 | VignetteBuilder: knitr 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(rvw, .registration=TRUE) 2 | exportPattern("^[[:alpha:]]+") 3 | importFrom(Rcpp, evalCpp) 4 | importFrom(stats, setNames) 5 | import(data.table) 6 | import(RApiSerialize) 7 | importFrom(tools, md5sum) 8 | S3method(print, vw) 9 | S3method(predict, vw) 10 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | .get_vw_version <- function() { 5 | .Call(`_rvw_get_vw_version`) 6 | } 7 | 8 | #'Train Vowpal Wabbit model 9 | #' 10 | #'vwtrain is an interface to train VW model from \code{\link{vwsetup}} 11 | #' 12 | #'@param vwmodel [vw] Model of vw class to train 13 | #'@param data [string or data.frame] Path to training data in .vw plain text format or data.frame. 14 | #'If \code{[data.frame]} then will be parsed using \code{df2vw} function. 15 | #'@param readable_model [string] Print trained model in human readable format ("hashed") 16 | #'and also with human readable features ("inverted") 17 | #'@param readable_model_path [string] Path to file where to save readable model. 18 | #'@param quiet [logical] Do not print anything to the console 19 | #'@param update_model [logical] Update an existing model, when training with new data. \code{FALSE} by default. 
20 | #'@param passes [int] Number of times the algorithm will cycle over the data (epochs). 21 | #'@param cache [bool] Use a cache for a data file. 22 | #'@param progress [int/real] Progress update frequency. int: additive, real: multiplicative 23 | #'@param namespaces [list or yaml file] For \code{df2vw}. Name of each namespace and 24 | #' each variable for each namespace can be an R list, or a YAML 25 | #' file example namespace with the IRIS database: namespaces = 26 | #' list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 27 | #' 'Petal.Width') this creates 2 namespaces (sepal 28 | #' and petal) containing the features defined by elements of these lists. 29 | #'@param keep_space [string vector] For \code{df2vw}. Keep spaces for these features 30 | #'Example:"FERRARI 4Si" 31 | #'With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 32 | #'Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature 33 | #'@param fixed [string vector] fixed parsing for these features 34 | #'Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 35 | #'Can be used for LDA ("word_1:2 word_2:3" will stay the same), 36 | #'but should be used carefully, because special characters can ruin final VW format file. 37 | #'@param targets [string or string vector] For \code{df2vw}. 38 | #'If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 39 | #'If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 40 | #'multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm. 41 | #'@param probabilities [string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm. 42 | #'@param weight [string] For \code{df2vw}. Weight (importance) of each line of the dataset. 
43 | #'@param base [string] For \code{df2vw}. base of each line of the dataset. Used for residual regression. 44 | #'@param tag [string] For \code{df2vw}. Tag of each line of the dataset. 45 | #'@param multiline [integer] number of labels (separate lines) for multilines examle 46 | #'@import tools 47 | #'@examples 48 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 49 | #'test_vwmodel <- vwsetup() 50 | #'vwtrain(test_vwmodel, data = ext_train_data) 51 | vwtrain <- function(vwmodel, data, readable_model = NULL, readable_model_path = "", quiet = FALSE, update_model = FALSE, passes = 1L, cache = FALSE, progress = NULL, namespaces = NULL, keep_space = NULL, fixed = NULL, targets = NULL, probabilities = NULL, weight = NULL, base = NULL, tag = NULL, multiline = NULL) { 52 | invisible(.Call(`_rvw_vwtrain`, vwmodel, data, readable_model, readable_model_path, quiet, update_model, passes, cache, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline)) 53 | } 54 | 55 | #'Compute predictions using Vowpal Wabbit model 56 | #' 57 | #'\code{vwtest} computes predictions using VW model from \code{\link{vwsetup}} 58 | #'\code{predict.vw} compute predictions using parser settings from \code{\link{vwtrain}} 59 | #' 60 | #'@param vwmodel [vw] Model of vw class to train. 61 | #'@param object Model of vw class to train for \code{predict.vw} 62 | #'@param data [string or data.frame] Path to training data in .vw plain text format or data.frame. 63 | #'If \code{[data.frame]} then will be parsed using \code{df2vw} function. 64 | #'@param probs_path [string] Path to file where to save predictions. 65 | #'@param full_probs [bool] Output full predictions in data.frame format. If not, force predictions into a single vector (default). 66 | #'@param readable_model [string] Print trained model in human readable format ("hashed") 67 | #'and also with human readable features ("inverted"). 
68 | #'@param readable_model_path [string] Path to file where to save readable model. 69 | #'@param quiet [bool] Do not print anything to the console. 70 | #'@param passes [int] Number of times the algorithm will cycle over the data (epochs). 71 | #'@param cache [bool] Use a cache for a data file. 72 | #'@param raw [bool] Output unnormalized predictions. Default is FALSE. 73 | #'@param progress [int/real] Progress update frequency. int: additive, real: multiplicative 74 | #'@param namespaces [list or yaml file] For \code{df2vw}. Name of each namespace and 75 | #' each variable for each namespace can be an R list, or a YAML 76 | #' file example namespace with the IRIS database: namespaces = 77 | #' list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 78 | #' 'Petal.Width') this creates 2 namespaces (sepal 79 | #' and petal) containing the features defined by elements of these lists. 80 | #'@param keep_space [string vector] For \code{df2vw}. Keep spaces for these features 81 | #'Example:"FERRARI 4Si" 82 | #'With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 83 | #'Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature 84 | #'@param fixed [string vector] fixed parsing for these features 85 | #'Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 86 | #'Can be used for LDA ("word_1:2 word_2:3" will stay the same), 87 | #'but should be used carefully, because special characters can ruin final VW format file. 88 | #'@param targets [string or string vector] For \code{df2vw}. 89 | #'If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 90 | #'If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 91 | #'multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm. 
92 | #'@param probabilities [string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm. 93 | #'@param weight [string] For \code{df2vw}. Weight (importance) of each line of the dataset. 94 | #'@param base [string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression. 95 | #'@param tag [string] For \code{df2vw}. Tag of each line of the dataset. 96 | #'@param multiline [integer] Number of labels (separate lines) for multilines example 97 | #'@param ... Parameters passed to \code{predict.vw} 98 | #'@return Numerical vector containing predictions 99 | #'@import tools 100 | #'@examples 101 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 102 | #'ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 103 | #'test_vwmodel <- vwsetup() 104 | #'vwtrain(test_vwmodel, data = ext_train_data) 105 | #'vwtest(test_vwmodel, data = ext_test_data) 106 | #'@rdname vwtest 107 | vwtest <- function(vwmodel, data, probs_path = "", full_probs = FALSE, readable_model = NULL, readable_model_path = "", quiet = FALSE, passes = 1L, cache = FALSE, raw = FALSE, progress = NULL, namespaces = NULL, keep_space = NULL, fixed = NULL, targets = NULL, probabilities = NULL, weight = NULL, base = NULL, tag = NULL, multiline = NULL) { 108 | .Call(`_rvw_vwtest`, vwmodel, data, probs_path, full_probs, readable_model, readable_model_path, quiet, passes, cache, raw, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline) 109 | } 110 | 111 | #'Audit Vowpal Wabbit model 112 | #' 113 | #'Get feature names and their model values. 114 | #' 115 | #'@param vwmodel Model of vw class to train 116 | #'@param quiet [bool] Do not print anything to the console. 
117 | #'@return Data.frame containing feature names, feature hashes and model values 118 | #'@examples 119 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 120 | #'test_vwmodel <- vwsetup() 121 | #'vwtrain(test_vwmodel, data = ext_train_data) 122 | #'vwaudit(test_vwmodel) 123 | vwaudit <- function(vwmodel, quiet = FALSE) { 124 | .Call(`_rvw_vwaudit`, vwmodel, quiet) 125 | } 126 | 127 | -------------------------------------------------------------------------------- /R/init.R: -------------------------------------------------------------------------------- 1 | 2 | .rvw_global <- new.env(parent=emptyenv()) 3 | 4 | .onAttach <- function(libname, pkgname) { 5 | # Initialise default/check lists 6 | general_check <- list(random_seed=0, 7 | ring_size=NA_real_, 8 | holdout_off=FALSE, 9 | holdout_period=10, 10 | holdout_after=0, 11 | early_terminate=3, 12 | loss_function=NA_character_, 13 | link=NA_character_, 14 | quantile_tau=0.5) 15 | feature_check <- list(bit_precision=18, 16 | quadratic=NA_character_, 17 | cubic=NA_character_, 18 | interactions=NA_character_, 19 | permutations=FALSE, 20 | leave_duplicate_interactions=FALSE, 21 | noconstant=FALSE, 22 | feature_limit=NA_character_, 23 | ngram=NA_character_, 24 | skips=NA_character_, 25 | hash=NA_character_, 26 | affix=NA_character_, 27 | spelling=NA_character_, 28 | interact=NA_character_) 29 | 30 | optimization_check <- list(learning_rate=0.5, 31 | initial_pass_length=NA_real_, 32 | l1=0, 33 | l2=0, 34 | no_bias_regularization=NA_character_, 35 | feature_mask=NA_character_, 36 | decay_learning_rate=1, 37 | initial_t=0, 38 | power_t=0.5, 39 | initial_weight=0, 40 | random_weights="off", 41 | normal_weights="off", 42 | truncated_normal_weights="off", 43 | sparse_weights=FALSE, 44 | input_feature_regularizer=NA_character_) 45 | 46 | if (.get_vw_version() == "8.6.1") { 47 | # Learning algorithm default/check lists 48 | sgd_check <- list(adaptive=TRUE, 49 | normalized=TRUE, 50 | 
invariant=TRUE, 51 | adax=FALSE, 52 | sparse_l2=0, 53 | l1_state=0, 54 | l2_state=1) 55 | bfgs_check <- list(conjugate_gradient=FALSE, 56 | hessian_on=FALSE, 57 | mem=15, 58 | termination=0.00100000005) 59 | ftrl_check <- list(ftrl_alpha=0.005, 60 | ftrl_beta=0.1) 61 | pistol_check <- list(ftrl_alpha=0.005, 62 | ftrl_beta=0.1) 63 | ksvm_check <- list(reprocess=1, 64 | kernel="linear", 65 | bandwidth=1.0, 66 | degree=2, 67 | lambda=-1) 68 | OjaNewton_check <- list(sketch_size=10, 69 | epoch_size=1, 70 | alpha=1, 71 | alpha_inverse=NA_real_, 72 | learning_rate_cnt=2, 73 | normalize="on", 74 | random_init="on") 75 | svrg_check <- list(stage_size=1) 76 | 77 | # Learning parameters/reductions default/check lists 78 | binary_check <- list(binary=TRUE) 79 | oaa_check <- list(num_classes=NA_real_, 80 | oaa_subsample=NA_real_ 81 | # probabilities=FALSE, 82 | # scores=FALSE 83 | ) 84 | ect_check <- list(num_classes=NA_real_) 85 | csoaa_check <- list(num_classes=NA_real_, 86 | csoaa_ldf="" 87 | # csoaa_rank=FALSE, 88 | # probabilities=FALSE 89 | ) 90 | wap_check <- list(num_classes=NA_real_, 91 | wap_ldf="" 92 | # csoaa_rank=FALSE, 93 | # probabilities=FALSE 94 | ) 95 | log_multi_check <- list(num_classes=NA_real_, 96 | no_progress=FALSE, 97 | swap_resistance=4) 98 | recall_tree_check <- list(num_classes=NA_real_, 99 | max_candidates=NA_real_, 100 | bern_hyper=1, 101 | max_depth=NA_real_, 102 | node_only=0, 103 | randomized_routing=0) 104 | lda_check <- list(num_topics=NA_real_, 105 | lda_alpha=0.100000001, 106 | lda_rho=0.100000001, 107 | lda_D=10000, 108 | lda_epsilon=0.00100000005, 109 | math_mode=NA_character_, 110 | minibatch=1, 111 | metrics=0) 112 | multilabel_oaa_check <- list(num_labels=NA_real_) 113 | classweight_check <- list(class_multiplier=NA_real_) 114 | new_mf_check <- list(rank=NA_real_) 115 | lrq_check <- list(features=NA_character_, 116 | lrqdropout=FALSE) 117 | stage_poly_check <- list(sched_exponent = 1.0, 118 | batch_sz = 1000, 119 | batch_sz_no_doubling 
= TRUE) 120 | bootstrap_check <- list(num_rounds=NA_real_, 121 | bs_type="mean") 122 | autolink_check <- list(degree=2) 123 | replay_check <- list(level="b", 124 | buffer=100, 125 | count=1) 126 | explore_eval_check <- list(explore_eval=TRUE, 127 | multiplier=NA_real_) 128 | cb_check<- list(num_costs=NA_real_, 129 | cb_type="dr", 130 | eval=FALSE, 131 | rank_all=FALSE, 132 | no_predict=FALSE) 133 | cb_explore_check <- list(num_actions=NA_real_, 134 | explore_type="epsilon", 135 | explore_arg=0.05, 136 | psi=1, 137 | nounif=FALSE, 138 | mellowness=0.1, 139 | greedify=FALSE, 140 | lambda=-1, 141 | cb_min_cost=0, 142 | cb_max_cost=1, 143 | first_only=FALSE) 144 | cbify_check <- list(num_classes=NA_real_, 145 | cbify_cs=FALSE, 146 | loss0=0, 147 | loss1=1) 148 | multiworld_test_check <- list(features=NA_character_, 149 | learn=NA_real_, 150 | exclude_eval=FALSE) 151 | nn_check <- list(num_hidden=NA_real_, 152 | inpass=FALSE, 153 | multitask=FALSE, 154 | dropout=FALSE, 155 | meanfield=FALSE) 156 | topk_check <- list(num_k=NA_real_) 157 | search_check <- list(id=NA_real_, 158 | search_task=NA_character_, 159 | search_interpolation=NA_character_, 160 | search_rollout=NA_character_, 161 | search_rollin=NA_character_, 162 | search_passes_per_policy=1, 163 | search_beta=0.5, 164 | search_alpha=1e-10, 165 | search_total_nb_policies=NA_real_, 166 | search_trained_nb_policies=NA_real_, 167 | search_allowed_transitions=NA_character_, 168 | search_subsample_time=NA_real_, 169 | search_neighbor_features=NA_character_, 170 | search_rollout_num_steps=NA_real_, 171 | search_history_length=1, 172 | search_no_caching=FALSE, 173 | search_xv=FALSE, 174 | search_perturb_oracle=0, 175 | search_linear_ordering=FALSE, 176 | search_active_verify=NA_real_, 177 | search_save_every_k_runs=NA_real_) 178 | boosting_check <- list(num_learners=NA_real_, 179 | gamma=0.100000001, 180 | alg="BBM") 181 | marginal_check <- list(ids=NA_character_, 182 | initial_denominator=1, 183 | initial_numerator=0.5, 
184 | compete=FALSE, 185 | update_before_learn=0, 186 | unweighted_marginals=0, 187 | decay=0) 188 | check_lists <- list(general_check=general_check, feature_check=feature_check, optimization_check=optimization_check, 189 | sgd_check=sgd_check, bfgs_check=bfgs_check, ftrl_check=ftrl_check, pistol_check=pistol_check, ksvm_check=ksvm_check, 190 | OjaNewton_check=OjaNewton_check, svrg_check=svrg_check, 191 | binary_check=binary_check, oaa_check=oaa_check, ect_check=ect_check, csoaa_check=csoaa_check, wap_check=wap_check, 192 | log_multi_check=log_multi_check, recall_tree_check=recall_tree_check, lda_check=lda_check, multilabel_oaa_check=multilabel_oaa_check, 193 | new_mf_check=new_mf_check, classweight_check=classweight_check, lrq_check=lrq_check, stage_poly_check=stage_poly_check, 194 | bootstrap_check=bootstrap_check, autolink_check=autolink_check, replay_check=replay_check, 195 | cb_check=cb_check, explore_eval_check=explore_eval_check, cb_explore_check=cb_explore_check, cbify_check=cbify_check, 196 | multiworld_test_check=multiworld_test_check, nn_check=nn_check, topk_check=topk_check, search_check=search_check, 197 | boosting_check=boosting_check, marginal_check=marginal_check) 198 | } else { 199 | stop("Vowpal Wabbit v8.6.1 or newer required") 200 | } 201 | 202 | flatten_check_lists <- .flatten(check_lists) 203 | 204 | assign("check_lists", check_lists, envir=.rvw_global) 205 | assign("flatten_check_lists", flatten_check_lists, envir=.rvw_global) 206 | } 207 | -------------------------------------------------------------------------------- /R/rhelpers.R: -------------------------------------------------------------------------------- 1 | # Helper functions 2 | 3 | .sprintf2 <- function(fmt, ...) { 4 | MAX_NVAL <- 99L 5 | args <- c(...) 
6 | if (length(args) <= MAX_NVAL) 7 | return(do.call(sprintf, c(list(fmt), args))) 8 | stopifnot(length(fmt) == 1L) 9 | not_a_spec_at <- gregexpr("%%", fmt, fixed=TRUE)[[1L]] 10 | not_a_spec_at <- c(not_a_spec_at, not_a_spec_at + 1L) 11 | spec_at <- setdiff(gregexpr("%", fmt, fixed=TRUE)[[1L]], not_a_spec_at) 12 | nspec <- length(spec_at) 13 | if (length(args) < nspec) 14 | stop("too few arguments") 15 | if (nspec <= MAX_NVAL) { 16 | break_points <- integer(0) 17 | } else { 18 | break_points <- seq(MAX_NVAL + 1L, nspec, by=MAX_NVAL) 19 | } 20 | break_from <- c(1L, break_points) 21 | break_to <- c(break_points - 1L, nspec) 22 | fmt_break_at <- spec_at[break_points] 23 | fmt_chunks <- substr(rep.int(fmt, length(fmt_break_at) + 1L), 24 | c(1L, fmt_break_at), 25 | c(fmt_break_at - 1L, nchar(fmt))) 26 | ans_chunks <- mapply( 27 | function(fmt_chunk, from, to) 28 | do.call(sprintf, c(list(fmt_chunk), args[from:to])), 29 | fmt_chunks, 30 | break_from, 31 | break_to 32 | ) 33 | paste(apply(ans_chunks,1, paste, collapse = ""), collapse = "\n") 34 | } 35 | 36 | .check_parameters <- function(params) { 37 | # Helper function to check parameters 38 | check_param_values <- function(input, check) { 39 | bool_check_names <- names(input) %in% names(check) 40 | if(!all(bool_check_names)) { 41 | error_msg <- paste0("Wrong argument names: ", 42 | paste0(names(input)[!bool_check_names], collapse = ", ")) 43 | stop(error_msg, call. 
= FALSE) 44 | } 45 | 46 | valid_input <- check 47 | bool_check_values <- sapply(names(input), FUN = function(i) { 48 | # First check if types of input argument values are correct (same as of check lists) 49 | bool_check <- (typeof(input[[i]]) == typeof(check[[i]])) | (is.na(input[[i]])) 50 | # Replace default/check values with values from input 51 | valid_input[[i]] <<- input[[i]] 52 | # And return bool values to raise errors 53 | bool_check 54 | }) 55 | if(!all(bool_check_values)) { 56 | error_msg <- paste0("Wrong argument values: ", 57 | paste0(names(input)[!bool_check_values], collapse = ", ")) 58 | stop(error_msg, call. = FALSE) 59 | } 60 | 61 | # Return check with modified values 62 | return(valid_input) 63 | } 64 | 65 | # Create default parameters list if no parameters provided 66 | # Else check parameters and return validated parameters 67 | if(length(params$options) != 0) { 68 | valid_options <- list() 69 | params$options <- sapply(names(params$options), function(option_name) { 70 | option_check_type <- .rvw_global[["check_lists"]][[paste0(option_name, "_check")]] 71 | valid_option <- check_param_values( 72 | input = params$options[[option_name]], 73 | check = option_check_type 74 | ) 75 | # Check for missing first argument value in option parameters 76 | if(is.na(valid_option[[1]])){ 77 | error_msg <- paste0("Missing value for argument: ", 78 | names(valid_option)[[1]], 79 | "\nFor option: ", 80 | option_name, "\n" 81 | ) 82 | stop(error_msg, call. 
= FALSE) 83 | } 84 | valid_option <- setNames(list(valid_option), option_name) 85 | valid_options <<- c(valid_options, valid_option) 86 | }) 87 | params$options <- valid_options 88 | 89 | } 90 | if(length(params$general_params) == 0) { 91 | params$general_params <- .rvw_global[["check_lists"]][["general_check"]] 92 | } else { 93 | params$general_params <- check_param_values( 94 | input = params$general_params, 95 | # input = c(list(cache=params$cache), params$general_params), 96 | check = .rvw_global[["check_lists"]][["general_check"]] 97 | ) 98 | } 99 | if(length(params$feature_params) == 0) { 100 | params$feature_params <- .rvw_global[["check_lists"]][["feature_check"]] 101 | } else { 102 | params$feature_params <- check_param_values( 103 | input = params$feature_params, 104 | check = .rvw_global[["check_lists"]][["feature_check"]] 105 | ) 106 | } 107 | if(length(params$optimization_params) == 0) { 108 | algorithm_parameters <- .rvw_global[["check_lists"]][[paste0(params$algorithm, "_check")]] 109 | params$optimization_params <- c(algorithm_parameters, .rvw_global[["check_lists"]][["optimization_check"]]) 110 | } else { 111 | algorithm_check_type <- .rvw_global[["check_lists"]][[paste0(params$algorithm, "_check")]] 112 | params$optimization_params <- check_param_values( 113 | input = params$optimization_params, 114 | check = c(algorithm_check_type, .rvw_global[["check_lists"]][["optimization_check"]]) 115 | ) 116 | } 117 | 118 | # # Cache should be created, if passes > 1 119 | # if(params$general_params$passes > 1) { 120 | # params$general_params$cache <- TRUE 121 | # } 122 | # Return validated parameters 123 | return(list(algorithm = params$algorithm, 124 | general_params = params$general_params, 125 | feature_params = params$feature_params, 126 | optimization_params = params$optimization_params, 127 | options = params$options)) 128 | } 129 | 130 | .create_parameters_string <- function(params) { 131 | params_to_strings <- function(i) { 132 | 
if(is.na(flat_params[[i]]) || isTRUE(flat_params[[i]] == .rvw_global[["flatten_check_lists"]][[i]])) { 133 | return("") 134 | }; 135 | if(is.logical(flat_params[[i]][[1]]) & flat_params[[i]][[1]] == TRUE) { 136 | return(paste0("--",i)) 137 | }; 138 | if(is.logical(flat_params[[i]][[1]]) & flat_params[[i]][[1]] == FALSE) { 139 | return("") 140 | } else { 141 | return(paste0("--",i," ",flat_params[[i]])) 142 | } 143 | } 144 | 145 | temp_params <- params 146 | 147 | # Options exceptions 148 | exceptions_params <- c() 149 | # cb exception 150 | if(isTRUE(temp_params$options[["cb"]][1] == 0)){ 151 | # Use --cb_adf if num_costs == 0 152 | exceptions_params <- c( 153 | exceptions_params, 154 | paste0("--cb_adf") 155 | ) 156 | temp_params$options[["cb"]][1] <- NA 157 | } 158 | # cb_explore exception 159 | if("cb_explore" %in% names(temp_params$options)) { 160 | # Use --cb_explore_adf if num_actions == 0 161 | if(isTRUE(temp_params$options[["cb_explore"]][1] == 0)){ 162 | exceptions_params <- c( 163 | exceptions_params, 164 | paste0("--cb_explore_adf") 165 | ) 166 | temp_params$options[["cb_explore"]][1] <- NA 167 | } 168 | # create exploration type string like "--first arg" 169 | exceptions_params <- c( 170 | exceptions_params, 171 | paste0("--", temp_params$options[["cb_explore"]][2], " ", temp_params$options[["cb_explore"]][3]) 172 | ) 173 | temp_params$options[["cb_explore"]][2] <- NA 174 | temp_params$options[["cb_explore"]][3] <- NA 175 | } 176 | # Experience Replay exception 177 | if("replay" %in% names(temp_params$options)) { 178 | # --replay_c 100, --replay_m 100, --replay_b 100 like exception 179 | exceptions_params <- c( 180 | exceptions_params, 181 | paste0("--replay_", temp_params$options[["replay"]][1], " ", temp_params$options[["replay"]][2]), 182 | paste0("--replay_", temp_params$options[["replay"]][1], "_count ", temp_params$options[["replay"]][3]) 183 | ) 184 | temp_params$options[["replay"]][1] <- NA 185 | temp_params$options[["replay"]][2] <- NA 186 | 
temp_params$options[["replay"]][3] <- NA 187 | } 188 | exceptions_params <- Filter(exceptions_params, f = function(x) nchar(x) > 0) 189 | exceptions_string <- paste0(exceptions_params, collapse = " ") 190 | 191 | 192 | # Convert different options into string with CL arguments 193 | options_params <- sapply(names(temp_params$options), function(option_name) { 194 | if(is.na(temp_params$options[[option_name]][1])) { 195 | tmp <- "" 196 | } else if (option_name == names(temp_params$options[[option_name]])[1]) { 197 | tmp <- paste0("--", option_name) 198 | } else { 199 | tmp <- paste0("--", option_name, " ", temp_params$options[[option_name]][1]) 200 | } 201 | temp_params$options[[option_name]][1] <<- NA 202 | tmp 203 | }) 204 | 205 | # temp_params$options <- list() 206 | # Filter empty strings 207 | options_params <- Filter(options_params, f = function(x) nchar(x) > 0) 208 | options_string <- paste0(options_params, collapse = " ") 209 | 210 | # Flatten option 211 | flat_params <- .flatten(temp_params$options) 212 | 213 | # Exception for "--math_mode" -> "--math-mode" 214 | if("math_mode" %in% names(flat_params)) { 215 | names(flat_params) <- gsub(pattern = "math_mode", replacement = "math-mode", x = names(flat_params)) 216 | } 217 | 218 | # Convert option parameters list to "--arg _" list 219 | flat_option_params <- sapply(names(flat_params), FUN = params_to_strings) 220 | # Filter empty strings 221 | flat_option_params <- Filter(flat_option_params, f = function(x) nchar(x) > 0) 222 | # Create string "--passes 0 --bit_precision 18" for parser 223 | option_params_string <- paste0(flat_option_params, collapse = " ") 224 | 225 | temp_params$options <- list() 226 | 227 | #Set learning mode string argument 228 | algorithm_string <- switch (temp_params$algorithm, 229 | sgd = {tmp <- ""; tmp}, 230 | bfgs = {tmp <- "--bfgs"; tmp}, 231 | ftrl = {tmp <- "--ftrl"; tmp}, 232 | pistol = {tmp <- "--pistol"; tmp}, 233 | ksvm = {tmp <- "--ksvm"; tmp}, 234 | OjaNewton = {tmp <- 
"--OjaNewton"; tmp}, 235 | svrg = {tmp <- "--svrg"; tmp} 236 | ) 237 | # # Disable cache here, because it's checked in vwtrain and vwtest 238 | # if (temp_params$general_params$cache) { 239 | # temp_params$general_params$cache <- NA 240 | # } 241 | # Flatten list 242 | flat_params <- .flatten(temp_params[-c(1)]) 243 | # Convert parameters list to "--arg _" list 244 | flat_params <- sapply(names(flat_params), FUN = params_to_strings) 245 | # Filter empty strings 246 | flat_params <- Filter(flat_params, f = function(x) nchar(x) > 0) 247 | # Create string "--passes 0 --bit_precision 18" for parser 248 | parameters_string <- paste0(flat_params, collapse = " ") 249 | final_params <- c(algorithm_string, parameters_string, exceptions_string, options_string, option_params_string) 250 | # parameters_string <- paste(algorithm_string, parameters_string, exceptions_string, options_string, option_params_string, sep = " ") 251 | parameters_string <- paste0( 252 | Filter(final_params,f = function(x) nchar(x) > 0), 253 | collapse = " " 254 | ) 255 | 256 | return(parameters_string) 257 | } 258 | 259 | # Flatten parameters list 260 | .flatten <- function(x) { 261 | repeat { 262 | if(!any(vapply(x,is.list, logical(1)))) return(x) 263 | x <- Reduce(c, x) 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/rvw-org/rvw.svg?branch=master)](https://travis-ci.org/rvw-org/rvw) 2 | 3 | ## rvw 4 | 5 | Development of **rvw** package started as R Vowpal Wabbit (Google Summer of Code 2018) [project](https://summerofcode.withgoogle.com/projects/#5511455416254464). 6 | 7 | **Vowpal Wabbit** is an online machine learning system that is known for its speed and scalability and is widely used in research and industry. 8 | 9 | This package aims to bring its functionality to **R**. 
10 | 11 | ## Installation 12 | 13 | ### From Source 14 | 15 | First, you have to install **Vowpal Wabbit** itself [here](https://github.com/JohnLangford/vowpal_wabbit#getting-the-code). 16 | 17 | Next, once the required library is installed, you can install the **rvw** package using `remotes`: 18 | 19 | ```r 20 | install.packages("remotes") ## or devtools 21 | remotes::install_github("rvw-org/rvw") 22 | ``` 23 | 24 | or (in case you have the package sources) via a standard `R CMD INSTALL .`. 25 | 26 | This installation from source currently works best on Linux; on macOS you 27 | have to locally compile using the R-compatible toolchain (and not the 28 | brew-based one as the Vowpal Wabbit documentation suggests). 29 | 30 | There is one possible shortcut: you can use the Debian/Ubuntu package as our 31 | Docker container does: `sudo apt-get install libvw-dev vowpal-wabbit 32 | libboost-program-options-dev`. 33 | 34 | 35 | ### Using Docker 36 | 37 | We use [Docker](https://www.docker.com) for the [Travis CI](https://www.travis-ci.org) tests, and also provide a container 38 | for deployment. Do 39 | 40 | ```sh 41 | docker pull rvowpalwabbit/run ## one time 42 | docker run --rm -ti rvowpalwabbit/run bash ## launch container 43 | ``` 44 | 45 | to start the container with `rvw` installed. See the 46 | [Boettiger and Eddelbuettel RJournal paper](https://journal.r-project.org/archive/2017/RJ-2017-065/index.html) 47 | for more on Docker for R, and the [Rocker Project](https://www.rocker-project.org) used here. 
48 | 49 | ## Getting Started 50 | [Introduction](https://github.com/rvw-org/rvw/wiki/Introduction) 51 | 52 | Examples: 53 | 54 | * [Binary classification](https://github.com/rvw-org/rvw/wiki/Binary-classification) 55 | * [CSOAA multiclass classification](https://github.com/rvw-org/rvw/wiki/CSOAA-multiclass-classification) 56 | * [Topic modeling with Latent Dirichlet Allocation (LDA)](https://github.com/rvw-org/rvw/wiki/Topic-modeling-with-Latent-Dirichlet-Allocation-(LDA)) 57 | 58 | 59 | ## Example 60 | 61 | In this example we will try to predict age groups (based on number of abalone shell rings) from physical measurements. We will use Abalone Data Set from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Abalone). 62 | 63 | First we prepare our data: 64 | 65 | ```r 66 | library(mltools) 67 | library(rvw) 68 | 69 | set.seed(1) 70 | aburl = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' 71 | abnames = c('sex','length','diameter','height','weight.w','weight.s','weight.v','weight.sh','rings') 72 | abalone = read.table(aburl, header = F , sep = ',', col.names = abnames) 73 | data_full <- abalone 74 | 75 | # Split number of rings into groups with equal (as possible) number of observations 76 | data_full$group <- bin_data(data_full$rings, bins=3, binType = "quantile") 77 | group_lvls <- levels(data_full$group) 78 | levels(data_full$group) <- c(1, 2, 3) 79 | 80 | # Prepare indices to split data 81 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 82 | # Split data into train and test subsets 83 | df_train <- data_full[ind_train,] 84 | df_test <- data_full[-ind_train,] 85 | ``` 86 | 87 | Then we set up a *Vowpal Wabbit* model: 88 | ```r 89 | vwmodel <- vwsetup(option = "ect", num_classes = 3) 90 | ``` 91 | 92 | * *option = "ect"* - we will use [Error Correcting Tournament](https://github.com/JohnLangford/vowpal_wabbit/wiki/Error-Correcting-Tournament-(ect)-multi-class-example) algorithm to train 
multiclass classification model; 93 | * *num_classes = 3* - number of classes in our data; 94 | 95 | Now we start training: 96 | 97 | ```r 98 | vwtrain(vwmodel, data = df_train, 99 | namespaces = list(NS1 = list("sex", "rings"), 100 | NS2 = list("weight.w","weight.s","weight.v","weight.sh", "diameter", "length", "height")), 101 | targets = "group" 102 | ) 103 | ``` 104 | And we get: `average loss = 0.278060` 105 | 106 | * *namespaces* - We will split our features into two namespaces `NS1` and `NS2`; 107 | * *targets = "group"* - ground truth labels; 108 | 109 | 110 | And finally compute predictions using trained model: 111 | 112 | ```r 113 | predict.vw(vwmodel, data = df_test) 114 | ``` 115 | Here we get: `average loss = 0.221292` 116 | 117 | We can add more learning algorithms to our model. For example we want to use *boosting* algorithm with 100 "weak" learners. Then we will just add this option to our model and train again: 118 | 119 | ```r 120 | vwmodel <- add_option(vwmodel, option = "boosting", num_learners=100) 121 | 122 | vwtrain(vwmodel, data = df_train, 123 | namespaces = list(NS1 = list("sex", "rings"), 124 | NS2 = list("weight.w","weight.s","weight.v","weight.sh", "diameter", "length", "height")), 125 | targets = "group" 126 | ) 127 | ``` 128 | We get: `average loss = 0.229273` 129 | 130 | And compute predictions: 131 | 132 | ```r 133 | predict.vw(vwmodel, data = df_test) 134 | ``` 135 | Finally we get: `average loss = 0.081340` 136 | 137 | In order to inspect parameters of our model we can simply print it: 138 | 139 | ```r 140 | vwmodel 141 | ``` 142 | 143 | ``` 144 | Vowpal Wabbit model 145 | Learning algorithm: sgd 146 | Working directory: /var/folders/yx/6949djdd3yb4qsw7x_95wfjr0000gn/T//RtmpjO3DD1 147 | Model file: /var/folders/yx/6949djdd3yb4qsw7x_95wfjr0000gn/T//RtmpjO3DD1/vw_1534253637_mdl.vw 148 | General parameters: 149 | random_seed : 0 150 | ring_size : Not defined 151 | holdout_off : FALSE 152 | holdout_period : 10 153 | holdout_after : 0 
154 | early_terminate : 3 155 | loss_function : squared 156 | link : identity 157 | quantile_tau : 0.5 158 | Feature parameters: 159 | bit_precision : 18 160 | quadratic : Not defined 161 | cubic : Not defined 162 | interactions : Not defined 163 | permutations : FALSE 164 | leave_duplicate_interactions : FALSE 165 | noconstant : FALSE 166 | feature_limit : Not defined 167 | ngram : Not defined 168 | skips : Not defined 169 | hash : Not defined 170 | affix : Not defined 171 | spelling : Not defined 172 | Learning algorithms / Reductions: 173 | ect : 174 | num_classes : 3 175 | boosting : 176 | num_learners : 100 177 | gamma : 0.1 178 | alg : BBM 179 | Optimization parameters: 180 | adaptive : TRUE 181 | normalized : TRUE 182 | invariant : TRUE 183 | adax : FALSE 184 | sparse_l2 : 0 185 | l1_state : 0 186 | l2_state : 1 187 | learning_rate : 0.5 188 | initial_pass_length : Not defined 189 | l1 : 0 190 | l2 : 0 191 | no_bias_regularization : Not defined 192 | feature_mask : Not defined 193 | decay_learning_rate : 1 194 | initial_t : 0 195 | power_t : 0.5 196 | initial_weight : 0 197 | random_weights : Not defined 198 | normal_weights : Not defined 199 | truncated_normal_weights : Not defined 200 | sparse_weights : FALSE 201 | input_feature_regularizer : Not defined 202 | Model evaluation. Training: 203 | num_examples : 3341 204 | weighted_example_sum : 3341 205 | weighted_label_sum : 0 206 | avg_loss : 0.2292727 207 | total_feature : 33408 208 | Model evaluation. 
Testing: 209 | num_examples : 836 210 | weighted_example_sum : 836 211 | weighted_label_sum : 0 212 | avg_loss : 0.08133971 213 | total_feature : 8360 214 | ``` 215 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -f src/Makevars src/*.o src/*.so src/*.dylib 4 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | : ${R_HOME:=`R RHOME`} 4 | 5 | cmd="${R_HOME}/bin/Rscript ${PWD}/tools/r_configure.R" 6 | 7 | $cmd 8 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | rvw_overview Basic package usage 2 | rvw_df Using data.frame as input data for CSOAA multiclass classification 3 | rvw_lda LDA example 4 | rvw_bin Simple binary classification 5 | -------------------------------------------------------------------------------- /demo/rvw_bin.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | library(mlbench) # For a dataset 3 | 4 | # First, switch to a temporary directory 5 | curr_dir <- getwd() 6 | setwd(tempdir()) 7 | 8 | set.seed(42) 9 | 10 | # We will try to identify benign or malignant class of a tumour using its histology characteristics. 
11 | data("BreastCancer", package = "mlbench") 12 | data_full <- BreastCancer 13 | 14 | # First, start with data preprocessing 15 | data_full <- data_full[complete.cases(data_full),] 16 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 17 | 18 | str(data_full) 19 | summary(data_full) 20 | # We can see that "benign" cases appear more often in our dataset 21 | # This will be used to set up a baseline model 22 | 23 | data_full <- data_full[,-1] 24 | data_full$Class <- ifelse(data_full$Class == "malignant", 1, -1) 25 | 26 | data_train <- data_full[ind_train,] 27 | data_test <- data_full[-ind_train,] 28 | 29 | # Our baseline model simply reports every tumour class as benign 30 | baseline_pred <- rep(-1, length(data_test$Class)) 31 | 32 | # Accuracy for binary classification case 33 | acc_prc <- function(y_pred, y_true){sum(y_pred == y_true) / length(y_pred) * 100} 34 | 35 | acc_prc(data_test$Class, baseline_pred) 36 | # With our baseline model, we get an accuracy of around 65% 37 | 38 | # Now we a ready to use Vowpal Wabbit models 39 | # Setup model 40 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 41 | option = "binary") # Convert predictions to {-1,+1} 42 | 43 | # Basic training and testing 44 | vwtrain(vwmodel = test_vwmodel, 45 | data = data_train, 46 | passes = 10, 47 | targets = "Class") 48 | 49 | vw_output <- vwtest(vwmodel = test_vwmodel, data = data_test) 50 | 51 | acc_prc(data_test$Class, vw_output) 52 | # Now we get much better results with an accuracy of around 97% 53 | 54 | # Switch back 55 | setwd(curr_dir) 56 | -------------------------------------------------------------------------------- /demo/rvw_df.R: -------------------------------------------------------------------------------- 1 | library(mltools) 2 | library(rvw) 3 | 4 | curr_dir <- getwd() 5 | setwd(tempdir()) 6 | # We will use abalone dataset and will try to predict age groups (based on number of abalone shell rings) from physical measurements 7 | aburl = 
'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' 8 | abnames = c('sex','length','diameter','height','weight.w','weight.s','weight.v','weight.sh','rings') 9 | abalone = read.table(aburl, header = F , sep = ',', col.names = abnames) 10 | data_full <- abalone 11 | 12 | # Split number of rings into groups with equal (as possible) number of observations 13 | data_full$group <- bin_data(data_full$rings, bins=3, binType = "quantile") 14 | group_lvls <- levels(data_full$group) 15 | levels(data_full$group) <- c(1, 2, 3) 16 | # Prepare variables for CSOAA algorithm 17 | data_full$cost_class_1 <- ifelse(data_full$group == 1, 0.8, 0.1) 18 | data_full$cost_class_2 <- ifelse(data_full$group == 2, 0.8, 0.1) 19 | data_full$cost_class_3 <- ifelse(data_full$group == 3, 0.8, 0.1) 20 | data_full$rings <- factor(data_full$rings) 21 | data_full$tag <- sapply(1:nrow(data_full), function(x) paste0("ex",x)) 22 | # Prepare indices to split data 23 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 24 | # Split data into train and test subsets 25 | df_train <- data_full[ind_train,] 26 | df_test <- data_full[-ind_train,] 27 | 28 | vwmodel <- vwsetup(dir = "./", 29 | option = "csoaa", num_classes = 3) 30 | 31 | vwtrain(vwmodel, data = df_train, 32 | namespaces = list(NS1 = list("sex", "rings"), NS2 = list("diameter", "length", "height")), 33 | targets = c("cost_class_1", "cost_class_2", "cost_class_3"), tag = "tag" 34 | ) 35 | vwpreds <- predict(vwmodel, data = df_test, full_probs = T) 36 | 37 | head(vwpreds) 38 | 39 | setwd(curr_dir) 40 | 41 | -------------------------------------------------------------------------------- /demo/rvw_lda.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | 3 | # In this demo, we will take a look at the topic modeling problem. 4 | # For this, we will use Latent Dirichlet Allocation (LDA) method implemented in Vowpal Wabbit (VW). 
5 | 6 | # First, switch to a temporary directory. 7 | curr_dir <- getwd() 8 | setwd(tempdir()) 9 | 10 | # Here we prepare our dataset. We consider the WebKB dataset. 11 | # It consists of web pages collected from various Universities and manually classified into seven different classes (topics). 12 | # Original reference: The 4 Universities Data Set 13 | # http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/ 14 | # We use a preprocessed version of this dataset from Ana Cardoso-Cachopo PhD thesis: 15 | # http://ana.cachopo.org/datasets-for-single-label-text-categorization 16 | data_url <- "http://ana.cachopo.org/datasets-for-single-label-text-categorization/webkb-test-stemmed.txt" 17 | lda_data <- read.delim(file = data_url, header = F, stringsAsFactors = F) 18 | names(lda_data) <- c("topic", "text") 19 | 20 | # Clear out empty lines. 21 | lda_data <- lda_data[!(lda_data$text == ""), ] 22 | # Prepare a vocabulary from all documents. 23 | lda_vocab <- sort(unique(unlist(strsplit(lda_data$text, " ")))) 24 | 25 | 26 | # In order to use VW LDA algorithm, we have to convert plain text to "word:word_count word:word_count ..." format. 27 | # Also, we replace the words with their indexes in the vocabulary. 28 | # This is needed if we want to easily decode feature hashes later and show topics in a human-readable format. 29 | lda_data$features <- sapply(lda_data$text, function(x) { 30 | splitted_words <- unlist(strsplit(x, " ")) 31 | counted_words <- aggregate(data.frame(count=splitted_words), list(word=splitted_words), length) 32 | res_str <- paste0(apply(counted_words, 1, function(x){ 33 | paste0( (which(lda_vocab == x[["word"]]) - 1) , ":", as.numeric(x[["count"]])) 34 | # Or use this if no replacement with index is needed: 35 | # paste0(x[["word"]], ":", as.numeric(x[["count"]])) 36 | }), 37 | collapse = " ") 38 | res_str 39 | }) 40 | 41 | # Calculate required number of bits (b) for feature hashes range: [0, 2^(b) - 1]. 
42 | bits <- ceiling(log2(length(lda_vocab))) 43 | # Total number of unique documents in data 44 | num_docs <- as.numeric(nrow(lda_data)) 45 | 46 | # Now we can set up a LDA model. 47 | lda_model <- vwsetup(feature_params = list(bit_precision=bits), 48 | optimization_params = list(initial_t=1, power_t=0.5), # Parameters for learning rate schedule 49 | option = "lda", # Enable LDA algorithm 50 | num_topics = 7, # Specify the number of topics to learn (the same as were manually classified) 51 | lda_D = num_docs, 52 | minibatch = 16) # Analyze 16 documents at a time 53 | 54 | # And start learning a set of topics. 55 | vwtrain(vwmodel = lda_model, 56 | data = lda_data, 57 | namespaces = list(" " = "features"), 58 | fixed = "features") 59 | 60 | # Here we get our topic predictions for each word from regressor values. 61 | vwout <- vwaudit(vwmodel = lda_model) 62 | # Each line of vwout corresponds to a single feature (a single word in our case) 63 | # Output contains following columns: 64 | # Names - feature names 65 | # Hashes - feature hashes 66 | # V1-V7 - Regressor values for each topic 67 | 68 | 69 | # Now we need to post-process this output to get final word - topic correspondences. 70 | # First, filter out zero valued features. 71 | selected_rows <- apply(vwout[, 3:9], 1, function(x) { 72 | !all(x == 0) 73 | }) 74 | vwout<- vwout[selected_rows,] 75 | 76 | # And finaly: 77 | # 1) Connect words from prepared vocabulary with feature hashes from our model. 78 | # 2) Connect words with a maximum valued topic prediction. 79 | lda_results <- data.frame( 80 | word = lda_vocab, 81 | topic = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) { 82 | which.max(x) 83 | }), 84 | value = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) { 85 | max(x) 86 | }) 87 | ) 88 | 89 | head(lda_results) 90 | 91 | # Switch back. 
92 | setwd(curr_dir) 93 | -------------------------------------------------------------------------------- /demo/rvw_overview.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | 3 | curr_dir <- getwd() 4 | setwd(tempdir()) 5 | 6 | # Get VW format datafiles 7 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 8 | ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 9 | multiclass_train_data <- system.file("extdata", "multiclass_train.vw", package = "rvw") 10 | multiclass_test_data <- system.file("extdata", "multiclass_valid.vw", package = "rvw") 11 | 12 | # Setup model 13 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 14 | feature_params = list(hash="all", bit_precision=25), 15 | optimization_params = list(adaptive=FALSE, learning_rate=0.1)) 16 | # Basic training and testing 17 | vwtrain(test_vwmodel, data = ext_train_data) 18 | vw_output <- vwtest(test_vwmodel, data = ext_test_data, probs_path = "./probs.vw") 19 | 20 | # Printing readable model 21 | test_vwmodel <- vwsetup() 22 | vwtrain(test_vwmodel, data = ext_train_data, readable_model = "hashed") 23 | vwtest(test_vwmodel, data = ext_test_data, readable_model = "inverted") 24 | # Model audit 25 | vwaudit(test_vwmodel) 26 | # No console output 27 | vwtrain(test_vwmodel, data = ext_train_data, quiet = T) 28 | vwtest(test_vwmodel, data = ext_train_data, quiet = T) 29 | 30 | # Add learning options 31 | library(magrittr) 32 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 33 | option = "ect", num_classes=3) %>% 34 | add_option(option = "boosting", num_learners=10) 35 | # Print vwmodel contents 36 | test_vwmodel 37 | # Access vw parameters 38 | vwparams(test_vwmodel, "num_classes") 39 | # Modify parameters 40 | vwparams(test_vwmodel, "num_learners") <- 100 41 | 42 | vwtrain(test_vwmodel, data = multiclass_train_data) 43 | vwtest(test_vwmodel, data = multiclass_test_data) 44 | 45 | setwd(curr_dir) 46 
| -------------------------------------------------------------------------------- /docker/ci/Dockerfile: -------------------------------------------------------------------------------- 1 | ## Emacs, make this -*- mode: sh; -*- 2 | 3 | FROM r-base:latest 4 | 5 | LABEL org.label-schema.license="GPL-2.0" \ 6 | org.label-schema.vcs-url="https://github.com/rvw-org" \ 7 | maintainer="Dirk Eddelbuettel " 8 | 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends \ 11 | r-cran-rcpp \ 12 | r-cran-testthat \ 13 | r-cran-runit \ 14 | r-cran-data.table \ 15 | r-cran-knitr \ 16 | r-cran-rmarkdown \ 17 | libvw-dev \ 18 | vowpal-wabbit \ 19 | libboost-program-options-dev \ 20 | r-cran-matrix \ 21 | && install.r RApiSerialize mltools \ 22 | && mkdir ~/.R \ 23 | && echo _R_CHECK_FORCE_SUGGESTS_=FALSE > ~/.R/check.Renviron 24 | 25 | CMD ["bash"] 26 | -------------------------------------------------------------------------------- /docker/run/Dockerfile: -------------------------------------------------------------------------------- 1 | ## Emacs, make this -*- mode: sh; -*- 2 | 3 | FROM rvowpalwabbit/ci 4 | 5 | LABEL org.label-schema.license="GPL-2.0" \ 6 | org.label-schema.vcs-url="https://github.com/rvw-org/rvw" \ 7 | maintainer="Dirk Eddelbuettel " 8 | 9 | ## If on CRAN, install the latest version from CRAN 10 | #RUN install.r ... 
11 | 12 | ## Alternatively, install from Github (after first installing remotes) 13 | RUN install.r remotes && installGithub.r rvw-org/rvw 14 | 15 | CMD ["bash"] 16 | -------------------------------------------------------------------------------- /inst/extdata/binary_train.vw: -------------------------------------------------------------------------------- 1 | -1.000000 |A carat:1.590000 depth:62.300000 table:60.000000 price:11613.000000 x:7.420000 z:4.640000 cut_Very_Good color_G clarity_SI1 2 | 1.000000 |A carat:0.420000 depth:61.500000 table:59.000000 price:709.000000 x:4.840000 z:2.960000 cut_Premium color_I clarity_SI2 3 | -1.000000 |A carat:1.040000 depth:61.600000 table:56.000000 price:3960.000000 x:6.560000 z:4.020000 cut_Premium color_I clarity_SI2 4 | 1.000000 |A carat:0.510000 depth:63.300000 table:56.000000 price:1656.000000 x:5.080000 z:3.230000 cut_Good color_E clarity_VS2 5 | 1.000000 |A carat:0.550000 depth:62.100000 table:56.000000 price:1117.000000 x:5.250000 z:3.250000 cut_Ideal color_G clarity_SI2 6 | -1.000000 |A carat:0.900000 depth:61.200000 table:60.000000 price:4304.000000 x:6.190000 z:3.770000 cut_Premium color_D clarity_SI1 7 | -1.000000 |A carat:0.700000 depth:61.800000 table:60.000000 price:2330.000000 x:5.680000 z:3.520000 cut_Premium color_H clarity_VS2 8 | 1.000000 |A carat:0.230000 depth:59.500000 table:58.000000 price:530.000000 x:4.000000 z:2.390000 cut_Very_Good color_F clarity_IF 9 | -1.000000 |A carat:2.040000 depth:61.500000 table:57.000000 price:16800.000000 x:8.150000 z:5.020000 cut_Ideal color_H clarity_SI2 10 | 1.000000 |A carat:0.310000 depth:61.600000 table:54.000000 price:891.000000 x:4.400000 z:2.720000 cut_Very_Good color_G clarity_IF 11 | 1.000000 |A carat:0.410000 depth:62.800000 table:60.000000 price:834.000000 x:4.720000 z:2.980000 cut_Very_Good color_E clarity_VS1 12 | -1.000000 |A carat:1.290000 depth:61.600000 table:57.000000 price:6588.000000 x:7.000000 z:4.300000 cut_Ideal color_H clarity_SI1 13 | 1.000000 
|A carat:0.500000 depth:62.100000 table:55.000000 price:1080.000000 x:5.090000 z:3.170000 cut_Ideal color_G clarity_SI2 14 | -1.000000 |A carat:0.710000 depth:55.600000 table:73.000000 price:2368.000000 x:6.010000 z:3.330000 cut_Fair color_D clarity_VS2 15 | -1.000000 |A carat:1.210000 depth:61.500000 table:58.000000 price:5211.000000 x:6.850000 z:4.230000 cut_Very_Good color_J clarity_VS2 16 | 1.000000 |A carat:0.410000 depth:61.400000 table:57.000000 price:1079.000000 x:4.790000 z:2.950000 cut_Ideal color_E clarity_VS1 17 | 1.000000 |A carat:0.500000 depth:62.900000 table:61.000000 price:1845.000000 x:5.030000 z:3.140000 cut_Premium color_D clarity_VS2 18 | -1.000000 |A carat:1.050000 depth:61.900000 table:56.000000 price:4586.000000 x:6.560000 z:4.040000 cut_Ideal color_H clarity_SI2 19 | -1.000000 |A carat:1.060000 depth:61.800000 table:55.000000 price:5697.000000 x:6.540000 z:4.040000 cut_Ideal color_I clarity_SI1 20 | 1.000000 |A carat:0.270000 depth:62.300000 table:54.000000 price:500.000000 x:4.160000 z:2.600000 cut_Ideal color_H clarity_VS1 21 | 1.000000 |A carat:0.410000 depth:61.700000 table:56.000000 price:1115.000000 x:4.780000 z:2.960000 cut_Ideal color_F clarity_VVS2 22 | -1.000000 |A carat:1.160000 depth:62.400000 table:55.000000 price:3800.000000 x:6.760000 z:4.200000 cut_Ideal color_I clarity_SI2 23 | 1.000000 |A carat:0.720000 depth:63.900000 table:62.000000 price:2188.000000 x:5.700000 z:3.620000 cut_Good color_F clarity_SI1 24 | -1.000000 |A carat:0.910000 depth:62.700000 table:59.000000 price:4720.000000 x:6.130000 z:3.850000 cut_Very_Good color_D clarity_VS2 25 | 1.000000 |A carat:0.300000 depth:62.200000 table:54.000000 price:819.000000 x:4.330000 z:2.680000 cut_Ideal color_G clarity_VVS2 26 | 1.000000 |A carat:0.330000 depth:60.100000 table:58.000000 price:1052.000000 x:4.510000 z:2.700000 cut_Premium color_E clarity_VVS1 27 | 1.000000 |A carat:0.550000 depth:61.200000 table:56.400000 price:1975.000000 x:5.280000 z:3.250000 cut_Very_Good 
color_F clarity_VS1 28 | -1.000000 |A carat:1.730000 depth:61.700000 table:56.000000 price:12998.000000 x:7.640000 z:4.730000 cut_Very_Good color_H clarity_VS2 29 | 1.000000 |A carat:0.400000 depth:59.200000 table:61.000000 price:900.000000 x:4.810000 z:2.840000 cut_Premium color_G clarity_SI1 30 | -1.000000 |A carat:0.900000 depth:61.600000 table:55.000000 price:3587.000000 x:6.240000 z:3.850000 cut_Ideal color_J clarity_VS1 31 | 1.000000 |A carat:0.580000 depth:61.100000 table:56.000000 price:1719.000000 x:5.400000 z:3.310000 cut_Ideal color_G clarity_VS2 32 | -1.000000 |A carat:1.120000 depth:59.600000 table:56.000000 price:8973.000000 x:6.800000 z:4.070000 cut_Good color_G clarity_IF 33 | -1.000000 |A carat:0.760000 depth:61.700000 table:55.000000 price:2553.000000 x:5.870000 z:3.630000 cut_Ideal color_G clarity_SI2 34 | -1.000000 |A carat:1.010000 depth:62.000000 table:58.000000 price:10688.000000 x:6.410000 z:3.990000 cut_Ideal color_F clarity_IF 35 | -1.000000 |A carat:0.700000 depth:62.400000 table:53.000000 price:2839.000000 x:5.730000 z:3.570000 cut_Ideal color_F clarity_VS1 36 | -1.000000 |A carat:0.910000 depth:62.200000 table:57.000000 price:3884.000000 x:6.120000 z:3.820000 cut_Very_Good color_G clarity_SI1 37 | -1.000000 |A carat:1.070000 depth:62.300000 table:53.000000 price:6049.000000 x:6.570000 z:4.100000 cut_Ideal color_F clarity_SI1 38 | 1.000000 |A carat:0.300000 depth:63.100000 table:56.000000 price:675.000000 x:4.280000 z:2.690000 cut_Very_Good color_G clarity_VS2 39 | -1.000000 |A carat:1.060000 depth:61.000000 table:57.000000 price:7900.000000 x:6.550000 z:4.000000 cut_Ideal color_G clarity_VVS2 40 | -1.000000 |A carat:0.930000 depth:61.900000 table:55.000000 price:4511.000000 x:6.260000 z:3.890000 cut_Ideal color_E clarity_SI1 41 | 1.000000 |A carat:0.310000 depth:61.100000 table:55.000000 price:840.000000 x:4.380000 z:2.680000 cut_Ideal color_F clarity_VVS1 42 | -1.000000 |A carat:1.210000 depth:61.000000 table:60.000000 
price:4675.000000 x:6.880000 z:4.180000 cut_Premium color_J clarity_VS1 43 | 1.000000 |A carat:0.300000 depth:62.200000 table:60.000000 price:526.000000 x:4.240000 z:2.640000 cut_Very_Good color_G clarity_VS2 44 | 1.000000 |A carat:0.330000 depth:61.100000 table:56.000000 price:539.000000 x:4.500000 z:2.760000 cut_Ideal color_H clarity_VS2 45 | -1.000000 |A carat:1.700000 depth:63.200000 table:58.000000 price:7730.000000 x:7.560000 z:4.760000 cut_Very_Good color_J clarity_VS2 46 | -1.000000 |A carat:0.900000 depth:62.300000 table:64.000000 price:3605.000000 x:6.100000 z:3.810000 cut_Very_Good color_I clarity_VS2 47 | 1.000000 |A carat:0.310000 depth:59.000000 table:57.000000 price:446.000000 x:4.400000 z:2.610000 cut_Very_Good color_H clarity_SI1 48 | -1.000000 |A carat:0.930000 depth:61.800000 table:55.000000 price:5375.000000 x:6.280000 z:3.880000 cut_Ideal color_H clarity_VS1 49 | 1.000000 |A carat:0.700000 depth:61.800000 table:57.000000 price:2058.000000 x:5.710000 z:3.520000 cut_Ideal color_E clarity_SI1 50 | -1.000000 |A carat:1.010000 depth:61.200000 table:61.000000 price:5294.000000 x:6.440000 z:3.930000 cut_Premium color_H clarity_VS1 51 | 1.000000 |A carat:0.500000 depth:61.600000 table:62.000000 price:2437.000000 x:5.040000 z:3.120000 cut_Good color_E clarity_VVS2 52 | -1.000000 |A carat:1.550000 depth:60.100000 table:58.000000 price:11871.000000 x:7.550000 z:4.520000 cut_Premium color_E clarity_SI1 53 | 1.000000 |A carat:0.260000 depth:61.700000 table:57.000000 price:599.000000 x:4.070000 z:2.520000 cut_Ideal color_E clarity_VVS1 54 | -1.000000 |A carat:1.080000 depth:61.800000 table:56.000000 price:6078.000000 x:6.620000 z:4.110000 cut_Ideal color_D clarity_SI1 55 | -1.000000 |A carat:0.920000 depth:62.500000 table:59.000000 price:3613.000000 x:6.200000 z:3.890000 cut_Very_Good color_D clarity_SI2 56 | 1.000000 |A carat:0.420000 depth:62.400000 table:55.000000 price:1031.000000 x:4.790000 z:3.000000 cut_Ideal color_G clarity_VVS2 57 | 1.000000 |A 
carat:0.320000 depth:60.700000 table:58.000000 price:720.000000 x:4.420000 z:2.670000 cut_Premium color_G clarity_VS2 58 | -1.000000 |A carat:1.000000 depth:61.400000 table:58.000000 price:8216.000000 x:6.360000 z:3.920000 cut_Very_Good color_F clarity_VVS2 59 | -1.000000 |A carat:0.770000 depth:62.000000 table:56.000000 price:3697.000000 x:5.870000 z:3.650000 cut_Ideal color_E clarity_VS1 60 | -1.000000 |A carat:1.010000 depth:63.600000 table:57.000000 price:5251.000000 x:6.270000 z:4.000000 cut_Good color_E clarity_SI1 61 | 1.000000 |A carat:0.300000 depth:63.500000 table:54.000000 price:608.000000 x:4.300000 z:2.720000 cut_Very_Good color_H clarity_VS2 62 | 1.000000 |A carat:0.260000 depth:60.900000 table:57.000000 price:580.000000 x:4.130000 z:2.510000 cut_Ideal color_F clarity_VS1 63 | -1.000000 |A carat:0.690000 depth:57.800000 table:66.000000 price:2070.000000 x:5.900000 z:3.390000 cut_Fair color_G clarity_VS1 64 | -1.000000 |A carat:1.100000 depth:61.600000 table:56.000000 price:9051.000000 x:6.620000 z:4.100000 cut_Ideal color_G clarity_VVS2 65 | -1.000000 |A carat:0.710000 depth:58.300000 table:61.000000 price:2131.000000 x:5.840000 z:3.390000 cut_Good color_H clarity_SI1 66 | 1.000000 |A carat:0.270000 depth:61.000000 table:61.000000 price:544.000000 x:4.140000 z:2.530000 cut_Very_Good color_D clarity_VVS1 67 | 1.000000 |A carat:0.340000 depth:63.900000 table:56.000000 price:765.000000 x:4.480000 z:2.850000 cut_Good color_E clarity_SI1 68 | -1.000000 |A carat:1.180000 depth:61.600000 table:56.000000 price:4229.000000 x:6.820000 z:4.190000 cut_Ideal color_E clarity_I1 69 | 1.000000 |A carat:0.670000 depth:62.500000 table:59.000000 price:2211.000000 x:5.580000 z:3.470000 cut_Premium color_F clarity_VS2 70 | -1.000000 |A carat:1.580000 depth:62.300000 table:55.000000 price:9457.000000 x:7.420000 z:4.640000 cut_Ideal color_I clarity_SI1 71 | 1.000000 |A carat:0.240000 depth:60.600000 table:62.000000 price:478.000000 x:4.030000 z:2.450000 cut_Very_Good 
color_E clarity_VVS2 72 | -1.000000 |A carat:0.920000 depth:58.500000 table:57.000000 price:2947.000000 x:6.370000 z:3.720000 cut_Ideal color_H clarity_SI2 73 | 1.000000 |A carat:0.310000 depth:61.900000 table:58.000000 price:625.000000 x:4.300000 z:2.670000 cut_Very_Good color_H clarity_VVS2 74 | -1.000000 |A carat:1.510000 depth:62.800000 table:59.000000 price:7553.000000 x:7.280000 z:4.550000 cut_Premium color_J clarity_VS2 75 | -1.000000 |A carat:1.220000 depth:62.300000 table:56.000000 price:10221.000000 x:6.840000 z:4.250000 cut_Ideal color_G clarity_VVS2 76 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:776.000000 x:4.330000 z:2.680000 cut_Ideal color_F clarity_VS2 77 | 1.000000 |A carat:0.380000 depth:61.900000 table:56.000000 price:1327.000000 x:4.670000 z:2.880000 cut_Ideal color_E clarity_VVS1 78 | -1.000000 |A carat:0.780000 depth:60.400000 table:58.000000 price:3531.000000 x:6.000000 z:3.610000 cut_Ideal color_F clarity_VS2 79 | 1.000000 |A carat:0.700000 depth:63.100000 table:58.000000 price:2643.000000 x:5.610000 z:3.550000 cut_Very_Good color_E clarity_SI1 80 | -1.000000 |A carat:1.020000 depth:63.900000 table:55.000000 price:4476.000000 x:6.370000 z:4.080000 cut_Good color_H clarity_SI1 81 | -1.000000 |A carat:0.840000 depth:62.800000 table:57.000000 price:2656.000000 x:5.990000 z:3.780000 cut_Very_Good color_F clarity_SI2 82 | 1.000000 |A carat:0.380000 depth:62.100000 table:55.000000 price:633.000000 x:4.620000 z:2.880000 cut_Very_Good color_D clarity_SI2 83 | 1.000000 |A carat:0.370000 depth:61.800000 table:54.000000 price:1082.000000 x:4.630000 z:2.850000 cut_Ideal color_E clarity_VVS2 84 | 1.000000 |A carat:0.310000 depth:63.800000 table:56.000000 price:489.000000 x:4.270000 z:2.740000 cut_Good color_I clarity_VS1 85 | -1.000000 |A carat:1.020000 depth:61.600000 table:56.000000 price:4547.000000 x:6.550000 z:3.990000 cut_Ideal color_E clarity_SI2 86 | -1.000000 |A carat:1.060000 depth:62.600000 table:58.000000 
price:5889.000000 x:6.540000 z:4.080000 cut_Premium color_H clarity_VS2 87 | -1.000000 |A carat:1.020000 depth:61.900000 table:53.000000 price:6169.000000 x:6.500000 z:4.020000 cut_Ideal color_G clarity_VS2 88 | -1.000000 |A carat:1.030000 depth:61.300000 table:57.000000 price:6981.000000 x:6.510000 z:4.000000 cut_Ideal color_G clarity_VS2 89 | -1.000000 |A carat:0.710000 depth:61.300000 table:56.000000 price:3406.000000 x:5.750000 z:3.530000 cut_Ideal color_D clarity_VS1 90 | 1.000000 |A carat:0.500000 depth:62.500000 table:58.000000 price:1746.000000 x:5.110000 z:3.180000 cut_Premium color_G clarity_VS1 91 | 1.000000 |A carat:0.320000 depth:61.900000 table:55.000000 price:915.000000 x:4.380000 z:2.730000 cut_Ideal color_F clarity_IF 92 | 1.000000 |A carat:0.310000 depth:60.500000 table:55.000000 price:877.000000 x:4.430000 z:2.670000 cut_Ideal color_D clarity_VS1 93 | -1.000000 |A carat:0.700000 depth:58.200000 table:59.000000 price:2513.000000 x:5.870000 z:3.440000 cut_Very_Good color_G clarity_VS2 94 | -1.000000 |A carat:1.080000 depth:62.600000 table:56.000000 price:4407.000000 x:6.550000 z:4.120000 cut_Very_Good color_F clarity_SI2 95 | -1.000000 |A carat:0.900000 depth:63.400000 table:57.000000 price:4447.000000 x:6.130000 z:3.880000 cut_Very_Good color_D clarity_SI1 96 | -1.000000 |A carat:1.060000 depth:63.100000 table:59.000000 price:6212.000000 x:6.450000 z:4.080000 cut_Good color_G clarity_VS2 97 | -1.000000 |A carat:1.560000 depth:61.300000 table:56.000000 price:14237.000000 x:7.480000 z:4.590000 cut_Ideal color_G clarity_VS2 98 | -1.000000 |A carat:1.000000 depth:59.500000 table:63.000000 price:4077.000000 x:6.470000 z:3.830000 cut_Very_Good color_E clarity_SI2 99 | 1.000000 |A carat:0.310000 depth:62.000000 table:55.200000 price:515.000000 x:4.320000 z:2.690000 cut_Ideal color_F clarity_SI1 100 | 1.000000 |A carat:0.700000 depth:64.700000 table:58.000000 price:2051.000000 x:5.590000 z:3.600000 cut_Fair color_H clarity_SI1 101 | 
-------------------------------------------------------------------------------- /inst/extdata/binary_valid.vw: -------------------------------------------------------------------------------- 1 | 1.000000 |A carat:0.330000 depth:62.200000 table:58.000000 price:854.000000 x:4.420000 z:2.740000 cut_Premium color_G clarity_VS1 2 | 1.000000 |A carat:0.330000 depth:61.600000 table:58.000000 price:854.000000 x:4.440000 z:2.720000 cut_Premium color_G clarity_VS1 3 | 1.000000 |A carat:0.330000 depth:60.100000 table:58.000000 price:854.000000 x:4.470000 z:2.680000 cut_Premium color_F clarity_VS2 4 | 1.000000 |A carat:0.330000 depth:62.300000 table:55.000000 price:854.000000 x:4.470000 z:2.770000 cut_Ideal color_F clarity_VS2 5 | 1.000000 |A carat:0.330000 depth:61.900000 table:57.000000 price:854.000000 x:4.420000 z:2.730000 cut_Ideal color_F clarity_VS2 6 | 1.000000 |A carat:0.330000 depth:61.900000 table:57.000000 price:854.000000 x:4.440000 z:2.740000 cut_Premium color_F clarity_VS2 7 | 1.000000 |A carat:0.390000 depth:61.700000 table:57.000000 price:854.000000 x:4.770000 z:2.920000 cut_Premium color_G clarity_SI1 8 | 1.000000 |A carat:0.330000 depth:61.600000 table:55.000000 price:854.000000 x:4.480000 z:2.750000 cut_Ideal color_F clarity_VS2 9 | 1.000000 |A carat:0.330000 depth:60.300000 table:55.000000 price:855.000000 x:4.520000 z:2.720000 cut_Ideal color_D clarity_VS2 10 | 1.000000 |A carat:0.390000 depth:62.600000 table:58.000000 price:855.000000 x:4.660000 z:2.930000 cut_Premium color_G clarity_VS1 11 | 1.000000 |A carat:0.390000 depth:62.100000 table:58.000000 price:855.000000 x:4.630000 z:2.880000 cut_Very_Good color_H clarity_VVS2 12 | 1.000000 |A carat:0.500000 depth:65.300000 table:56.000000 price:855.000000 x:4.910000 z:3.240000 cut_Good color_I clarity_SI2 13 | 1.000000 |A carat:0.380000 depth:62.400000 table:59.000000 price:855.000000 x:4.640000 z:2.880000 cut_Premium color_G clarity_SI1 14 | 1.000000 |A carat:0.400000 depth:61.400000 table:59.000000 
price:855.000000 x:4.750000 z:2.900000 cut_Premium color_E clarity_SI2 15 | 1.000000 |A carat:0.400000 depth:62.100000 table:60.000000 price:855.000000 x:4.750000 z:2.930000 cut_Premium color_D clarity_SI2 16 | 1.000000 |A carat:0.380000 depth:61.400000 table:61.000000 price:855.000000 x:4.660000 z:2.850000 cut_Premium color_G clarity_SI1 17 | 1.000000 |A carat:0.380000 depth:62.400000 table:57.000000 price:855.000000 x:4.650000 z:2.890000 cut_Ideal color_G clarity_SI1 18 | 1.000000 |A carat:0.380000 depth:60.900000 table:56.000000 price:855.000000 x:4.710000 z:2.860000 cut_Ideal color_G clarity_SI1 19 | 1.000000 |A carat:0.460000 depth:66.800000 table:55.000000 price:855.000000 x:4.820000 z:3.200000 cut_Fair color_I clarity_VS2 20 | 1.000000 |A carat:0.330000 depth:61.900000 table:56.000000 price:856.000000 x:4.440000 z:2.760000 cut_Very_Good color_D clarity_VVS2 21 | 1.000000 |A carat:0.530000 depth:61.900000 table:54.000000 price:856.000000 x:5.200000 z:3.230000 cut_Ideal color_J clarity_SI2 22 | 1.000000 |A carat:0.370000 depth:62.400000 table:56.000000 price:857.000000 x:4.560000 z:2.870000 cut_Ideal color_D clarity_VVS2 23 | 1.000000 |A carat:0.310000 depth:63.500000 table:55.000000 price:465.000000 x:4.330000 z:2.740000 cut_Very_Good color_J clarity_SI1 24 | 1.000000 |A carat:0.250000 depth:61.600000 table:56.000000 price:467.000000 x:4.070000 z:2.510000 cut_Very_Good color_H clarity_VVS1 25 | 1.000000 |A carat:0.280000 depth:61.900000 table:55.000000 price:467.000000 x:4.200000 z:2.610000 cut_Ideal color_H clarity_VS2 26 | 1.000000 |A carat:0.310000 depth:62.000000 table:56.100000 price:468.000000 x:4.310000 z:2.680000 cut_Ideal color_J clarity_VVS2 27 | 1.000000 |A carat:0.260000 depth:59.800000 table:59.000000 price:468.000000 x:4.120000 z:2.480000 cut_Premium color_H clarity_VVS1 28 | 1.000000 |A carat:0.260000 depth:63.300000 table:58.000000 price:468.000000 x:4.040000 z:2.570000 cut_Good color_H clarity_VVS2 29 | 1.000000 |A carat:0.300000 
depth:61.800000 table:55.000000 price:857.000000 x:4.320000 z:2.680000 cut_Ideal color_F clarity_VVS1 30 | 1.000000 |A carat:0.300000 depth:62.200000 table:53.000000 price:857.000000 x:4.300000 z:2.680000 cut_Ideal color_F clarity_VVS1 31 | 1.000000 |A carat:0.390000 depth:62.500000 table:54.000000 price:857.000000 x:4.670000 z:2.930000 cut_Ideal color_G clarity_VS2 32 | 1.000000 |A carat:0.300000 depth:60.900000 table:57.000000 price:857.000000 x:4.340000 z:2.650000 cut_Ideal color_D clarity_VS2 33 | 1.000000 |A carat:0.400000 depth:64.700000 table:58.000000 price:857.000000 x:4.670000 z:3.010000 cut_Fair color_F clarity_SI1 34 | 1.000000 |A carat:0.300000 depth:58.100000 table:61.000000 price:858.000000 x:4.390000 z:2.560000 cut_Very_Good color_F clarity_VVS1 35 | 1.000000 |A carat:0.400000 depth:62.900000 table:59.000000 price:858.000000 x:4.700000 z:2.970000 cut_Very_Good color_E clarity_VS2 36 | 1.000000 |A carat:0.360000 depth:62.400000 table:63.000000 price:858.000000 x:4.460000 z:2.800000 cut_Good color_F clarity_VVS1 37 | 1.000000 |A carat:0.290000 depth:62.700000 table:55.000000 price:858.000000 x:4.220000 z:2.640000 cut_Ideal color_E clarity_VVS1 38 | 1.000000 |A carat:0.430000 depth:62.100000 table:57.000000 price:858.000000 x:4.830000 z:2.980000 cut_Ideal color_H clarity_SI1 39 | 1.000000 |A carat:0.330000 depth:62.000000 table:56.000000 price:859.000000 x:4.410000 z:2.740000 cut_Very_Good color_F clarity_VVS1 40 | 1.000000 |A carat:0.350000 depth:61.800000 table:57.000000 price:859.000000 x:4.500000 z:2.790000 cut_Very_Good color_G clarity_VVS1 41 | 1.000000 |A carat:0.350000 depth:61.700000 table:54.000000 price:859.000000 x:4.530000 z:2.800000 cut_Ideal color_H clarity_IF 42 | 1.000000 |A carat:0.390000 depth:59.100000 table:59.000000 price:860.000000 x:4.810000 z:2.830000 cut_Premium color_E clarity_SI1 43 | 1.000000 |A carat:0.400000 depth:62.900000 table:54.800000 price:861.000000 x:4.710000 z:2.980000 cut_Very_Good color_G clarity_VS1 44 | 
1.000000 |A carat:0.410000 depth:62.200000 table:56.000000 price:861.000000 x:4.770000 z:2.960000 cut_Ideal color_G clarity_SI1 45 | 1.000000 |A carat:0.300000 depth:62.100000 table:57.000000 price:862.000000 x:4.270000 z:2.660000 cut_Ideal color_E clarity_VVS1 46 | 1.000000 |A carat:0.300000 depth:62.300000 table:55.000000 price:862.000000 x:4.310000 z:2.690000 cut_Ideal color_E clarity_VVS1 47 | 1.000000 |A carat:0.320000 depth:61.500000 table:56.000000 price:862.000000 x:4.410000 z:2.720000 cut_Ideal color_D clarity_VS1 48 | 1.000000 |A carat:0.420000 depth:62.000000 table:59.000000 price:862.000000 x:4.830000 z:2.980000 cut_Premium color_D clarity_SI2 49 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:863.000000 x:4.310000 z:2.680000 cut_Very_Good color_D clarity_VS1 50 | 1.000000 |A carat:0.410000 depth:63.200000 table:57.000000 price:863.000000 x:4.740000 z:3.000000 cut_Good color_F clarity_VS2 51 | 1.000000 |A carat:0.410000 depth:61.100000 table:56.000000 price:863.000000 x:4.800000 z:2.940000 cut_Ideal color_F clarity_VS2 52 | 1.000000 |A carat:0.410000 depth:62.300000 table:53.000000 price:863.000000 x:4.740000 z:2.960000 cut_Ideal color_E clarity_VS1 53 | 1.000000 |A carat:0.410000 depth:61.600000 table:59.000000 price:863.000000 x:4.740000 z:2.930000 cut_Premium color_F clarity_VS2 54 | 1.000000 |A carat:0.410000 depth:62.300000 table:57.000000 price:863.000000 x:4.750000 z:2.970000 cut_Ideal color_F clarity_VS2 55 | 1.000000 |A carat:0.410000 depth:60.000000 table:56.000000 price:863.000000 x:4.820000 z:2.910000 cut_Ideal color_F clarity_VS2 56 | 1.000000 |A carat:0.410000 depth:62.600000 table:58.000000 price:863.000000 x:4.750000 z:2.990000 cut_Very_Good color_F clarity_VS2 57 | 1.000000 |A carat:0.410000 depth:59.800000 table:61.000000 price:863.000000 x:4.790000 z:2.880000 cut_Very_Good color_I clarity_VVS1 58 | 1.000000 |A carat:0.410000 depth:62.600000 table:57.000000 price:863.000000 x:4.710000 z:2.960000 cut_Very_Good color_F 
clarity_VS2 59 | 1.000000 |A carat:0.410000 depth:60.300000 table:60.000000 price:863.000000 x:4.810000 z:2.910000 cut_Premium color_F clarity_VS2 60 | 1.000000 |A carat:0.300000 depth:60.500000 table:56.000000 price:863.000000 x:4.340000 z:2.660000 cut_Ideal color_F clarity_VVS1 61 | 1.000000 |A carat:0.310000 depth:61.100000 table:56.000000 price:863.000000 x:4.360000 z:2.670000 cut_Ideal color_E clarity_VVS1 62 | 1.000000 |A carat:0.340000 depth:61.600000 table:54.000000 price:863.000000 x:4.490000 z:2.780000 cut_Ideal color_H clarity_IF 63 | 1.000000 |A carat:0.340000 depth:61.800000 table:54.000000 price:863.000000 x:4.510000 z:2.800000 cut_Ideal color_H clarity_IF 64 | 1.000000 |A carat:0.340000 depth:61.700000 table:55.000000 price:863.000000 x:4.500000 z:2.790000 cut_Ideal color_H clarity_IF 65 | 1.000000 |A carat:0.300000 depth:61.300000 table:55.000000 price:863.000000 x:4.330000 z:2.660000 cut_Ideal color_G clarity_IF 66 | 1.000000 |A carat:0.300000 depth:62.300000 table:56.000000 price:863.000000 x:4.310000 z:2.690000 cut_Ideal color_G clarity_IF 67 | 1.000000 |A carat:0.300000 depth:62.400000 table:54.000000 price:863.000000 x:4.310000 z:2.700000 cut_Ideal color_G clarity_IF 68 | 1.000000 |A carat:0.300000 depth:60.700000 table:57.000000 price:863.000000 x:4.340000 z:2.650000 cut_Ideal color_G clarity_IF 69 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:863.000000 x:4.330000 z:2.690000 cut_Ideal color_G clarity_IF 70 | 1.000000 |A carat:0.300000 depth:61.200000 table:57.000000 price:863.000000 x:4.350000 z:2.670000 cut_Ideal color_G clarity_IF 71 | 1.000000 |A carat:0.300000 depth:62.100000 table:55.000000 price:863.000000 x:4.320000 z:2.690000 cut_Ideal color_G clarity_IF 72 | 1.000000 |A carat:0.300000 depth:60.900000 table:58.000000 price:863.000000 x:4.320000 z:2.640000 cut_Ideal color_G clarity_IF 73 | 1.000000 |A carat:0.300000 depth:61.600000 table:56.000000 price:863.000000 x:4.340000 z:2.680000 cut_Ideal color_G clarity_IF 
74 | 1.000000 |A carat:0.300000 depth:61.800000 table:55.000000 price:863.000000 x:4.320000 z:2.680000 cut_Ideal color_G clarity_IF 75 | 1.000000 |A carat:0.300000 depth:60.900000 table:56.000000 price:863.000000 x:4.340000 z:2.650000 cut_Ideal color_G clarity_IF 76 | 1.000000 |A carat:0.300000 depth:62.300000 table:56.000000 price:863.000000 x:4.290000 z:2.680000 cut_Ideal color_G clarity_IF 77 | 1.000000 |A carat:0.500000 depth:62.400000 table:61.000000 price:863.000000 x:5.020000 z:3.120000 cut_Premium color_G clarity_SI2 78 | 1.000000 |A carat:0.320000 depth:62.400000 table:56.000000 price:864.000000 x:4.390000 z:2.750000 cut_Ideal color_E clarity_VVS2 79 | 1.000000 |A carat:0.310000 depth:60.900000 table:55.000000 price:864.000000 x:4.420000 z:2.700000 cut_Ideal color_E clarity_VVS2 80 | 1.000000 |A carat:0.310000 depth:62.000000 table:54.000000 price:864.000000 x:4.380000 z:2.720000 cut_Ideal color_E clarity_VVS2 81 | 1.000000 |A carat:0.320000 depth:61.900000 table:59.000000 price:864.000000 x:4.390000 z:2.710000 cut_Premium color_I clarity_IF 82 | 1.000000 |A carat:0.360000 depth:62.300000 table:56.000000 price:865.000000 x:4.550000 z:2.850000 cut_Very_Good color_H clarity_IF 83 | 1.000000 |A carat:0.340000 depth:60.900000 table:56.000000 price:865.000000 x:4.510000 z:2.760000 cut_Ideal color_D clarity_VS1 84 | 1.000000 |A carat:0.420000 depth:60.500000 table:57.000000 price:865.000000 x:4.840000 z:2.940000 cut_Ideal color_I clarity_VVS1 85 | 1.000000 |A carat:0.310000 depth:61.500000 table:56.000000 price:865.000000 x:4.360000 z:2.690000 cut_Ideal color_E clarity_VVS1 86 | 1.000000 |A carat:0.310000 depth:61.300000 table:56.000000 price:865.000000 x:4.380000 z:2.690000 cut_Ideal color_E clarity_VVS1 87 | 1.000000 |A carat:0.310000 depth:60.200000 table:61.000000 price:865.000000 x:4.400000 z:2.660000 cut_Ideal color_E clarity_VVS1 88 | 1.000000 |A carat:0.380000 depth:61.300000 table:56.000000 price:865.000000 x:4.670000 z:2.870000 cut_Ideal color_F 
clarity_VS1 89 | 1.000000 |A carat:0.410000 depth:59.700000 table:58.000000 price:866.000000 x:4.790000 z:2.870000 cut_Very_Good color_E clarity_VS2 90 | 1.000000 |A carat:0.380000 depth:59.200000 table:60.000000 price:866.000000 x:4.730000 z:2.810000 cut_Very_Good color_E clarity_VS2 91 | 1.000000 |A carat:0.380000 depth:62.200000 table:59.000000 price:866.000000 x:4.630000 z:2.890000 cut_Premium color_E clarity_VS2 92 | 1.000000 |A carat:0.380000 depth:60.500000 table:58.000000 price:866.000000 x:4.660000 z:2.830000 cut_Very_Good color_E clarity_VS2 93 | 1.000000 |A carat:0.380000 depth:60.100000 table:56.000000 price:866.000000 x:4.690000 z:2.830000 cut_Ideal color_E clarity_VS2 94 | 1.000000 |A carat:0.380000 depth:61.800000 table:58.000000 price:866.000000 x:4.610000 z:2.860000 cut_Premium color_E clarity_VS2 95 | 1.000000 |A carat:0.330000 depth:60.600000 table:58.000000 price:866.000000 x:4.490000 z:2.710000 cut_Premium color_E clarity_VS2 96 | 1.000000 |A carat:0.430000 depth:60.800000 table:59.000000 price:867.000000 x:4.860000 z:2.970000 cut_Premium color_G clarity_VS2 97 | 1.000000 |A carat:0.430000 depth:59.100000 table:60.000000 price:867.000000 x:4.880000 z:2.900000 cut_Very_Good color_G clarity_VS2 98 | 1.000000 |A carat:0.430000 depth:62.300000 table:58.000000 price:867.000000 x:4.750000 z:2.980000 cut_Premium color_H clarity_VS1 99 | 1.000000 |A carat:0.430000 depth:61.100000 table:59.000000 price:867.000000 x:4.830000 z:2.960000 cut_Premium color_G clarity_VS2 100 | 1.000000 |A carat:0.320000 depth:61.500000 table:56.000000 price:867.000000 x:4.410000 z:2.720000 cut_Ideal color_F clarity_VVS1 101 | -------------------------------------------------------------------------------- /inst/extdata/ref_print.out: -------------------------------------------------------------------------------- 1 | Vowpal Wabbit model 2 | Working directory: . 
3 | Model file: ./pk_mdl.vw 4 | Learning algorithm: sgd 5 | General parameters: 6 | random_seed : 0 7 | ring_size : Not defined 8 | holdout_off : FALSE 9 | holdout_period : 10 10 | holdout_after : 0 11 | early_terminate : 3 12 | loss_function : Not defined 13 | link : Not defined 14 | quantile_tau : 0.5 15 | Feature parameters: 16 | bit_precision : 18 17 | quadratic : Not defined 18 | cubic : Not defined 19 | interactions : Not defined 20 | permutations : FALSE 21 | leave_duplicate_interactions : FALSE 22 | noconstant : FALSE 23 | feature_limit : Not defined 24 | ngram : Not defined 25 | skips : Not defined 26 | hash : Not defined 27 | affix : Not defined 28 | spelling : Not defined 29 | interact : Not defined 30 | Learning algorithms / Reductions: 31 | boosting : 32 | num_learners : 10 33 | gamma : 0.1 34 | alg : BBM 35 | Optimization parameters: 36 | adaptive : TRUE 37 | normalized : TRUE 38 | invariant : TRUE 39 | adax : FALSE 40 | sparse_l2 : 0 41 | l1_state : 0 42 | l2_state : 1 43 | learning_rate : 0.5 44 | initial_pass_length : Not defined 45 | l1 : 0 46 | l2 : 0 47 | no_bias_regularization : Not defined 48 | feature_mask : Not defined 49 | decay_learning_rate : 1 50 | initial_t : 0 51 | power_t : 0.5 52 | initial_weight : 0 53 | random_weights : off 54 | normal_weights : off 55 | truncated_normal_weights : off 56 | sparse_weights : FALSE 57 | input_feature_regularizer : Not defined 58 | Model evaluation. Training: 59 | num_examples : 100 60 | weighted_example_sum : 100 61 | weighted_label_sum : -10 62 | avg_loss : 0.32 63 | best_const : -0.1 64 | best_const_loss : 0.99 65 | total_feature : 1000 66 | Model evaluation. 
Testing: 67 | num_examples : 100 68 | weighted_example_sum : 100 69 | weighted_label_sum : 100 70 | avg_loss : 0.12 71 | best_const : 1 72 | best_const_loss : 0 73 | total_feature : 1000 74 | -------------------------------------------------------------------------------- /man/add_option.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{add_option} 4 | \alias{add_option} 5 | \title{Add option to the model} 6 | \usage{ 7 | add_option(vwmodel, option = c("binary", "oaa", "ect", "csoaa", "wap", 8 | "log_multi", "recall_tree", "lda", "multilabel_oaa", "classweight", 9 | "new_mf", "lrq", "stage_poly", "bootstrap", "autolink", "replay", 10 | "explore_eval", "cb", "cb_explore", "cbify", "multiworld_test_check", 11 | "nn", "topk", "search", "boosting", "marginal"), ...) 12 | } 13 | \arguments{ 14 | \item{vwmodel}{[vw] Model of vw class} 15 | 16 | \item{option}{[string] Name of an option} 17 | 18 | \item{...}{Additional options for a learning algorithm / reduction} 19 | } 20 | \description{ 21 | Add a learning algorithm / reduction to the option stack inside model 22 | } 23 | -------------------------------------------------------------------------------- /man/df2vw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{df2vw} 4 | \alias{df2vw} 5 | \title{Create a VW data file from a R data.frame object} 6 | \usage{ 7 | df2vw(data, file_path, namespaces = NULL, keep_space = NULL, 8 | fixed = NULL, targets = NULL, probabilities = NULL, 9 | weight = NULL, base = NULL, tag = NULL, multiline = NULL, 10 | append = FALSE) 11 | } 12 | \arguments{ 13 | \item{data}{[data.frame] data.frame object to be converted} 14 | 15 | \item{file_path}{[string] file name of the resulting data in 16 | VW-friendly format} 
17 | 18 | \item{namespaces}{[list or yaml file] name of each namespace and 19 | each variable for each namespace can be a R list, or a YAML 20 | file example namespace with the IRIS database: namespaces = 21 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 22 | 'Petal.Width') this creates 2 namespaces (sepal 23 | and petal) containing the features defined by elements of these lists.} 24 | 25 | \item{keep_space}{[string vector] keep spaces for these features 26 | Example:"FERRARI 4Si" 27 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 28 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 29 | 30 | \item{fixed}{[string vector] fixed parsing for these features 31 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 32 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 33 | but should be used carefully, because special characters can ruin final VW format file.} 34 | 35 | \item{targets}{[string or string vector] 36 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 37 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 38 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 39 | 40 | \item{probabilities}{[string vector] vectors with action probabilities for Contextual Bandit algorithm.} 41 | 42 | \item{weight}{[string] weight (importance) of each line of the dataset.} 43 | 44 | \item{base}{[string] base of each line of the dataset.
Used for residual regression.} 45 | 46 | \item{tag}{[string] tag of each line of the dataset.} 47 | 48 | \item{multiline}{[integer] number of labels (separate lines) for multilines example} 49 | 50 | \item{append}{[bool] data to be appended to the result file} 51 | } 52 | \description{ 53 | Create a VW data file from a R data.frame object 54 | } 55 | -------------------------------------------------------------------------------- /man/print.vw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{print.vw} 4 | \alias{print.vw} 5 | \title{Print VW model} 6 | \usage{ 7 | \method{print}{vw}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{[vw] Model of vw class} 11 | 12 | \item{...}{Not used currently} 13 | } 14 | \description{ 15 | Print information about Vowpal Wabbit model 16 | } 17 | \examples{ 18 | vwmodel <- vwsetup() 19 | print(vwmodel) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /man/rvwgsoc-package.Rd: -------------------------------------------------------------------------------- 1 | \name{rvw-package} 2 | \alias{rvw-package} 3 | \alias{rvw} 4 | \docType{package} 5 | \title{ 6 | R interface for Vowpal Wabbit 7 | } 8 | \description{ 9 | R interface for Vowpal Wabbit using Rcpp and libvw for GSoC 2018. 10 | } 11 | \details{ 12 | This section should provide a more detailed overview of how to use the 13 | package, including the most important functions. 14 | } 15 | \author{ 16 | Ivan Pavlov, Dirk Eddelbuettel, James J Balamuta 17 | 18 | Maintainer: Ivan Pavlov 19 | } 20 | \references{ 21 | This optional section can contain literature or other references for 22 | background information. 
23 | } 24 | \keyword{ package } 25 | \seealso{ 26 | Optional links to other man pages 27 | } 28 | \examples{ 29 | \dontrun{ 30 | ## Optional simple examples of the most important functions 31 | ## These can be in \dontrun{} and \donttest{} blocks. 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/vwaudit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{vwaudit} 4 | \alias{vwaudit} 5 | \title{Audit Vowpal Wabbit model} 6 | \usage{ 7 | vwaudit(vwmodel, quiet = FALSE) 8 | } 9 | \arguments{ 10 | \item{vwmodel}{Model of vw class to train} 11 | 12 | \item{quiet}{[bool] Do not print anything to the console.} 13 | } 14 | \value{ 15 | Data.frame containing feature names, feature hashes and model values 16 | } 17 | \description{ 18 | Get feature names and their model values. 19 | } 20 | \examples{ 21 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 22 | test_vwmodel <- vwsetup() 23 | vwtrain(test_vwmodel, data = ext_train_data) 24 | vwaudit(test_vwmodel) 25 | } 26 | -------------------------------------------------------------------------------- /man/vwparams.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{vwparams} 4 | \alias{vwparams} 5 | \alias{vwparams<-} 6 | \title{Access and modify parameters of VW model} 7 | \usage{ 8 | vwparams(vwmodel, name) 9 | 10 | vwparams(vwmodel, name) <- value 11 | } 12 | \arguments{ 13 | \item{vwmodel}{[vw] Model of vw class} 14 | 15 | \item{name}{[string] Name of VW parameter} 16 | 17 | \item{value}{[string/int/real/bool] Replacement value of a parameter} 18 | } 19 | \value{ 20 | Value of a parameter 21 | } 22 | \description{ 23 | These functions allow access to VW model 
parameters by name and correctly modify them 24 | } 25 | \examples{ 26 | vwmodel <- vwsetup() 27 | # Access parameter 28 | vwparams(vwmodel, "bit_precision") 29 | # Modify parameter 30 | vwparams(vwmodel, "bit_precision") <- 25 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/vwsetup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{vwsetup} 4 | \alias{vwsetup} 5 | \title{Create Vowpal Wabbit model, setup model parameters and data} 6 | \usage{ 7 | vwsetup(algorithm = c("sgd", "bfgs", "ftrl", "pistol", "ksvm", 8 | "OjaNewton", "svrg"), general_params = list(), 9 | feature_params = list(), optimization_params = list(), 10 | dir = tempdir(), model = NULL, params_str = NULL, option = c("", 11 | "binary", "oaa", "ect", "csoaa", "wap", "log_multi", "recall_tree", 12 | "lda", "multilabel_oaa", "classweight", "new_mf", "lrq", "stage_poly", 13 | "bootstrap", "autolink", "replay", "explore_eval", "cb", "cb_explore", 14 | "cbify", "multiworld_test_check", "nn", "topk", "search", "boosting", 15 | "marginal"), ...) 
16 | } 17 | \arguments{ 18 | \item{algorithm}{[string] Optimization algorithm 19 | \itemize{ 20 | \item \code{sgd} - adaptive, normalized, invariant stochastic gradient descent 21 | \item \code{bfgs} - Limited-memory Broyden-Fletcher-Goldfarb-Shanno optimization algorithm 22 | \item \code{ftrl} - FTRL: Follow the Regularized Leader optimization algorithm 23 | \item \code{pistol} - FTRL: Parameter-free Stochastic Learning 24 | \item \code{ksvm} - Kernel svm 25 | \item \code{OjaNewton} - Online Newton with Oja's Sketch 26 | \item \code{svrg} - Stochastic Variance Reduced Gradient 27 | }} 28 | 29 | \item{general_params}{List of parameters: 30 | \itemize{ 31 | \item \code{random_seed} [int] - Seed random number generator (default: 0) 32 | \item \code{ring_size} [int] - Size of example ring 33 | \item \code{holdout_off} [bool] - No holdout data in multiple passes (default: FALSE) 34 | \item \code{holdout_period} [int] - Holdout period for test only (default: 10) 35 | \item \code{holdout_after} [int] - Holdout after n training examples, default off (disables holdout_period) (default: 0) 36 | \item \code{early_terminate} [int] - Specify the number of passes tolerated when holdout loss doesn't decrease before early termination (default: 3) 37 | \item \code{loss_function} [string] - Specify the loss function to be used, uses squared by default. Currently available ones are: squared, classic, hinge, logistic, quantile and poisson. (default: squared) 38 | \item \code{link} [string] - Specify the link function: identity, logistic, glf1 or poisson. (default: identity) 39 | \item \code{quantile_tau} [real] - Parameter "tau" associated with Quantile loss. 
(default: 0.5) 40 | }} 41 | 42 | \item{feature_params}{List of parameters: 43 | More information about "interactions" option (also "quadratic", "cubic") available here \url{https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Command-line-arguments#example-manipulation-options} 44 | \itemize{ 45 | \item \code{bit_precision} [int] - Number of bits in the feature table (default: 18) 46 | \item \code{quadratic} [string] - Create and use quadratic features (Specify 2 namespaces) 47 | \item \code{cubic} [string] - Create and use cubic features (Specify 3 namespaces) 48 | \item \code{interactions} [string] - Create feature interactions of any level between namespaces (Specify several namespaces) 49 | \item \code{permutations} [bool] - Use permutations instead of combinations for feature interactions of same namespace (default: FALSE) 50 | \item \code{leave_duplicate_interactions} [bool] - Don't remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: 'quadratic="ab", quadratic="ba"' and a lot more in 'quadratic="::"'. (default: FALSE) 51 | \item \code{noconstant} [bool] - Don't add a constant feature (default: FALSE) 52 | \item \code{feature_limit} [string] - limit to N features. To apply to a single namespace 'foo', arg should be "fN" 53 | \item \code{ngram} [string] - Generate N grams. To generate N grams for a single namespace 'foo', arg should be "fN". 54 | \item \code{skips} [string] - Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be "fN". 55 | \item \code{hash} [string] - How to hash the features. 
Available options: "strings", "all" (default: "strings") 56 | \item \code{affix} [string] - Generate prefixes/suffixes of features; argument "+2a,-3b,+1" means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace 57 | \item \code{spelling} [string] - Compute spelling features for a given namespace (use '_' for default namespace) 58 | \item \code{interact} [string] - Put weights on feature products from two namespaces 59 | }} 60 | 61 | \item{optimization_params}{List of parameters: 62 | \itemize{ 63 | \item \code{learning_rate} [real] - Set initial learning Rate (default: 0.5) 64 | \item \code{initial_pass_length} [int] - Initial number of examples per pass 65 | \item \code{l1} [real] - L1 regularization (default: 0) 66 | \item \code{l2} [real] - L2 regularization (default: 0) 67 | \item \code{no_bias_regularization} [string] - no bias in regularization (Available options: "on", "off") 68 | \item \code{feature_mask} [string] - Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights. 69 | \item \code{decay_learning_rate} [real] - Set Decay factor for learning_rate between passes (default: 1) 70 | \item \code{initial_t} [real] - initial t value (default: 0) 71 | \item \code{power_t} [real] - t power value (default: 0.5) 72 | \item \code{initial_weight} [int] - Set all weights to an initial value of arg (default: 0) 73 | \item \code{random_weights} [string] - Make initial weights random (Available options: "on", "off") (default: "off") 74 | \item \code{normal_weights} [string] - Make initial weights normal (Available options: "on", "off") (default: "off") 75 | \item \code{truncated_normal_weights} [string] - Make initial weights truncated normal (Available options: "on", "off") (default: "off") 76 | \item \code{sparse_weights} [bool] - Use a sparse datastructure for weights. 
77 | \item \code{input_feature_regularizer} [string] - Per feature regularization input file. 78 | } 79 | Additional parameters depending on \code{algorithm} choice: 80 | \itemize{ 81 | \item \code{sgd}: 82 | \itemize{ 83 | \item \code{adaptive} [bool] - Use adaptive, individual learning rates (default: TRUE) 84 | \item \code{normalized} [bool] - Use per feature normalized updates (default: TRUE) 85 | \item \code{invariant} [bool] - Use safe/importance aware updates (default: TRUE) 86 | \item \code{adax} [bool] - Use adaptive learning rates with x^2 instead of g^2x^2 (default: FALSE) 87 | \item \code{sparse_l2} [real] - use per feature normalized updates (default: 0) 88 | \item \code{l1_state} [real] - use per feature normalized updates (default: 0) 89 | \item \code{l2_state} [real] - use per feature normalized updates (default: 1) 90 | } 91 | \item \code{bfgs}: 92 | \itemize{ 93 | \item \code{conjugate_gradient} [bool] - Use conjugate gradient based optimization (default: FALSE) 94 | \item \code{hessian_on} [bool] - Use second derivative in line search (default: FALSE) 95 | \item \code{mem} [int] - Memory in bfgs. (default: 15) 96 | \item \code{termination} [real] - Termination threshold. 
(default: 0.00100000005) 97 | } 98 | \item \code{ftrl}: 99 | \itemize{ 100 | \item \code{ftrl_alpha} [real] - Learning rate for FTRL optimization (default: 0.005) 101 | \item \code{ftrl_beta} [real] - FTRL beta parameter (default: 0.1) 102 | } 103 | \item \code{pistol}: 104 | \itemize{ 105 | \item \code{ftrl_alpha} [real] - Learning rate for FTRL optimization (default: 0.005) 106 | \item \code{ftrl_beta} [real] - FTRL beta parameter (default: 0.1) 107 | } 108 | \item \code{ksvm}: 109 | \itemize{ 110 | \item \code{reprocess} [int] - number of reprocess steps for LASVM (default: 1) 111 | \item \code{kernel} [string] - type of kernel (rbf or linear) (default: "linear") 112 | \item \code{bandwidth} [real] - bandwidth of rbf kernel (default: 1.0) 113 | \item \code{degree} [int] - degree of poly kernel (default: 2) 114 | \item \code{lambda} [real] - saving regularization for test time (default: -1) 115 | } 116 | \item \code{OjaNewton}: 117 | \itemize{ 118 | \item \code{sketch_size} [int] - size of sketch (default: 10) 119 | \item \code{epoch_size} [int] - size of epoch (default: 1) 120 | \item \code{alpha} [real] - multiplicative constant for identity (default: 1) 121 | \item \code{alpha_inverse} [real] - one over alpha, similar to learning rate 122 | \item \code{learning_rate_cnt} - constant for the learning rate 1/t (default: 2) 123 | \item \code{normalize} [string] - normalize the features or not (Available options: "on", "off") (default: "on") 124 | \item \code{random_init} [string] - randomize initialization of Oja or not (Available options: "on", "off") (default: "on") 125 | } 126 | \item \code{svrg}: 127 | \itemize{ 128 | \item \code{stage_size} [int] - Number of passes per SVRG stage (default: 1) 129 | } 130 | }} 131 | 132 | \item{dir}{[string] Working directory path, default is tempdir()} 133 | 134 | \item{model}{[string] File name for model weights or path to existng model file.} 135 | 136 | \item{params_str}{[string] Pass cmd line parameters directly, 
bypassing the default approach. 137 | For compatibility, parameters from vwtrain,vwtest, predict.vw can't be used here and functions add_option, vwparams aren't supported.} 138 | 139 | \item{option}{[string] Add Learning algorithm / reduction option: 140 | \itemize{ 141 | \item \code{binary} - Reports loss as binary classification with -1,1 labels 142 | \item \code{oaa} - One-against-all multiclass learning with labels 143 | \item \code{ect} - Error correcting tournament with labels 144 | \item \code{csoaa} - One-against-all multiclass learning with costs 145 | \item \code{wap} - Weighted all-pairs multiclass learning with costs 146 | \item \code{multilabel_oaa} - One-against-all multilabel with multiple labels 147 | \item \code{log_multi} - Online (decision) trees for classes 148 | \item \code{classweight} - Importance weight classes 149 | \item \code{lda} - Latent Dirichlet Allocation 150 | \item \code{recall_tree} - Use online tree for multiclass 151 | \item \code{new_mf} - Matrix factorization mode 152 | \item \code{lrq} - Low rank quadratic features 153 | \item \code{stage_poly} - Stagewise polynomial features 154 | \item \code{bootstrap} - bootstrap with K rounds by online importance resampling 155 | \item \code{autolink} - Create link function with polynomial N 156 | \item \code{replay} - Experience Replay 157 | \item \code{explore_eval} - Explore evaluation 158 | \item \code{cb} - Contextual bandit learning 159 | \item \code{cb_explore} - Contextual Bandit Exploration 160 | \item \code{cbify} - Convert multiclass on K classes into a contextual bandit problem 161 | \item \code{multiworld_test} - Multiworld Testing 162 | \item \code{nn} - Sigmoidal feedforward network 163 | \item \code{topk} - Top K recommendation 164 | \item \code{struct_search} - Search-based structured prediction (SEARN or DAgger) 165 | \item \code{boosting} - Online boosting with weak learners 166 | \item \code{marginal} - Substitute marginal label estimates for ids 167 | }} 168 | 169 | 
\item{...}{Additional options for a learning algorithm / reduction 170 | \itemize{ 171 | \item \code{oaa} or \code{ect}: 172 | \itemize{ 173 | \item \code{num_classes} [int] - Number of classes 174 | \item \code{oaa_subsample} [int] - Subsample this number of negative examples when learning 175 | } 176 | \item \code{multilabel_oaa}: 177 | \itemize{ 178 | \item \code{num_labels} [int] - Number of labels 179 | } 180 | \item \code{csoaa} or \code{wap}: 181 | \itemize{ 182 | \item \code{num_classes} [int] - Number of classes 183 | \item \code{csoaa_ldf} or \code{wap_ldf} - \code{singleline} (Default) or \code{multiline} label dependent features 184 | } 185 | \item \code{log_multi}: 186 | \itemize{ 187 | \item \code{num_classes} [int] - Number of classes 188 | \item \code{no_progress} [bool] - Disable progressive validation (default: FALSE) 189 | \item \code{swap_resistance} [int] - Higher = more resistance to swap, (default: 4) 190 | } 191 | \item \code{classweight}: 192 | \itemize{ 193 | \item \code{class_multiplier} [real] - importance weight multiplier for class 194 | } 195 | \item \code{recall_tree}: 196 | \itemize{ 197 | \item \code{num_classes} [int] - Number of classes 198 | \item \code{max_candidates} [int] - Maximum number of labels per leaf in the tree 199 | \item \code{bern_hyper} [real] - Recall tree depth penalty (default: 1) 200 | \item \code{max_depth} [int] - Maximum depth of the tree, (default: log_2(number of classes) ) 201 | \item \code{node_only} [string] - Only use node features, not full path (Available options: "on", "off") (default: "off") 202 | \item \code{randomized_routing} [string] - Randomized routing (Available options: "on", "off") (default: "off") 203 | } 204 | \item \code{lda}: 205 | \itemize{ 206 | \item \code{num_topics} [int] - Number of topics 207 | \item \code{lda_alpha} [real] - Prior on sparsity of per-document topic weights (default: 0.100000001) 208 | \item \code{lda_rho} [real] - Prior on sparsity of topic distributions 
(default: 0.100000001) 209 | \item \code{lda_D} [int] - Number of documents (default: 10000) 210 | \item \code{lda_epsilon} [real] - Loop convergence threshold (default: 0.00100000005) 211 | \item \code{math-mode} [string] - Math mode: simd, accuracy, fast-approx 212 | \item \code{minibatch} [int] - Minibatch size (default: 1) 213 | \item \code{metrics} [string] - Compute metrics (Available options: "on", "off") (default: "off") 214 | } 215 | \item \code{new_mf}: 216 | \itemize{ 217 | \item \code{rank} [int] - rank for matrix factorization 218 | } 219 | \item \code{lrq}: 220 | \itemize{ 221 | \item \code{features} [string] - low rank quadratic features 222 | \item \code{lrqdropout} [bool] - use dropout training for low rank quadratic features (default: FALSE) 223 | } 224 | \item \code{stage_poly}: 225 | \itemize{ 226 | \item \code{sched_exponent} [real] - exponent controlling quantity of included features (default: 1.0) 227 | \item \code{batch_sz} [int] - multiplier on batch size before including more features (default: 1000) 228 | \item \code{batch_sz_no_doubling} [bool] - batch_sz does not double (default: TRUE) 229 | } 230 | \item \code{bootstrap}: 231 | \itemize{ 232 | \item \code{num_rounds} [int] - number of rounds 233 | \item \code{bs_type} [string] - the bootstrap mode: 'mean' or 'vote' (default: "mean") 234 | } 235 | \item \code{autolink}: 236 | \itemize{ 237 | \item \code{degree} [int] - polynomial degree (default: 2) 238 | } 239 | \item \code{replay}: 240 | \itemize{ 241 | \item \code{level} [string] - Use experience replay at a specified level (b=classification/regression, m=multiclass, c=cost sensitive) 242 | \item \code{buffer} [int] - Buffer size (default: 100) 243 | \item \code{count} [int] - how many times (in expectation) should each example be played (default: 1 = permuting) 244 | } 245 | \item \code{explore_eval}: 246 | \itemize{ 247 | \item \code{multiplier} [real] - Multiplier used to make all rejection sample probabilities <= 1 248 | } 249 | 
\item \code{cb}: 250 | \itemize{ 251 | \item \code{num_costs} [int] - number of num_costs If costs=0, contextual bandit learning 252 | with multiline action dependent features (ADF) is triggered ("--cb_adf"). 253 | \item \code{cb_type} [string] - contextual bandit method to use in {ips,dm,dr, mtr (for ADF)} (default: "dr") 254 | \item \code{eval} [bool] - Evaluate a policy rather than optimizing (default: FALSE) 255 | \item \code{rank_all} [bool] - Return actions sorted by score order. (for ADF) (default: FALSE) 256 | \item \code{no_predict} [bool] - Do not do a prediction when training. (for ADF) (default: FALSE) 257 | } 258 | \item \code{cb_explore}: 259 | \itemize{ 260 | \item \code{num_actions} [bool] - number of actions in online explore-exploit for a action contextual bandit problem. 261 | If num_actions=0, online explore-exploit for a contextual bandit problem with multiline action dependent features (ADF) is triggered ("--cb_explore_adf"). 262 | \item \code{explore_type} [string] - Type of exploration to use: "epsilon" (epsilon-greedy exploration) (default), 263 | "first" (tau-first exploration), "bag" (bagging-based exploration), "cover" (Online cover based exploration), "softmax" (softmax exploration), 264 | "regcb" (RegCB-elim exploration), "regcbopt" (RegCB optimistic exploration). "softmax", "regcb" and "regcbopt" types are only avaliable for exploration with ADF. (default: "epsilon") 265 | \item \code{explore_arg} [real] - Parameter for exploration algorithm. Applicable for "epsilon", "first", "bag" and "cover" types of exploration. (default: 0.05) 266 | \item \code{psi} [real] - Disagreement parameter for "cover" algorithm. (default: 1) 267 | \item \code{nounif} [bool] - Do not explore uniformly on zero-probability actions in "cover" algorithm. (default: FALSE) 268 | \item \code{mellowness} [real] - "RegCB" mellowness parameter c_0. 
(default: 0.1) 269 | \item \code{greedify} [bool] - Always update first policy once in "bag" (default: FALSE) 270 | \item \code{lambda} [real] - Parameter for "softmax". (default: -1) 271 | \item \code{cb_min_cost} [real] - Lower bound on cost. (default: 0) For ADF only 272 | \item \code{cb_max_cost} [real] - Upper bound on cost. (default: 1) For ADF only 273 | \item \code{first_only} [bool] - Only explore the first action in a tie-breaking event. For ADF only (default: FALSE) 274 | } 275 | \item \code{cbify}: 276 | \itemize{ 277 | \item \code{num_classes} [int] - number of classes 278 | \item \code{cbify_cs} [bool] - consume cost-sensitive classification examples instead of multiclass (default: FALSE) 279 | \item \code{loss0} [real] - loss for correct label (default: 0) 280 | \item \code{loss1} [real] - loss for incorrect label (default: 1) 281 | } 282 | \item \code{multiworld_test}: 283 | \itemize{ 284 | \item \code{features} [string] - Evaluate features as policies 285 | \item \code{learn} [int] - Do Contextual Bandit learning on classes. 
286 | \item \code{num_classes} [bool] - Discard mwt policy features before learning (default: FALSE) 287 | } 288 | \item \code{nn}: 289 | \itemize{ 290 | \item \code{num_hidden} [int] - number of hidden units 291 | \item \code{inpass} [bool] - Train or test sigmoidal feedforward network with input passthrough (default: FALSE) 292 | \item \code{multitask} [bool] - Share hidden layer across all reduced tasks (default: FALSE) 293 | \item \code{dropout} [bool] - Train or test sigmoidal feedforward network using dropout (default: FALSE) 294 | \item \code{meanfield} [bool] - Train or test sigmoidal feedforward network using mean field (default: FALSE) 295 | } 296 | \item \code{topk}: 297 | \itemize{ 298 | \item \code{num_k} [int] - number of top k recommendations 299 | } 300 | \item \code{struct_search}: 301 | \itemize{ 302 | \item \code{id} [int] - maximum action id or 0 for LDF 303 | \item \code{search_task} [string] - search task: sequence, sequencespan, sequence_ctg, argmax, sequence_demoldf, multiclasstask, dep_parser, entity_relation, hook, graph 304 | \item \code{search_interpolation} [string] - at what level should interpolation happen? (data or policy) 305 | \item \code{search_rollout} [string] - how should rollouts be executed? (policy, oracle, mix_per_state, mix_per_roll, none) 306 | \item \code{search_rollin} [string] - how should past trajectories be generated? (policy, oracle, mix_per_state, mix_per_roll) 307 | \item \code{search_passes_per_policy} [int] - number of passes per policy (only valid for search_interpolation=policy). (default: 1) 308 | \item \code{search_beta} [real] - interpolation rate for policies (only valid for search_interpolation=policy). (default: 0.5) 309 | \item \code{search_alpha} [real] - annealed beta = 1-(1-alpha)^t (only valid for search_interpolation=data). 
(default: 1e-10) 310 | \item \code{search_total_nb_policies} [int] - if we are going to train the policies through multiple separate calls to vw, we need to specify this parameter and tell vw how many policies are eventually going to be trained 311 | \item \code{search_trained_nb_policies} [int] - the number of trained policies in a file 312 | \item \code{search_allowed_transitions} [string] - read file of allowed transitions. default: all transitions are allowed 313 | \item \code{search_subsample_time} [real] - instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example, if v<=-1, use active learning 314 | \item \code{search_neighbor_features} [string] - copy features from neighboring lines. argument looks like: '-1:a,+2' meaning copy previous line from namespace "a" and next line from namespace "unnamed", where ',' separates them 315 | \item \code{search_rollout_num_steps} [int] - how many calls of "loss" before we stop really predicting on rollouts and switch to oracle (default means "infinite") 316 | \item \code{search_history_length} [int] - some tasks allow you to specify how much history their depend on; specify that here. (default: 1) 317 | \item \code{search_no_caching} [bool] - turn off the built-in caching ability (makes things slower, but technically more safe) (default: FALSE) 318 | \item \code{search_xv} [bool] - train two separate policies, alternating prediction/learning. (default: FALSE) 319 | \item \code{search_perturb_oracle} [real] - perturb the oracle on rollin with this probability. (default: 0) 320 | \item \code{search_linear_ordering} [bool] - insist on generating examples in linear order. 
(default: FALSE and using hoopla permutation) 321 | \item \code{search_active_verify} [real] - verify that active learning is doing the right thing (arg = multiplier, should be = cost_range * range_c) 322 | \item \code{search_save_every_k_runs} [int] - save model every k runs 323 | } 324 | \item \code{boosting}: 325 | \itemize{ 326 | \item \code{num_learners} [int] - number of weak learners 327 | \item \code{gamma} [real] - weak learner's edge (=0.1), used only by online BBM (default: 0.100000001) 328 | \item \code{alg} - specify the boosting algorithm: BBM (default), logistic (AdaBoost.OL.W), adaptive (AdaBoost.OL) (default: "BBM") 329 | } 330 | \item \code{marginal}: 331 | \itemize{ 332 | \item \code{ids} [string] - Substitute marginal label estimates for ids 333 | \item \code{initial_denominator} [real] - Initial denominator (default: 1) 334 | \item \code{initial_numerator} [real] - Initial numerator (default: 0.5) 335 | \item \code{compete} [bool] - Enable competition with marginal features (default: FALSE) 336 | \item \code{update_before_learn} [string] - Update marginal values before learning (Available options: "on", "off") (default: "off") 337 | \item \code{unweighted_marginals} [string] - Ignore importance weights when computing marginals (Available options: "on", "off") (default: "off") 338 | \item \code{decay} [real] - Decay multiplier per event (1e-3 for example) (default=0) 339 | } 340 | }} 341 | } 342 | \value{ 343 | vwmodel list class 344 | } 345 | \description{ 346 | Sets up VW model together with parameters and data 347 | } 348 | \examples{ 349 | vwsetup( 350 | dir = tempdir(), 351 | model = "pk_mdl.vw", 352 | general_params = list(loss_function="logistic", link="logistic"), 353 | optimization_params = list(adaptive=FALSE), 354 | option = "binary" 355 | ) 356 | 357 | } 358 | -------------------------------------------------------------------------------- /man/vwtest.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R, R/functions.R 3 | \name{vwtest} 4 | \alias{vwtest} 5 | \alias{predict.vw} 6 | \title{Compute predictions using Vowpal Wabbit model} 7 | \usage{ 8 | vwtest(vwmodel, data, probs_path = "", full_probs = FALSE, 9 | readable_model = NULL, readable_model_path = "", quiet = FALSE, 10 | passes = 1L, cache = FALSE, raw = FALSE, progress = NULL, 11 | namespaces = NULL, keep_space = NULL, fixed = NULL, 12 | targets = NULL, probabilities = NULL, weight = NULL, base = NULL, 13 | tag = NULL, multiline = NULL) 14 | 15 | \method{predict}{vw}(object, data, probs_path = "", full_probs = FALSE, 16 | readable_model = NULL, quiet = FALSE, ...) 17 | } 18 | \arguments{ 19 | \item{vwmodel}{[vw] Model of vw class to train.} 20 | 21 | \item{data}{[string or data.frame] Path to training data in .vw plain text format or data.frame. 22 | If \code{[data.frame]} then will be parsed using \code{df2vw} function.} 23 | 24 | \item{probs_path}{[string] Path to file where to save predictions.} 25 | 26 | \item{full_probs}{[bool] Output full predictions in data.frame format. If not, force predictions into a single vector (default).} 27 | 28 | \item{readable_model}{[string] Print trained model in human readable format ("hashed") 29 | and also with human readable features ("inverted").} 30 | 31 | \item{readable_model_path}{[string] Path to file where to save readable model.} 32 | 33 | \item{quiet}{[bool] Do not print anything to the console.} 34 | 35 | \item{passes}{[int] Number of times the algorithm will cycle over the data (epochs).} 36 | 37 | \item{cache}{[bool] Use a cache for a data file.} 38 | 39 | \item{raw}{[bool] Output unnormalized predictions. Default is FALSE.} 40 | 41 | \item{progress}{[int/real] Progress update frequency. 
int: additive, real: multiplicative} 42 | 43 | \item{namespaces}{[list or yaml file] For \code{df2vw}. Name of each namespace and 44 | each variable for each namespace can be a R list, or a YAML 45 | file example namespace with the IRIS database: namespaces = 46 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 47 | 'Petal.Width') this creates 2 namespaces (sepal 48 | and petal) containing the features defined by elements of this lists.} 49 | 50 | \item{keep_space}{[string vector] For \code{df2vw}. Keep spaces for this features 51 | Example:"FERRARI 4Si" 52 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 53 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 54 | 55 | \item{fixed}{[string vector] fixed parsing for this features 56 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 57 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 58 | but should be used carefully, because special characters can ruin final VW format file.} 59 | 60 | \item{targets}{[string or string vector] For \code{df2vw}. 61 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 62 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 63 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 64 | 65 | \item{probabilities}{[string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm.} 66 | 67 | \item{weight}{[string] For \code{df2vw}. Weight (importance) of each line of the dataset.} 68 | 69 | \item{base}{[string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression.} 70 | 71 | \item{tag}{[string] For \code{df2vw}. 
Tag of each line of the dataset.} 72 | 73 | \item{multiline}{[integer] Number of labels (separate lines) for multilines example} 74 | 75 | \item{object}{Model of vw class to train for \code{predict.vw}} 76 | 77 | \item{...}{Parameters passed to \code{predict.vw}} 78 | } 79 | \value{ 80 | Numerical vector containing predictions 81 | } 82 | \description{ 83 | \code{vwtest} computes predictions using VW model from \code{\link{vwsetup}} 84 | \code{predict.vw} compute predictions using parser settings from \code{\link{vwtrain}} 85 | } 86 | \examples{ 87 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 88 | ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 89 | test_vwmodel <- vwsetup() 90 | vwtrain(test_vwmodel, data = ext_train_data) 91 | vwtest(test_vwmodel, data = ext_test_data) 92 | } 93 | -------------------------------------------------------------------------------- /man/vwtrain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{vwtrain} 4 | \alias{vwtrain} 5 | \title{Train Vowpal Wabbit model} 6 | \usage{ 7 | vwtrain(vwmodel, data, readable_model = NULL, readable_model_path = "", 8 | quiet = FALSE, update_model = FALSE, passes = 1L, cache = FALSE, 9 | progress = NULL, namespaces = NULL, keep_space = NULL, 10 | fixed = NULL, targets = NULL, probabilities = NULL, 11 | weight = NULL, base = NULL, tag = NULL, multiline = NULL) 12 | } 13 | \arguments{ 14 | \item{vwmodel}{[vw] Model of vw class to train} 15 | 16 | \item{data}{[string or data.frame] Path to training data in .vw plain text format or data.frame. 
17 | If \code{[data.frame]} then will be parsed using \code{df2vw} function.} 18 | 19 | \item{readable_model}{[string] Print trained model in human readable format ("hashed") 20 | and also with human readable features ("inverted")} 21 | 22 | \item{readable_model_path}{[string] Path to file where to save readable model.} 23 | 24 | \item{quiet}{[logical] Do not print anything to the console} 25 | 26 | \item{update_model}{[logical] Update an existing model, when training with new data. \code{FALSE} by default.} 27 | 28 | \item{passes}{[int] Number of times the algorithm will cycle over the data (epochs).} 29 | 30 | \item{cache}{[bool] Use a cache for a data file.} 31 | 32 | \item{progress}{[int/real] Progress update frequency. int: additive, real: multiplicative} 33 | 34 | \item{namespaces}{[list or yaml file] For \code{df2vw}. Name of each namespace and 35 | each variable for each namespace can be a R list, or a YAML 36 | file example namespace with the IRIS database: namespaces = 37 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 38 | 'Petal.Width') this creates 2 namespaces (sepal 39 | and petal) containing the features defined by elements of this lists.} 40 | 41 | \item{keep_space}{[string vector] For \code{df2vw}. Keep spaces for this features 42 | Example:"FERRARI 4Si" 43 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 44 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 45 | 46 | \item{fixed}{[string vector] fixed parsing for this features 47 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 48 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 49 | but should be used carefully, because special characters can ruin final VW format file.} 50 | 51 | \item{targets}{[string or string vector] For \code{df2vw}. 
52 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 53 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 54 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 55 | 56 | \item{probabilities}{[string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm.} 57 | 58 | \item{weight}{[string] For \code{df2vw}. Weight (importance) of each line of the dataset.} 59 | 60 | \item{base}{[string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression.} 61 | 62 | \item{tag}{[string] For \code{df2vw}. Tag of each line of the dataset.} 63 | 64 | \item{multiline}{[integer] Number of labels (separate lines) for multilines example} 65 | } 66 | \description{ 67 | vwtrain is an interface to train VW model from \code{\link{vwsetup}} 68 | } 69 | \examples{ 70 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 71 | test_vwmodel <- vwsetup() 72 | vwtrain(test_vwmodel, data = ext_train_data) 73 | } 74 | -------------------------------------------------------------------------------- /rvw.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageInstallArgs: --no-multiarch --with-keep.source --clean 17 | PackageCheckArgs: --as-cran 18 | PackageRoxygenize: rd 19 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_LIBS = -lvw 2 | CXX_STD = CXX11 3 | 
-------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // get_vw_version 9 | std::string get_vw_version(); 10 | RcppExport SEXP _rvw_get_vw_version() { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | rcpp_result_gen = Rcpp::wrap(get_vw_version()); 15 | return rcpp_result_gen; 16 | END_RCPP 17 | } 18 | // vwtrain 19 | void vwtrain(Rcpp::List& vwmodel, SEXP data, Rcpp::Nullable readable_model, std::string readable_model_path, bool quiet, bool update_model, int passes, bool cache, Rcpp::Nullable progress, Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, Rcpp::Nullable fixed, Rcpp::Nullable targets, Rcpp::Nullable probabilities, Rcpp::Nullable weight, Rcpp::Nullable base, Rcpp::Nullable tag, Rcpp::Nullable multiline); 20 | RcppExport SEXP _rvw_vwtrain(SEXP vwmodelSEXP, SEXP dataSEXP, SEXP readable_modelSEXP, SEXP readable_model_pathSEXP, SEXP quietSEXP, SEXP update_modelSEXP, SEXP passesSEXP, SEXP cacheSEXP, SEXP progressSEXP, SEXP namespacesSEXP, SEXP keep_spaceSEXP, SEXP fixedSEXP, SEXP targetsSEXP, SEXP probabilitiesSEXP, SEXP weightSEXP, SEXP baseSEXP, SEXP tagSEXP, SEXP multilineSEXP) { 21 | BEGIN_RCPP 22 | Rcpp::RNGScope rcpp_rngScope_gen; 23 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 24 | Rcpp::traits::input_parameter< SEXP >::type data(dataSEXP); 25 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type readable_model(readable_modelSEXP); 26 | Rcpp::traits::input_parameter< std::string >::type readable_model_path(readable_model_pathSEXP); 27 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 28 | Rcpp::traits::input_parameter< bool >::type 
update_model(update_modelSEXP); 29 | Rcpp::traits::input_parameter< int >::type passes(passesSEXP); 30 | Rcpp::traits::input_parameter< bool >::type cache(cacheSEXP); 31 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type progress(progressSEXP); 32 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type namespaces(namespacesSEXP); 33 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type keep_space(keep_spaceSEXP); 34 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type fixed(fixedSEXP); 35 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type targets(targetsSEXP); 36 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type probabilities(probabilitiesSEXP); 37 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type weight(weightSEXP); 38 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type base(baseSEXP); 39 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type tag(tagSEXP); 40 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type multiline(multilineSEXP); 41 | vwtrain(vwmodel, data, readable_model, readable_model_path, quiet, update_model, passes, cache, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline); 42 | return R_NilValue; 43 | END_RCPP 44 | } 45 | // vwtest 46 | SEXP vwtest(Rcpp::List& vwmodel, SEXP data, std::string probs_path, bool full_probs, Rcpp::Nullable readable_model, std::string readable_model_path, bool quiet, int passes, bool cache, bool raw, Rcpp::Nullable progress, Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, Rcpp::Nullable fixed, Rcpp::Nullable targets, Rcpp::Nullable probabilities, Rcpp::Nullable weight, Rcpp::Nullable base, Rcpp::Nullable tag, Rcpp::Nullable multiline); 47 | RcppExport SEXP _rvw_vwtest(SEXP vwmodelSEXP, SEXP dataSEXP, SEXP probs_pathSEXP, SEXP full_probsSEXP, SEXP readable_modelSEXP, SEXP readable_model_pathSEXP, SEXP quietSEXP, SEXP passesSEXP, SEXP cacheSEXP, SEXP rawSEXP, SEXP progressSEXP, SEXP namespacesSEXP, SEXP keep_spaceSEXP, SEXP fixedSEXP, SEXP 
targetsSEXP, SEXP probabilitiesSEXP, SEXP weightSEXP, SEXP baseSEXP, SEXP tagSEXP, SEXP multilineSEXP) { 48 | BEGIN_RCPP 49 | Rcpp::RObject rcpp_result_gen; 50 | Rcpp::RNGScope rcpp_rngScope_gen; 51 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 52 | Rcpp::traits::input_parameter< SEXP >::type data(dataSEXP); 53 | Rcpp::traits::input_parameter< std::string >::type probs_path(probs_pathSEXP); 54 | Rcpp::traits::input_parameter< bool >::type full_probs(full_probsSEXP); 55 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type readable_model(readable_modelSEXP); 56 | Rcpp::traits::input_parameter< std::string >::type readable_model_path(readable_model_pathSEXP); 57 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 58 | Rcpp::traits::input_parameter< int >::type passes(passesSEXP); 59 | Rcpp::traits::input_parameter< bool >::type cache(cacheSEXP); 60 | Rcpp::traits::input_parameter< bool >::type raw(rawSEXP); 61 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type progress(progressSEXP); 62 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type namespaces(namespacesSEXP); 63 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type keep_space(keep_spaceSEXP); 64 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type fixed(fixedSEXP); 65 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type targets(targetsSEXP); 66 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type probabilities(probabilitiesSEXP); 67 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type weight(weightSEXP); 68 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type base(baseSEXP); 69 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type tag(tagSEXP); 70 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type multiline(multilineSEXP); 71 | rcpp_result_gen = Rcpp::wrap(vwtest(vwmodel, data, probs_path, full_probs, readable_model, readable_model_path, quiet, passes, cache, raw, progress, namespaces, keep_space, fixed, targets, probabilities, weight, 
base, tag, multiline)); 72 | return rcpp_result_gen; 73 | END_RCPP 74 | } 75 | // vwaudit 76 | Rcpp::DataFrame vwaudit(Rcpp::List& vwmodel, bool quiet); 77 | RcppExport SEXP _rvw_vwaudit(SEXP vwmodelSEXP, SEXP quietSEXP) { 78 | BEGIN_RCPP 79 | Rcpp::RObject rcpp_result_gen; 80 | Rcpp::RNGScope rcpp_rngScope_gen; 81 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 82 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 83 | rcpp_result_gen = Rcpp::wrap(vwaudit(vwmodel, quiet)); 84 | return rcpp_result_gen; 85 | END_RCPP 86 | } 87 | 88 | static const R_CallMethodDef CallEntries[] = { 89 | {"_rvw_get_vw_version", (DL_FUNC) &_rvw_get_vw_version, 0}, 90 | {"_rvw_vwtrain", (DL_FUNC) &_rvw_vwtrain, 18}, 91 | {"_rvw_vwtest", (DL_FUNC) &_rvw_vwtest, 20}, 92 | {"_rvw_vwaudit", (DL_FUNC) &_rvw_vwaudit, 2}, 93 | {NULL, NULL, 0} 94 | }; 95 | 96 | RcppExport void R_init_rvw(DllInfo *dll) { 97 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 98 | R_useDynamicSymbols(dll, FALSE); 99 | } 100 | -------------------------------------------------------------------------------- /src/extra/array_parameters.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #ifndef _WIN32 5 | #include 6 | #endif 7 | 8 | // It appears that on OSX MAP_ANONYMOUS is mapped to MAP_ANON 9 | // https://github.com/leftmike/foment/issues/4 10 | #ifdef __APPLE__ 11 | #define MAP_ANONYMOUS MAP_ANON 12 | #endif 13 | 14 | #include "array_parameters_dense.h" 15 | 16 | class sparse_parameters; 17 | typedef std::unordered_map weight_map; 18 | 19 | template 20 | class sparse_iterator 21 | { 22 | private: 23 | weight_map::iterator _iter; 24 | uint32_t _stride; 25 | 26 | public: 27 | typedef std::forward_iterator_tag iterator_category; 28 | typedef T value_type; 29 | typedef ptrdiff_t difference_type; 30 | typedef T* pointer; 31 | typedef T& reference; 32 | 33 | sparse_iterator(weight_map::iterator& 
iter, uint32_t stride) 34 | : _iter(iter), _stride(stride) 35 | { } 36 | 37 | sparse_iterator& operator=(const sparse_iterator& other) 38 | { 39 | _iter = other._iter; 40 | _stride = other._stride; 41 | return *this; 42 | 43 | } 44 | uint64_t index() { return _iter->first; } 45 | 46 | T& operator*() { return *(_iter->second); } 47 | 48 | sparse_iterator& operator++() 49 | { 50 | _iter++; 51 | return *this; 52 | } 53 | 54 | bool operator==(const sparse_iterator& rhs) const { return _iter == rhs._iter; } 55 | bool operator!=(const sparse_iterator& rhs) const { return _iter != rhs._iter; } 56 | }; 57 | 58 | 59 | class sparse_parameters 60 | { 61 | private: 62 | weight_map _map; 63 | uint64_t _weight_mask; // (stride*(1 << num_bits) -1) 64 | uint32_t _stride_shift; 65 | bool _seeded; // whether the instance is sharing model state with others 66 | bool _delete; 67 | void* default_data; 68 | float* default_value; 69 | public: 70 | typedef sparse_iterator iterator; 71 | typedef sparse_iterator const_iterator; 72 | private: 73 | void(*fun)(const weight*, void*); 74 | public: 75 | 76 | sparse_parameters(size_t length, uint32_t stride_shift = 0) 77 | : _map(), 78 | _weight_mask((length << stride_shift) - 1), 79 | _stride_shift(stride_shift), 80 | _seeded(false), _delete(false), default_data(nullptr), 81 | fun(nullptr) 82 | { default_value = calloc_mergable_or_throw(stride());} 83 | 84 | sparse_parameters() 85 | : _map(), _weight_mask(0), _stride_shift(0), _seeded(false), _delete(false), default_data(nullptr), fun(nullptr) 86 | { default_value = calloc_mergable_or_throw(stride());} 87 | 88 | bool not_null() { return (_weight_mask > 0 && !_map.empty()); } 89 | 90 | sparse_parameters(const sparse_parameters &other) { shallow_copy(other); } 91 | sparse_parameters(sparse_parameters &&) = delete; 92 | 93 | weight* first() { throw 1; } //TODO: Throw better exceptions. Allreduce currently not supported in sparse. 
94 | 95 | //iterator with stride 96 | iterator begin() { weight_map::iterator i = _map.begin(); return iterator(i, stride()); } 97 | iterator end() { weight_map::iterator i = _map.end(); return iterator(i, stride()); } 98 | 99 | //const iterator 100 | const_iterator cbegin() { weight_map::iterator i = _map.begin(); return const_iterator(i, stride()); } 101 | const_iterator cend() { weight_map::iterator i = _map.begin(); return const_iterator(i, stride()); } 102 | 103 | inline weight& operator[](size_t i) 104 | { uint64_t index = i & _weight_mask; 105 | weight_map::iterator iter = _map.find(index); 106 | if (iter == _map.end()) 107 | { _map.insert(std::make_pair(index, calloc_mergable_or_throw(stride()))); 108 | iter = _map.find(index); 109 | if (fun != nullptr) 110 | fun(iter->second, default_data); 111 | } 112 | return *(iter->second); 113 | } 114 | 115 | inline const weight& operator[](size_t i) const 116 | { uint64_t index = i & _weight_mask; 117 | weight_map::const_iterator iter = _map.find(index); 118 | if (iter == _map.end()) 119 | return *default_value; 120 | return *(iter->second); 121 | } 122 | 123 | inline weight& strided_index(size_t index) { return operator[](index << _stride_shift); } 124 | 125 | void shallow_copy(const sparse_parameters& input) 126 | { 127 | // TODO: this is level-1 copy (weight* are stilled shared) 128 | if (!_seeded) 129 | { 130 | for (auto iter = _map.begin(); iter != _map.end(); ++iter) 131 | free(iter->second); 132 | } 133 | _map = input._map; 134 | _weight_mask = input._weight_mask; 135 | _stride_shift = input._stride_shift; 136 | free(default_value); 137 | default_value = calloc_mergable_or_throw(stride()); 138 | memcpy(default_value, input.default_value, stride()); 139 | default_data = input.default_data; 140 | _seeded = true; 141 | } 142 | 143 | template void set_default(R& info) 144 | { 145 | R& new_R = calloc_or_throw(); 146 | new_R = info; 147 | default_data = &new_R; 148 | fun = (void(*)(const weight*, void*))T::func; 149 
| fun(default_value, default_data); 150 | } 151 | 152 | template void set_default() { fun = (void(*)(const weight*, void*))T::func; } 153 | 154 | void set_zero(size_t offset) 155 | { 156 | for (weight_map::iterator iter = _map.begin(); iter != _map.end(); ++iter) 157 | (&(*(iter->second)))[offset] = 0; 158 | } 159 | 160 | uint64_t mask() const { return _weight_mask; } 161 | 162 | uint64_t seeded() const { return _seeded; } 163 | 164 | uint32_t stride() const { return 1 << _stride_shift; } 165 | 166 | uint32_t stride_shift() const { return _stride_shift; } 167 | 168 | void stride_shift(uint32_t stride_shift) { 169 | _stride_shift = stride_shift; 170 | free(default_value); 171 | default_value = calloc_mergable_or_throw(stride()); 172 | if (fun != nullptr) 173 | fun(default_value, default_data); 174 | } 175 | 176 | #ifndef _WIN32 177 | void share(size_t length) 178 | {throw 1; //TODO: add better exceptions 179 | } 180 | #endif 181 | 182 | ~sparse_parameters() 183 | {if (!_delete && !_seeded) // don't free weight vector if it is shared with another instance 184 | { 185 | for (auto iter = _map.begin(); iter != _map.end(); ++iter) 186 | free(iter->second); 187 | _map.clear(); 188 | _delete = true; 189 | } 190 | if (default_data != nullptr) 191 | free(default_data); 192 | free(default_value); 193 | } 194 | }; 195 | 196 | class parameters { 197 | public: 198 | bool sparse; 199 | dense_parameters dense_weights; 200 | sparse_parameters sparse_weights; 201 | 202 | inline weight& operator[](size_t i) 203 | { 204 | if (sparse) 205 | return sparse_weights[i]; 206 | else 207 | return dense_weights[i]; 208 | } 209 | 210 | inline uint32_t stride_shift() 211 | { 212 | if (sparse) 213 | return sparse_weights.stride_shift(); 214 | else 215 | return dense_weights.stride_shift(); 216 | } 217 | 218 | inline uint32_t stride() 219 | { 220 | if (sparse) 221 | return sparse_weights.stride(); 222 | else 223 | return dense_weights.stride(); 224 | } 225 | 226 | inline uint64_t mask() 227 | { 
228 | if (sparse) 229 | return sparse_weights.mask(); 230 | else 231 | return dense_weights.mask(); 232 | } 233 | 234 | inline uint64_t seeded() 235 | { 236 | if (sparse) 237 | return sparse_weights.seeded(); 238 | else 239 | return dense_weights.seeded(); 240 | } 241 | 242 | inline void shallow_copy(const parameters& input) 243 | { 244 | if (sparse) 245 | sparse_weights.shallow_copy(input.sparse_weights); 246 | else 247 | dense_weights.shallow_copy(input.dense_weights); 248 | } 249 | 250 | inline void set_zero(size_t offset) 251 | { 252 | if (sparse) 253 | sparse_weights.set_zero(offset); 254 | else 255 | dense_weights.set_zero(offset); 256 | } 257 | #ifndef _WIN32 258 | inline void share(size_t length) 259 | { 260 | if (sparse) 261 | sparse_weights.share(length); 262 | else 263 | dense_weights.share(length); 264 | } 265 | #endif 266 | 267 | inline void stride_shift(uint32_t stride_shift) 268 | { if (sparse) 269 | sparse_weights.stride_shift(stride_shift); 270 | else 271 | dense_weights.stride_shift(stride_shift); 272 | } 273 | 274 | inline weight& strided_index(size_t index) 275 | { 276 | if (sparse) 277 | return sparse_weights.strided_index(index); 278 | else 279 | return dense_weights.strided_index(index); 280 | } 281 | 282 | inline bool not_null() 283 | { 284 | if (sparse) 285 | return sparse_weights.not_null(); 286 | else 287 | return dense_weights.not_null(); 288 | } 289 | }; 290 | -------------------------------------------------------------------------------- /src/extra/array_parameters_dense.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "memory.h" 5 | 6 | typedef float weight; 7 | 8 | template 9 | class dense_iterator 10 | { 11 | private: 12 | T * _current; 13 | T* _begin; 14 | uint32_t _stride; 15 | 16 | public: 17 | typedef std::forward_iterator_tag iterator_category; 18 | typedef T value_type; 19 | typedef std::ptrdiff_t difference_type; 20 | typedef T* pointer; 21 | 
typedef T& reference; 22 | 23 | dense_iterator(T* current, T* begin, uint32_t stride) 24 | : _current(current), _begin(begin), _stride(stride) 25 | { } 26 | 27 | T& operator*() { return *_current; } 28 | 29 | size_t index() { return _current - _begin; } 30 | 31 | dense_iterator& operator++() 32 | { 33 | _current += _stride; 34 | return *this; 35 | } 36 | 37 | bool operator==(const dense_iterator& rhs) const { return _current == rhs._current; } 38 | bool operator!=(const dense_iterator& rhs) const { return _current != rhs._current; } 39 | }; 40 | 41 | class dense_parameters 42 | { 43 | private: 44 | weight * _begin; 45 | uint64_t _weight_mask; // (stride*(1 << num_bits) -1) 46 | uint32_t _stride_shift; 47 | bool _seeded; // whether the instance is sharing model state with others 48 | 49 | public: 50 | typedef dense_iterator iterator; 51 | typedef dense_iterator const_iterator; 52 | dense_parameters(size_t length, uint32_t stride_shift = 0) 53 | : _begin(calloc_mergable_or_throw(length << stride_shift)), 54 | _weight_mask((length << stride_shift) - 1), 55 | _stride_shift(stride_shift), 56 | _seeded(false) 57 | { } 58 | 59 | dense_parameters() 60 | : _begin(nullptr), _weight_mask(0), _stride_shift(0), _seeded(false) 61 | {} 62 | 63 | bool not_null() { return (_weight_mask > 0 && _begin != nullptr); } 64 | 65 | dense_parameters(const dense_parameters &other) { shallow_copy(other); } 66 | dense_parameters(dense_parameters &&) = delete; 67 | 68 | weight* first() { return _begin; } //TODO: Temporary fix for allreduce. 
69 | 70 | //iterator with stride 71 | iterator begin() { return iterator(_begin, _begin, stride()); } 72 | iterator end() { return iterator(_begin + _weight_mask + 1, _begin, stride()); } 73 | 74 | //const iterator 75 | const_iterator cbegin() { return const_iterator(_begin, _begin, stride()); } 76 | const_iterator cend() { return const_iterator(_begin + _weight_mask + 1, _begin, stride()); } 77 | 78 | inline weight& operator[](size_t i) const { return _begin[i & _weight_mask]; } 79 | void shallow_copy(const dense_parameters& input) 80 | { 81 | if (!_seeded) 82 | free(_begin); 83 | _begin = input._begin; 84 | _weight_mask = input._weight_mask; 85 | _stride_shift = input._stride_shift; 86 | _seeded = true; 87 | } 88 | 89 | inline weight& strided_index(size_t index) { return operator[](index << _stride_shift); } 90 | 91 | template void set_default(R& info) 92 | { 93 | iterator iter = begin(); 94 | for (size_t i = 0; iter != end(); ++iter, i += stride()) 95 | T::func(*iter, info, iter.index()); 96 | } 97 | 98 | template void set_default() 99 | { 100 | iterator iter = begin(); 101 | for (size_t i = 0; iter != end(); ++iter, i += stride()) 102 | T::func(*iter, iter.index()); 103 | } 104 | 105 | void set_zero(size_t offset) 106 | { 107 | for (iterator iter = begin(); iter != end(); ++iter) 108 | (&(*iter))[offset] = 0; 109 | } 110 | 111 | uint64_t mask() const { return _weight_mask; } 112 | 113 | uint64_t seeded() const { return _seeded; } 114 | 115 | uint32_t stride() const { return 1 << _stride_shift; } 116 | 117 | uint32_t stride_shift() const { return _stride_shift; } 118 | 119 | void stride_shift(uint32_t stride_shift) { _stride_shift = stride_shift; } 120 | 121 | #ifndef _WIN32 122 | #ifndef DISABLE_SHARED_WEIGHTS 123 | void share(size_t length) 124 | { 125 | float* shared_weights = (float*)mmap(0, (length << _stride_shift) * sizeof(float), 126 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 127 | size_t float_count = length << _stride_shift; 128 | 
weight* dest = shared_weights; 129 | memcpy(dest, _begin, float_count * sizeof(float)); 130 | free(_begin); 131 | _begin = dest; 132 | } 133 | #endif 134 | #endif 135 | 136 | ~dense_parameters() 137 | { 138 | if (_begin != nullptr && !_seeded) // don't free weight vector if it is shared with another instance 139 | { 140 | free(_begin); 141 | _begin = nullptr; 142 | } 143 | } 144 | }; 145 | -------------------------------------------------------------------------------- /src/extra/error_reporting.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | typedef void(*trace_message_t)(void *context, const std::string&); 3 | 4 | // TODO: change to virtual class 5 | 6 | // invoke trace_listener when << endl is encountered. 7 | class vw_ostream : public std::ostream 8 | { 9 | class vw_streambuf : public std::stringbuf 10 | { 11 | vw_ostream& parent; 12 | public: 13 | vw_streambuf(vw_ostream& str) : parent(str){}; 14 | 15 | virtual int sync(); 16 | }; 17 | vw_streambuf buf; 18 | 19 | public: 20 | vw_ostream(); 21 | 22 | void* trace_context; 23 | trace_message_t trace_listener; 24 | }; 25 | -------------------------------------------------------------------------------- /src/extra/example_predict.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | 8 | typedef unsigned char namespace_index; 9 | 10 | #include "v_array.h" 11 | #include "feature_group.h" 12 | 13 | struct example_predict 14 | { 15 | class iterator 16 | { 17 | features* _feature_space; 18 | namespace_index* _index; 19 | public: 20 | iterator(features* feature_space, namespace_index* index) 21 | : _feature_space(feature_space), _index(index) 22 | { } 23 | 24 | features& operator*() 25 | { 26 | return _feature_space[*_index]; 27 | } 28 | 29 | iterator& operator++() 30 | { 31 | _index++; 32 | return *this; 33 | } 34 | 35 | namespace_index index() { return *_index; } 36 | 37 | bool operator==(const iterator& rhs) { return _index == rhs._index; } 38 | bool operator!=(const iterator& rhs) { return _index != rhs._index; } 39 | }; 40 | 41 | v_array indices; 42 | features feature_space[256]; //Groups of feature values. 43 | uint64_t ft_offset;//An offset for all feature values. 44 | 45 | iterator begin() { return iterator(feature_space, indices.begin()); } 46 | iterator end() { return iterator(feature_space, indices.end()); } 47 | }; 48 | 49 | // make sure we have an exception safe version of example_predict 50 | class safe_example_predict : public example_predict 51 | { 52 | public: 53 | safe_example_predict(); 54 | ~safe_example_predict(); 55 | }; 56 | -------------------------------------------------------------------------------- /src/extra/hash.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | 8 | #include // defines size_t 9 | 10 | // Platform-specific functions and macros 11 | #if defined(_MSC_VER) // Microsoft Visual Studio 12 | # include 13 | 14 | # include 15 | # define ROTL32(x,y) _rotl(x,y) 16 | # define BIG_CONSTANT(x) (x) 17 | 18 | #else // Other compilers 19 | # include // defines uint32_t etc 20 | 21 | inline uint32_t rotl32(uint32_t x, int8_t r) 22 | { return (x << r) | (x >> (32 - r)); 23 | } 24 | 25 | # define ROTL32(x,y) rotl32(x,y) 26 | # define BIG_CONSTANT(x) (x##LLU) 27 | 28 | #endif // !defined(_MSC_VER) 29 | 30 | namespace MURMUR_HASH_3 31 | { 32 | 33 | //----------------------------------------------------------------------------- 34 | // Finalization mix - force all bits of a hash block to avalanche 35 | 36 | static inline uint32_t fmix(uint32_t h) 37 | { h ^= h >> 16; 38 | h *= 0x85ebca6b; 39 | h ^= h >> 13; 40 | h *= 0xc2b2ae35; 41 | h ^= h >> 16; 42 | 43 | return h; 44 | } 45 | } 46 | 47 | const uint32_t hash_base = 0; 48 | 49 | uint64_t uniform_hash(const void *key, size_t length, uint64_t seed); 50 | -------------------------------------------------------------------------------- /src/extra/no_label.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | #include "label_parser.h" 8 | 9 | struct example; 10 | struct vw; 11 | 12 | namespace no_label { 13 | typedef char no_label; 14 | 15 | void return_no_label_example(vw& all, void*, example& ec); 16 | 17 | extern label_parser no_label_parser; 18 | 19 | void print_no_label_update(vw& all, example &ec); 20 | void output_and_account_no_label_example(vw& all, example& ec); 21 | } 22 | -------------------------------------------------------------------------------- /src/extra/parser_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "vw_exception.h" 5 | #include "error_reporting.h" 6 | #include 7 | #include 8 | namespace po = boost::program_options; 9 | 10 | struct vw; 11 | 12 | class arguments { 13 | po::options_description new_od;//a set of options 14 | po::variables_map add_options_skip_duplicates(po::options_description& opts, bool do_notify); 15 | bool missing_critical; 16 | 17 | std::string long_only(const char* in) 18 | {//strip off a trailing short option 19 | char* maybe = strchr(const_cast(in), ','); 20 | if (maybe==nullptr) 21 | return std::string(in); 22 | else 23 | return std::string(in,maybe-in); 24 | } 25 | 26 | public: 27 | po::options_description all_opts; //All specified options. 28 | po::options_description opts; //Critical options and their dependencies. 29 | vw_ostream trace_message;//error reporting 30 | std::stringstream* file_options; // the set of options to store in the model file. 31 | po::variables_map vm; //A stored map from option to value. 32 | std::vector args;//All arguments 33 | vw* all;//backdoor that should go away over time. 34 | 35 | //initialization 36 | arguments(vw& all_in, std::string name_in=""):new_od(name_in), missing_critical(false), all(&all_in) {file_options = new std::stringstream;}; 37 | arguments():missing_critical(false){};//this should not be used but appears sometimes unavoidable. 
Do an in-place allocation with the upper initializer after it is used. 38 | ~arguments(){ delete file_options;}; 39 | 40 | //reinitialization 41 | arguments& new_options(std::string name_in="") 42 | { 43 | (&new_od)->~options_description();//in place delete 44 | new (&new_od) po::options_description(name_in); 45 | missing_critical=false; 46 | return *this; 47 | } 48 | 49 | //insert arguments 50 | arguments& operator()(const char* option, const char* description) 51 | { 52 | new_od.add_options()(option, description); 53 | return *this; 54 | } 55 | arguments& operator()(bool& exists, const char* option, const char* description) 56 | { return operator()(option, po::bool_switch(&exists), description); } 57 | template arguments& operator()(const char* option, T& location, const char* description) 58 | { return operator()(option, po::value(&location), description); } 59 | template arguments& operator()(const char* option, T& location, T def, const char* description) 60 | { return operator()(option, po::value(&location)->default_value(def), description); } 61 | arguments& operator()(const char* option, const po::value_semantic* type, const char* description) 62 | { 63 | new_od.add_options()(option, type, description); 64 | return *this; 65 | } 66 | //A keep option is kept in the model. 
67 | template arguments& keep(const char* option, T& store, const char* description) 68 | { return keep(option, po::value(&store), description); } 69 | template arguments& keep(const char* option, T& store, T def, const char* description) 70 | { 71 | return operator()(option, 72 | po::value(&store)->default_value(def) 73 | ->notifier([this, option, def] (T arg) 74 | { 75 | *this->file_options << " --" << long_only(option) << " " << arg; 76 | }), 77 | description); 78 | } 79 | template arguments& keep(const char* option, po::typed_value* type, const char* description) 80 | { 81 | return operator()(option, 82 | type->notifier([this, option] (T arg) 83 | { *this->file_options << " --" << long_only(option) << " " << arg; }), 84 | description); 85 | } 86 | template arguments& keep_vector(const char* option, po::typed_value>* type, const char* description) 87 | { 88 | return operator()(option, 89 | type->multitoken()->composing() 90 | ->notifier([this, option] (std::vector arg) 91 | { 92 | for (auto i : arg) 93 | *this->file_options << " --" << long_only(option) << " " << i; 94 | }), 95 | description); 96 | } 97 | arguments& keep(bool& exists, const char* option, const char* description) 98 | { 99 | return operator()(option, 100 | po::bool_switch(&exists) 101 | ->notifier([this, option] (bool v) 102 | { if (v) *this->file_options << " --" << long_only(option); }), 103 | description); 104 | } 105 | arguments& keep(const char* option, const char* description) 106 | { 107 | bool temp=false; 108 | return keep(temp, option, description); 109 | } 110 | 111 | //A missing critical argument raises the missing flag. Critical implies keep. 
112 | template arguments& critical(const char* option, T& store, const char* description) 113 | { return critical(option, po::value(&store), description); } 114 | template arguments& critical(const char* option, po::typed_value* type, const char* description) 115 | { 116 | keep(option, type, description); 117 | missing(); 118 | new_options(); 119 | missing_critical = !vm.count(option); 120 | return *this; 121 | } 122 | template arguments& critical_vector(const char* option, po::typed_value>* type, const char* description, bool keep = true) 123 | { 124 | if (keep) 125 | keep_vector(option, type, description); 126 | else 127 | operator()(option, type->multitoken()->composing(), description); 128 | missing(); 129 | new_options(); 130 | missing_critical = !vm.count(option); 131 | return *this; 132 | } 133 | template arguments& critical(const char* option, const char* description) 134 | { return critical(option, po::value(), description); } 135 | arguments& critical(const char* option, const char* description) 136 | { 137 | keep(option, description); 138 | missing(); 139 | new_options(); 140 | missing_critical = !vm[option].as(); 141 | return *this; 142 | } 143 | 144 | bool missing() //Return true if key options are missing. 
145 | { 146 | all_opts.add(new_od); 147 | if (!missing_critical) 148 | { 149 | opts.add(new_od); //compile options 150 | auto new_vm = add_options_skip_duplicates(new_od, true);//do notify 151 | for (auto& it : new_vm) 152 | vm.insert(it); 153 | } 154 | return missing_critical; 155 | } 156 | }; 157 | -------------------------------------------------------------------------------- /src/helpers.cpp: -------------------------------------------------------------------------------- 1 | #include "vw.h" 2 | 3 | #include 4 | #include "helpers.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | extern "C" { 21 | #include "md5.h" 22 | } 23 | 24 | #ifdef _WIN32 25 | #define PATH_SEPARATOR '\\' 26 | #else 27 | #define PATH_SEPARATOR '/' 28 | #endif 29 | 30 | 31 | // Based on code from R digest package http://dirk.eddelbuettel.com/code/digest.html 32 | // Copyright (C) 2003 - 2016 Dirk Eddelbuettel 33 | std::string md5sum(char * char_x, uint32_t nChar) { 34 | char output[33+1]; 35 | md5_context ctx; 36 | unsigned char md5sum[16]; 37 | int j; 38 | md5_starts( &ctx ); 39 | md5_update( &ctx, (uint8 *) char_x, nChar); 40 | md5_finish( &ctx, md5sum ); 41 | memcpy(output, md5sum, 16); 42 | 43 | for (j = 0; j < 16; j++) 44 | sprintf(output + j * 2, "%02x", md5sum[j]); 45 | 46 | std::string output_str(output); 47 | 48 | return(output_str); 49 | } 50 | 51 | Rcpp::String check_data(Rcpp::List & vwmodel, std::string & valid_data_str, SEXP data, bool quiet, std::string mode, 52 | Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, 53 | Rcpp::Nullable fixed, 54 | Rcpp::Nullable targets, Rcpp::Nullable probabilities, 55 | Rcpp::Nullable weight, Rcpp::Nullable base, 56 | Rcpp::Nullable tag, Rcpp::Nullable multiline) { 57 | 58 | // Check if: 59 | // no previous parser options are in the model 60 | // OR 61 | // any of passed parser options is NOT NULL (possible when using 
"predict.vw" function) 62 | if(Rf_isNull(vwmodel["parser_opts"]) || (!Rf_isNull(namespaces) || !Rf_isNull(keep_space) || !Rf_isNull(fixed) || !Rf_isNull(targets) || 63 | !Rf_isNull(probabilities) || !Rf_isNull(weight) || !Rf_isNull(base) || 64 | !Rf_isNull(tag) || !Rf_isNull(multiline))) { 65 | 66 | // In this case we want to use parser options that were passed with "vwtest" so we save parser options 67 | vwmodel["parser_opts"] = Rcpp::List::create(Rcpp::Named("namespaces") = namespaces , Rcpp::Named("keep_space") = keep_space, 68 | Rcpp::Named("fixed") = fixed, 69 | Rcpp::Named("targets") = targets, Rcpp::Named("probabilities") = probabilities, 70 | Rcpp::Named("weight") = weight, Rcpp::Named("base") = base, 71 | Rcpp::Named("tag") = tag, Rcpp::Named("multiline") = multiline); 72 | 73 | } else { 74 | // In this case we use previously saved parser options 75 | if(!quiet){ 76 | Rcpp::Rcout << "Using parser options from the previous session" << std::endl; 77 | } 78 | } 79 | 80 | Rcpp::String data_md5sum(""); 81 | uint32_t nChar; 82 | char * char_x; 83 | if(TYPEOF(data) == STRSXP) { 84 | // Use path to file as model input 85 | valid_data_str = Rcpp::as(data); 86 | 87 | // Check path for whitespace 88 | if(valid_data_str.find_first_of("\t\n ") != valid_data_str.npos) { 89 | Rcpp::stop("Whitespace characters are not allowed in `data` path"); 90 | } 91 | 92 | std::ifstream data_instream(valid_data_str); 93 | std::string data_contents((std::istreambuf_iterator(data_instream)), 94 | std::istreambuf_iterator()); 95 | 96 | char_x = &data_contents[0u]; 97 | nChar = data_contents.length(); 98 | data_md5sum = md5sum(char_x, nChar); 99 | } else if(TYPEOF(data) == VECSXP) { 100 | // Parse data frame and use VW file as model input 101 | 102 | // Update valid data string 103 | valid_data_str = Rcpp::as(vwmodel["dir"]) + PATH_SEPARATOR + mode + ".vw"; 104 | // Compute md5sum of data.frame 105 | Rcpp::RawVector x = serializeToRaw(data); 106 | char_x = (char*) RAW(x); 107 | nChar = 
XLENGTH(x); 108 | 109 | data_md5sum = md5sum(char_x, nChar); 110 | 111 | // Compare new md5sum with old md5sum 112 | Rcpp::List vwmodel_md5sums = vwmodel["data_md5sum"]; 113 | Rcpp::String model_md5sum = vwmodel_md5sums[mode]; 114 | 115 | 116 | if (model_md5sum != data_md5sum) { 117 | if(!quiet){ 118 | Rcpp::Rcout << "Converting data.frame to VW format" << std::endl; 119 | } 120 | Rcpp::Environment env("package:rvw"); 121 | Rcpp::Function r_df2vw = env["df2vw"]; 122 | // Convert data.frame to VW 123 | 124 | Rcpp::List saved_parser_opts = vwmodel["parser_opts"]; 125 | 126 | // Rcpp::Nullable valid_namespaces = saved_parser_opts["namespaces"]; 127 | // Rcpp::Nullable valid_keep_space = saved_parser_opts["keep_space"]; 128 | // Rcpp::Nullable valid_targets = saved_parser_opts["targets"]; 129 | // Rcpp::Nullable valid_probabilities = saved_parser_opts["probabilities"]; 130 | // Rcpp::Nullable valid_weight = saved_parser_opts["weight"]; 131 | // Rcpp::Nullable valid_base = saved_parser_opts["base"]; 132 | // Rcpp::Nullable valid_tag = saved_parser_opts["tag"]; 133 | // Rcpp::Nullable valid_multiline = saved_parser_opts["multiline"]; 134 | 135 | r_df2vw(data, valid_data_str, 136 | saved_parser_opts["namespaces"], saved_parser_opts["keep_space"], saved_parser_opts["fixed"], 137 | saved_parser_opts["targets"], saved_parser_opts["probabilities"], 138 | saved_parser_opts["weight"], saved_parser_opts["base"], saved_parser_opts["tag"], saved_parser_opts["multiline"], 139 | false 140 | ); 141 | } 142 | 143 | } else { 144 | Rcpp::stop("Only String and data.frame types are supported"); 145 | } 146 | return data_md5sum; 147 | } 148 | 149 | // Get number of examples used in model 150 | int get_num_example(vw& all) { 151 | return all.sd->example_number + all.sd->weighted_holdout_examples; 152 | } 153 | 154 | bool file_exists(std::string file_name) 155 | { 156 | std::ifstream infile (file_name.c_str()); 157 | return infile.good(); 158 | } 159 | 160 | // setup function from VW main.cc 
file 161 | // modified to work in library mode using Rcpp 162 | vw* setup_model(std::string args_str) { 163 | 164 | int argc; 165 | char** argv = VW::get_argv_from_string(args_str, argc); 166 | 167 | vw* all = nullptr; 168 | try { all = VW::initialize(argc, argv); 169 | } 170 | catch(const VW::vw_exception& ex){ 171 | Rcpp::Rcout << ex.what() << std::endl; 172 | throw; 173 | } 174 | catch(...) 175 | { 176 | Rcpp::Rcout << "unknown exception" << std::endl; 177 | throw; 178 | } 179 | 180 | if (!all->quiet && !all->bfgs && !all->searchstr && !all->opts_n_args.vm.count("audit_regressor")) 181 | { 182 | Rcpp::Rcout << std::left 183 | << std::setw(shared_data::col_avg_loss) << std::left << "average" 184 | << " " 185 | << std::setw(shared_data::col_since_last) << std::left << "since" 186 | << " " 187 | << std::right 188 | << std::setw(shared_data::col_example_counter) << "example" 189 | << " " 190 | << std::setw(shared_data::col_example_weight) << "example" 191 | << " " 192 | << std::setw(shared_data::col_current_label) << "current" 193 | << " " 194 | << std::setw(shared_data::col_current_predict) << "current" 195 | << " " 196 | << std::setw(shared_data::col_current_features) << "current" 197 | << std::endl; 198 | Rcpp::Rcout << std::left 199 | << std::setw(shared_data::col_avg_loss) << std::left << "loss" 200 | << " " 201 | << std::setw(shared_data::col_since_last) << std::left << "last" 202 | << " " 203 | << std::right 204 | << std::setw(shared_data::col_example_counter) << "counter" 205 | << " " 206 | << std::setw(shared_data::col_example_weight) << "weight" 207 | << " " 208 | << std::setw(shared_data::col_current_label) << "label" 209 | << " " 210 | << std::setw(shared_data::col_current_predict) << "predict" 211 | << " " 212 | << std::setw(shared_data::col_current_features) << "features" 213 | << std::endl; 214 | } 215 | 216 | return all; 217 | } 218 | 219 | // Collect final performance evaluation results 220 | Rcpp::List get_eval(vw& all) 221 | { 222 | int 
num_examples = all.sd->example_number; 223 | double weighted_example_sum = all.sd->weighted_examples(); 224 | double weighted_label_sum = all.sd->weighted_labels; 225 | double avg_loss = NA_REAL; 226 | double avg_multiclass_log_loss = NA_REAL; 227 | float best_const = NA_REAL; 228 | float best_const_loss = NA_REAL; 229 | int total_feature = all.sd->total_features; 230 | 231 | if(all.holdout_set_off) { 232 | if (all.sd->weighted_labeled_examples > 0) { 233 | avg_loss = all.sd->sum_loss / all.sd->weighted_labeled_examples; 234 | } else { 235 | avg_loss = NA_REAL; 236 | } 237 | } else if((all.sd->holdout_best_loss == FLT_MAX) || (all.sd->holdout_best_loss == FLT_MAX * 0.5)) { 238 | avg_loss = NA_REAL; 239 | } else { 240 | avg_loss = all.sd->holdout_best_loss; 241 | } 242 | if (all.sd->report_multiclass_log_loss) 243 | { 244 | if (all.holdout_set_off) { 245 | avg_multiclass_log_loss = all.sd->multiclass_log_loss / all.sd->weighted_labeled_examples; 246 | } else { 247 | avg_multiclass_log_loss = all.sd->holdout_multiclass_log_loss / all.sd->weighted_labeled_examples; 248 | } 249 | } 250 | // Get best_const and best_const_loss 251 | copy_get_best_constant(all, best_const, best_const_loss); 252 | 253 | Rcpp::List eval_list = Rcpp::List::create( 254 | Rcpp::Named("num_examples") = num_examples, 255 | Rcpp::Named("weighted_example_sum") = weighted_example_sum, 256 | Rcpp::Named("weighted_label_sum") = weighted_label_sum, 257 | Rcpp::Named("avg_loss") = avg_loss, 258 | Rcpp::Named("avg_multiclass_log_loss") = avg_multiclass_log_loss, 259 | Rcpp::Named("best_const") = best_const, 260 | Rcpp::Named("best_const_loss") = best_const_loss, 261 | Rcpp::Named("total_feature") = total_feature 262 | ); 263 | 264 | return(eval_list); 265 | } 266 | 267 | // Copy of get_best_constant function from best_constant.cc file 268 | bool copy_get_best_constant(vw& all, float& best_constant, float& best_constant_loss) 269 | { 270 | if (all.sd->first_observed_label == FLT_MAX || // no non-test 
labels observed or function was never called 271 | (all.loss == nullptr) || (all.sd == nullptr)) return false; 272 | 273 | float label1 = all.sd->first_observed_label; // observed labels might be inside [sd->Min_label, sd->Max_label], so can't use Min/Max 274 | float label2 = (all.sd->second_observed_label == FLT_MAX)?0: all.sd->second_observed_label; // if only one label observed, second might be 0 275 | if (label1 > label2) {float tmp = label1; label1 = label2; label2 = tmp;} // as don't use min/max - make sure label1 < label2 276 | 277 | float label1_cnt; 278 | float label2_cnt; 279 | 280 | if (label1 != label2) 281 | { 282 | label1_cnt = (float) (all.sd->weighted_labels - label2*all.sd->weighted_labeled_examples)/(label1 - label2); 283 | label2_cnt = (float)all.sd->weighted_labeled_examples - label1_cnt; 284 | } 285 | else 286 | return false; 287 | 288 | if ( (label1_cnt + label2_cnt) <= 0.) return false; 289 | 290 | 291 | po::variables_map& vm = all.opts_n_args.vm; 292 | 293 | std::string funcName; 294 | if(vm.count("loss_function")) 295 | funcName = vm["loss_function"].as(); 296 | else 297 | funcName = "squared"; 298 | 299 | if(funcName.compare("squared") == 0 || funcName.compare("Huber") == 0 || funcName.compare("classic") == 0) 300 | best_constant = (float) all.sd->weighted_labels / (float) (all.sd->weighted_labeled_examples); 301 | else if (all.sd->is_more_than_two_labels_observed) 302 | { 303 | //loss functions below don't have generic formuas for constant yet. 304 | return false; 305 | 306 | } 307 | else if(funcName.compare("hinge") == 0) 308 | { 309 | 310 | best_constant = label2_cnt <= label1_cnt ? 
-1.f: 1.f; 311 | 312 | } 313 | else if(funcName.compare("logistic") == 0) 314 | { 315 | 316 | label1 = -1.; //override {-50, 50} to get proper loss 317 | label2 = 1.; 318 | 319 | if (label1_cnt <= 0) best_constant = 1.; 320 | else if (label2_cnt <= 0) best_constant = -1.; 321 | else 322 | best_constant = std::log(label2_cnt/label1_cnt); 323 | 324 | } 325 | else if(funcName.compare("quantile") == 0 || funcName.compare("pinball") == 0 || funcName.compare("absolute") == 0) 326 | { 327 | 328 | float tau = 0.5; 329 | if(vm.count("quantile_tau")) 330 | tau = vm["quantile_tau"].as(); 331 | 332 | float q = tau*(label1_cnt + label2_cnt); 333 | if (q < label2_cnt) best_constant = label2; 334 | else best_constant = label1; 335 | } 336 | else 337 | return false; 338 | 339 | if (!all.sd->is_more_than_two_labels_observed) 340 | { 341 | best_constant_loss = (label1_cnt>0)?all.loss->getLoss(all.sd, best_constant, label1) * label1_cnt:0.0f; 342 | best_constant_loss += (label2_cnt>0)?all.loss->getLoss(all.sd, best_constant, label2) * label2_cnt:0.0f; 343 | best_constant_loss /= label1_cnt + label2_cnt; 344 | } 345 | else best_constant_loss = FLT_MIN; 346 | 347 | return true; 348 | } 349 | 350 | std::vector split_str(const std::string &s, char del) { 351 | std::stringstream s_stream(s); 352 | std::string item; 353 | std::vector elems; 354 | while ( getline(s_stream, item, del) ) { 355 | elems.push_back(item); 356 | } 357 | return elems; 358 | } 359 | -------------------------------------------------------------------------------- /src/helpers.h: -------------------------------------------------------------------------------- 1 | #include "vw.h" 2 | 3 | #include 4 | 5 | 6 | 7 | 8 | // Helper functions 9 | 10 | // Check if data from vwmodel should be used or from function arguments 11 | Rcpp::String check_data(Rcpp::List & vwmodel, std::string & valid_data_str, SEXP data, bool quiet, std::string mode="train", 12 | Rcpp::Nullable namespaces=R_NilValue, Rcpp::Nullable 
keep_space=R_NilValue, 13 | Rcpp::Nullable fixed=R_NilValue, 14 | Rcpp::Nullable targets=R_NilValue, Rcpp::Nullable probabilities=R_NilValue, 15 | Rcpp::Nullable weight=R_NilValue, Rcpp::Nullable base=R_NilValue, 16 | Rcpp::Nullable tag=R_NilValue, Rcpp::Nullable multiline=R_NilValue); 17 | 18 | // Get number of examples used in model 19 | int get_num_example(vw& all); 20 | 21 | // Custom driver to test example creation using libvw 22 | void custom_driver(vw& model, std::string & file_path); 23 | 24 | bool file_exists(std::string file_name); 25 | 26 | // setup function from VW main.cc file 27 | // modified to work in library mode using Rcpp 28 | vw* setup_model(std::string args_str); 29 | 30 | // Collect final performance evaluation results 31 | Rcpp::List get_eval(vw& all); 32 | 33 | // Copy of get_best_constant function from best_constant.cc file 34 | bool copy_get_best_constant(vw& all, float& best_constant, float& best_constant_loss); 35 | 36 | std::vector split_str(const std::string &s, char del); 37 | -------------------------------------------------------------------------------- /src/md5.c: -------------------------------------------------------------------------------- 1 | /* 2 | * RFC 1321 compliant MD5 implementation, 3 | * by Christophe Devine ; 4 | * this program is licensed under the GPL. 
5 | */ 6 | 7 | #include 8 | 9 | #include "md5.h" 10 | 11 | #define GET_UINT32(n,b,i) \ 12 | { \ 13 | (n) = ( (uint32) (b)[(i) ] ) \ 14 | | ( (uint32) (b)[(i) + 1] << 8 ) \ 15 | | ( (uint32) (b)[(i) + 2] << 16 ) \ 16 | | ( (uint32) (b)[(i) + 3] << 24 ); \ 17 | } 18 | 19 | #define PUT_UINT32(n,b,i) \ 20 | { \ 21 | (b)[(i) ] = (uint8) ( (n) ); \ 22 | (b)[(i) + 1] = (uint8) ( (n) >> 8 ); \ 23 | (b)[(i) + 2] = (uint8) ( (n) >> 16 ); \ 24 | (b)[(i) + 3] = (uint8) ( (n) >> 24 ); \ 25 | } 26 | 27 | void md5_starts( md5_context *ctx ) 28 | { 29 | ctx->total[0] = 0; 30 | ctx->total[1] = 0; 31 | 32 | ctx->state[0] = 0x67452301; 33 | ctx->state[1] = 0xEFCDAB89; 34 | ctx->state[2] = 0x98BADCFE; 35 | ctx->state[3] = 0x10325476; 36 | } 37 | 38 | void md5_process( md5_context *ctx, uint8 data[64] ) 39 | { 40 | uint32 X[16], A, B, C, D; 41 | 42 | GET_UINT32( X[0], data, 0 ); 43 | GET_UINT32( X[1], data, 4 ); 44 | GET_UINT32( X[2], data, 8 ); 45 | GET_UINT32( X[3], data, 12 ); 46 | GET_UINT32( X[4], data, 16 ); 47 | GET_UINT32( X[5], data, 20 ); 48 | GET_UINT32( X[6], data, 24 ); 49 | GET_UINT32( X[7], data, 28 ); 50 | GET_UINT32( X[8], data, 32 ); 51 | GET_UINT32( X[9], data, 36 ); 52 | GET_UINT32( X[10], data, 40 ); 53 | GET_UINT32( X[11], data, 44 ); 54 | GET_UINT32( X[12], data, 48 ); 55 | GET_UINT32( X[13], data, 52 ); 56 | GET_UINT32( X[14], data, 56 ); 57 | GET_UINT32( X[15], data, 60 ); 58 | 59 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 60 | 61 | #define P(a,b,c,d,k,s,t) \ 62 | { \ 63 | a += F(b,c,d) + X[k] + t; a = S(a,s) + b; \ 64 | } 65 | 66 | A = ctx->state[0]; 67 | B = ctx->state[1]; 68 | C = ctx->state[2]; 69 | D = ctx->state[3]; 70 | 71 | #define F(x,y,z) (z ^ (x & (y ^ z))) 72 | 73 | P( A, B, C, D, 0, 7, 0xD76AA478 ); 74 | P( D, A, B, C, 1, 12, 0xE8C7B756 ); 75 | P( C, D, A, B, 2, 17, 0x242070DB ); 76 | P( B, C, D, A, 3, 22, 0xC1BDCEEE ); 77 | P( A, B, C, D, 4, 7, 0xF57C0FAF ); 78 | P( D, A, B, C, 5, 12, 0x4787C62A ); 79 | P( C, D, A, B, 6, 17, 
0xA8304613 ); 80 | P( B, C, D, A, 7, 22, 0xFD469501 ); 81 | P( A, B, C, D, 8, 7, 0x698098D8 ); 82 | P( D, A, B, C, 9, 12, 0x8B44F7AF ); 83 | P( C, D, A, B, 10, 17, 0xFFFF5BB1 ); 84 | P( B, C, D, A, 11, 22, 0x895CD7BE ); 85 | P( A, B, C, D, 12, 7, 0x6B901122 ); 86 | P( D, A, B, C, 13, 12, 0xFD987193 ); 87 | P( C, D, A, B, 14, 17, 0xA679438E ); 88 | P( B, C, D, A, 15, 22, 0x49B40821 ); 89 | 90 | #undef F 91 | 92 | #define F(x,y,z) (y ^ (z & (x ^ y))) 93 | 94 | P( A, B, C, D, 1, 5, 0xF61E2562 ); 95 | P( D, A, B, C, 6, 9, 0xC040B340 ); 96 | P( C, D, A, B, 11, 14, 0x265E5A51 ); 97 | P( B, C, D, A, 0, 20, 0xE9B6C7AA ); 98 | P( A, B, C, D, 5, 5, 0xD62F105D ); 99 | P( D, A, B, C, 10, 9, 0x02441453 ); 100 | P( C, D, A, B, 15, 14, 0xD8A1E681 ); 101 | P( B, C, D, A, 4, 20, 0xE7D3FBC8 ); 102 | P( A, B, C, D, 9, 5, 0x21E1CDE6 ); 103 | P( D, A, B, C, 14, 9, 0xC33707D6 ); 104 | P( C, D, A, B, 3, 14, 0xF4D50D87 ); 105 | P( B, C, D, A, 8, 20, 0x455A14ED ); 106 | P( A, B, C, D, 13, 5, 0xA9E3E905 ); 107 | P( D, A, B, C, 2, 9, 0xFCEFA3F8 ); 108 | P( C, D, A, B, 7, 14, 0x676F02D9 ); 109 | P( B, C, D, A, 12, 20, 0x8D2A4C8A ); 110 | 111 | #undef F 112 | 113 | #define F(x,y,z) (x ^ y ^ z) 114 | 115 | P( A, B, C, D, 5, 4, 0xFFFA3942 ); 116 | P( D, A, B, C, 8, 11, 0x8771F681 ); 117 | P( C, D, A, B, 11, 16, 0x6D9D6122 ); 118 | P( B, C, D, A, 14, 23, 0xFDE5380C ); 119 | P( A, B, C, D, 1, 4, 0xA4BEEA44 ); 120 | P( D, A, B, C, 4, 11, 0x4BDECFA9 ); 121 | P( C, D, A, B, 7, 16, 0xF6BB4B60 ); 122 | P( B, C, D, A, 10, 23, 0xBEBFBC70 ); 123 | P( A, B, C, D, 13, 4, 0x289B7EC6 ); 124 | P( D, A, B, C, 0, 11, 0xEAA127FA ); 125 | P( C, D, A, B, 3, 16, 0xD4EF3085 ); 126 | P( B, C, D, A, 6, 23, 0x04881D05 ); 127 | P( A, B, C, D, 9, 4, 0xD9D4D039 ); 128 | P( D, A, B, C, 12, 11, 0xE6DB99E5 ); 129 | P( C, D, A, B, 15, 16, 0x1FA27CF8 ); 130 | P( B, C, D, A, 2, 23, 0xC4AC5665 ); 131 | 132 | #undef F 133 | 134 | #define F(x,y,z) (y ^ (x | ~z)) 135 | 136 | P( A, B, C, D, 0, 6, 0xF4292244 ); 137 | P( D, A, B, C, 7, 
10, 0x432AFF97 ); 138 | P( C, D, A, B, 14, 15, 0xAB9423A7 ); 139 | P( B, C, D, A, 5, 21, 0xFC93A039 ); 140 | P( A, B, C, D, 12, 6, 0x655B59C3 ); 141 | P( D, A, B, C, 3, 10, 0x8F0CCC92 ); 142 | P( C, D, A, B, 10, 15, 0xFFEFF47D ); 143 | P( B, C, D, A, 1, 21, 0x85845DD1 ); 144 | P( A, B, C, D, 8, 6, 0x6FA87E4F ); 145 | P( D, A, B, C, 15, 10, 0xFE2CE6E0 ); 146 | P( C, D, A, B, 6, 15, 0xA3014314 ); 147 | P( B, C, D, A, 13, 21, 0x4E0811A1 ); 148 | P( A, B, C, D, 4, 6, 0xF7537E82 ); 149 | P( D, A, B, C, 11, 10, 0xBD3AF235 ); 150 | P( C, D, A, B, 2, 15, 0x2AD7D2BB ); 151 | P( B, C, D, A, 9, 21, 0xEB86D391 ); 152 | 153 | #undef F 154 | 155 | ctx->state[0] += A; 156 | ctx->state[1] += B; 157 | ctx->state[2] += C; 158 | ctx->state[3] += D; 159 | } 160 | 161 | void md5_update( md5_context *ctx, uint8 *input, uint32 length ) 162 | { 163 | uint32 left, fill; 164 | 165 | if( ! length ) return; 166 | 167 | left = ctx->total[0] & 0x3F; 168 | fill = 64 - left; 169 | 170 | ctx->total[0] += length; 171 | ctx->total[0] &= 0xFFFFFFFF; 172 | 173 | if( ctx->total[0] < length ) 174 | ctx->total[1]++; /* #nocov */ 175 | 176 | if( left && length >= fill ) 177 | { 178 | memcpy( (void *) (ctx->buffer + left), 179 | (void *) input, fill ); 180 | md5_process( ctx, ctx->buffer ); 181 | length -= fill; 182 | input += fill; 183 | left = 0; 184 | } 185 | 186 | while( length >= 64 ) 187 | { 188 | md5_process( ctx, input ); 189 | length -= 64; 190 | input += 64; 191 | } 192 | 193 | if( length ) 194 | { 195 | memcpy( (void *) (ctx->buffer + left), 196 | (void *) input, length ); 197 | } 198 | } 199 | 200 | static uint8 md5_padding[64] = 201 | { 202 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 203 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 204 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 206 | }; 207 | 208 | void md5_finish( md5_context *ctx, uint8 digest[16] ) 209 | { 210 | uint32 last, padn; 211 | uint32 high, low; 212 | uint8 
msglen[8]; 213 | 214 | high = ( ctx->total[0] >> 29 ) 215 | | ( ctx->total[1] << 3 ); 216 | low = ( ctx->total[0] << 3 ); 217 | 218 | PUT_UINT32( low, msglen, 0 ); 219 | PUT_UINT32( high, msglen, 4 ); 220 | 221 | last = ctx->total[0] & 0x3F; 222 | padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last ); 223 | 224 | md5_update( ctx, md5_padding, padn ); 225 | md5_update( ctx, msglen, 8 ); 226 | 227 | PUT_UINT32( ctx->state[0], digest, 0 ); 228 | PUT_UINT32( ctx->state[1], digest, 4 ); 229 | PUT_UINT32( ctx->state[2], digest, 8 ); 230 | PUT_UINT32( ctx->state[3], digest, 12 ); 231 | } 232 | 233 | #ifdef TEST 234 | 235 | #include 236 | #include 237 | 238 | /* 239 | * those are the standard RFC 1321 test vectors 240 | */ 241 | 242 | static char *msg[] = 243 | { 244 | "", 245 | "a", 246 | "abc", 247 | "message digest", 248 | "abcdefghijklmnopqrstuvwxyz", 249 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 250 | "12345678901234567890123456789012345678901234567890123456789012" \ 251 | "345678901234567890" 252 | }; 253 | 254 | static char *val[] = 255 | { 256 | "d41d8cd98f00b204e9800998ecf8427e", 257 | "0cc175b9c0f1b6a831c399e269772661", 258 | "900150983cd24fb0d6963f7d28e17f72", 259 | "f96b697d7cb7938d525a2f31aaf161d0", 260 | "c3fcd3d76192e4007dfb496cca67e13b", 261 | "d174ab98d277d9f5a5611c2c9f419d9f", 262 | "57edf4a22be3c955ac49da2e2107b67a" 263 | }; 264 | 265 | int main( int argc, char *argv[] ) 266 | { 267 | FILE *f; 268 | int i, j; 269 | char output[33]; 270 | md5_context ctx; 271 | unsigned char buf[1000]; 272 | unsigned char md5sum[16]; 273 | 274 | if( argc < 2 ) 275 | { 276 | printf( "\n MD5 Validation Tests:\n\n" ); 277 | 278 | for( i = 0; i < 7; i++ ) 279 | { 280 | printf( " Test %d ", i + 1 ); 281 | 282 | md5_starts( &ctx ); 283 | md5_update( &ctx, (uint8 *) msg[i], strlen( msg[i] ) ); 284 | md5_finish( &ctx, md5sum ); 285 | 286 | for( j = 0; j < 16; j++ ) 287 | { 288 | sprintf( output + j * 2, "%02x", md5sum[j] ); 289 | } 290 | 291 | if( 
memcmp( output, val[i], 32 ) ) 292 | { 293 | printf( "failed!\n" ); 294 | return( 1 ); 295 | } 296 | 297 | printf( "passed.\n" ); 298 | } 299 | 300 | printf( "\n" ); 301 | } 302 | else 303 | { 304 | if( ! ( f = fopen( argv[1], "rb" ) ) ) 305 | { 306 | perror( "fopen" ); 307 | return( 1 ); 308 | } 309 | 310 | md5_starts( &ctx ); 311 | 312 | while( ( i = fread( buf, 1, sizeof( buf ), f ) ) > 0 ) 313 | { 314 | md5_update( &ctx, buf, i ); 315 | } 316 | 317 | md5_finish( &ctx, md5sum ); 318 | 319 | for( j = 0; j < 16; j++ ) 320 | { 321 | printf( "%02x", md5sum[j] ); 322 | } 323 | 324 | printf( " %s\n", argv[1] ); 325 | } 326 | 327 | return( 0 ); 328 | } 329 | 330 | #endif 331 | -------------------------------------------------------------------------------- /src/md5.h: -------------------------------------------------------------------------------- 1 | #ifndef _MD5_H 2 | #define _MD5_H 3 | 4 | #ifndef uint8 5 | #define uint8 unsigned char 6 | #endif 7 | 8 | #ifndef uint32 9 | #define uint32 unsigned long int 10 | #endif 11 | 12 | typedef struct 13 | { 14 | uint32 total[2]; 15 | uint32 state[4]; 16 | uint8 buffer[64]; 17 | } 18 | md5_context; 19 | 20 | void md5_starts( md5_context *ctx ); 21 | void md5_update( md5_context *ctx, uint8 *input, uint32 length ); 22 | void md5_finish( md5_context *ctx, uint8 digest[16] ); 23 | 24 | #endif /* md5.h */ 25 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rvw) 3 | 4 | test_check("rvw") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-cmdline.R: -------------------------------------------------------------------------------- 1 | context("Check predictions against command line version of VW") 2 | library(rvw) 3 | 4 | # Switch to temporary directory 5 | curr_dir <- getwd() 6 | setwd(tempdir()) 7 | 8 | 
ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw")
ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw")

lda_data <- system.file("extdata", "lda_data.vw", package = "rvw")

multiclass_train_data <- system.file("extdata", "multiclass_train.vw", package = "rvw")
multiclass_test_data <- system.file("extdata", "multiclass_valid.vw", package = "rvw")

test_that("empty vwsetup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw"
    )
    # T/F replaced by TRUE/FALSE throughout this file (T and F are reassignable)
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session
    system(
        paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum, tolerance=1e-7)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("nn vwsetup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "nn",
        num_hidden = 4
    )
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session
    system(
        paste0("vw --nn 4 -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw --nn 4 -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})


test_that("vwsetup with custom arguments and cache works as CL version", {
    # Skip for now, because not yet finished fix for this case
    # skip("Problem is being fixed")
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        general_params = list(random_seed = 42, loss_function="logistic", link="logistic"),
        feature_params = list(bit_precision=20, ngram="A2", noconstant=TRUE),
        optimization_params = list(adaptive=FALSE, l1=1E-8)
    )
    test_vwmodel <- add_option(test_vwmodel, option = "boosting", num_learners=4)
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE, passes = 10)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw","binary_train.vw.cache", "binary_valid.vw.cache")

    # Command Line session
    system(
        paste0("vw --random_seed 42 --loss_function logistic --link logistic ",
               "--bit_precision 20 --ngram A2 --noconstant --l1 1e-08 --boosting 4 --passes 10 -c",
               " -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw","binary_train.vw.cache", "binary_valid.vw.cache")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("Updating model with new data works as CL version", {
    test_dir <- getwd()
    # Package session
    pk_mdl_file <- paste0(test_dir, "/", "pk_mdl.vw")
    test_vwmodel <- vwsetup(
        dir = test_dir,
        model = "pk_mdl.vw"
    )
    vwtrain(test_vwmodel, data = ext_train_data, update_model = TRUE, quiet = TRUE)
    vw_pk_initial_mdl_checksum <- unname(tools::md5sum(pk_mdl_file))
    vwtrain(test_vwmodel, data = ext_test_data, update_model = TRUE, quiet = TRUE)
    vw_pk_updated_mdl_checksum <- unname(tools::md5sum(pk_mdl_file))
    # NOTE(review): S3 method called directly; prefer predict(test_vwmodel, ...)
    vw_pk_output <- predict.vw(test_vwmodel, data = ext_test_data, quiet = TRUE)
    file.remove(pk_mdl_file)

    # Command Line session
    cl_mdl_file <- paste0(test_dir, "/", "cl_mdl.vw")
    system(
        paste0("vw",
               " -d ", ext_train_data, " -f ", cl_mdl_file, " --save_resume --quiet"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_initial_mdl_checksum <- unname(tools::md5sum(cl_mdl_file))
    system(
        paste0("vw",
               " -d ", ext_test_data, " -i ", cl_mdl_file, " -f ", cl_mdl_file, " --save_resume --quiet"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_updated_mdl_checksum <- unname(tools::md5sum(cl_mdl_file))
    vw_cl_output <- as.numeric(
        system(
            paste0("vw",
                   " -t -d ", ext_test_data, " -i ", cl_mdl_file, " -p /dev/stdout --quiet"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    file.remove(cl_mdl_file)

    # Results comparison
    expect_equal(vw_pk_initial_mdl_checksum, vw_cl_initial_mdl_checksum)
    expect_equal(vw_pk_updated_mdl_checksum, vw_cl_updated_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("vwsetup with multiclass classification setup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "ect",
        num_classes = 3
    )
    vwtrain(test_vwmodel, data = multiclass_train_data, quiet = TRUE, passes = 4)
    vw_pk_output <- vwtest(test_vwmodel, data = multiclass_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw","multiclass_train.vw.cache", "multiclass_valid.vw.cache")

    # Command Line session
    system(
        paste0("vw --passes 4 --cache --ect 3",
               " -d ", multiclass_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    system(
        paste0("vw",
               " -t -d ", multiclass_test_data, " -i ./cl_mdl.vw -p ./cl_probs.out"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- read.table(file = "./cl_probs.out", sep = " ", header = FALSE)
    vw_cl_output <- vw_cl_output$V1
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw", "multiclass_train.vw.cache", "multiclass_valid.vw.cache", "cl_probs.out")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("vwsetup with lda setup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "lda",
        num_topics = 7,
        lda_D = 100,
        minibatch = 16,
        math_mode = "accuracy"
    )
    vwtrain(test_vwmodel, data = lda_data, quiet = TRUE, passes = 2,
            readable_model = "hashed", readable_model_path = "pk_readable_mdl.vw")
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    vw_pk_readable_mdl_checksum <- unname(tools::md5sum("pk_readable_mdl.vw"))
    file.remove("pk_mdl.vw", "pk_readable_mdl.vw", "lda_data.vw.cache")

    # Command Line session
    system(
        paste0("vw --lda 7 --lda_D 100 --math-mode accuracy --minibatch 16 --passes 2",
               " --cache_file ./lda_data.vw.cache",
               " --readable_model ./cl_readable_mdl.vw",
               " -d ", lda_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    vw_cl_readable_mdl_checksum <- unname(tools::md5sum("cl_readable_mdl.vw"))
    file.remove("cl_mdl.vw", "cl_readable_mdl.vw", "lda_data.vw.cache")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_readable_mdl_checksum, vw_cl_readable_mdl_checksum)
})

test_that("vwsetup with multicolumn output", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "oaa",
        num_classes = 3
    )
    vwtrain(test_vwmodel, data = multiclass_train_data, quiet = TRUE, passes = 2)
    vw_pk_output <- vwtest(test_vwmodel, data = multiclass_test_data, quiet = TRUE, full_probs = TRUE)
    write.table(vw_pk_output, file = "pk_probs.out", sep = " ", quote = FALSE, row.names = FALSE, col.names = FALSE)
    vw_pk_probs_checksum <- unname(tools::md5sum("pk_probs.out"))

    file.remove("pk_mdl.vw", "pk_probs.out", "multiclass_train.vw.cache")

    # Command Line session
    system(
        paste0("vw --oaa 3 --passes 2 --cache",
               " -d ", multiclass_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    system(
        paste0("vw",
               " -t -d ", multiclass_test_data, " -i ./cl_mdl.vw -p ./cl_probs.out"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_probs_checksum <- unname(tools::md5sum("cl_probs.out"))

    file.remove("cl_mdl.vw", "cl_probs.out", "multiclass_train.vw.cache")

    # Results comparison
    expect_equal(vw_pk_probs_checksum, vw_cl_probs_checksum)
})

test_that("vwsetup with df2vw conversion works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "oaa",
        num_classes = 3
    )

    data_full <- iris
    levels(data_full$Species) <- c(1, 2, 3)
    # NOTE(review): sample() without set.seed() makes this split non-deterministic — confirm intended
    ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full))

    vwtrain(test_vwmodel, data = data_full[ind_train,], quiet = TRUE,
            targets = "Species")
    vw_pk_output <- vwtest(test_vwmodel, data = data_full[-ind_train,], quiet = TRUE,
                           targets = "Species")
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session

    df2vw(data = data_full[ind_train,], file_path = "cl_train.vw",
          targets = "Species")
    df2vw(data = data_full[-ind_train,], file_path = "cl_test.vw",
          targets = "Species")

    system(
        paste0("vw --oaa 3 -d ./cl_train.vw -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ./cl_test.vw -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw", "cl_train.vw", "cl_test.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum, tolerance=1e-7)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("print.vw outputs correct results to console", {
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "boosting",
        num_learners = 10
    )
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)

    capture.output(test_vwmodel, file = "pk_print.out")

    vw_pk_print_checksum <- unname(tools::md5sum("pk_print.out"))
    ref_print_file <- system.file("extdata", "ref_print.out", package = "rvw")
    vw_ref_print_checksum <- unname(tools::md5sum(ref_print_file))

    file.remove("pk_mdl.vw", "pk_print.out")

    expect_equal(vw_pk_print_checksum, vw_ref_print_checksum)
})

# Return back
setwd(curr_dir)
--------------------------------------------------------------------------------
/tests/testthat/test-err.R:
--------------------------------------------------------------------------------
context("vw error messages")
library(rvw)

# A plain list mimicking the shape of a vw model, but without class "vw",
# used to trigger the class-validation errors below.
fake_vwmodel <- list(params = list(algorithm = "sgd",
                                   general_params = list(),
                                   feature_params = list(),
                                   optimization_params = list(),
                                   options = list()

),
dir = "../my_tmp",
model = "mdl.vw",
params_str = paste0(""),
is_cl = FALSE,
data_md5sum = list(train = "",
                   test = ""),
train_file = "",
eval = list(
    train=list(),
    test=list()
),
parser_opts=NA
)


test_that(".check_parameters raises correct errors", {
    # Wrong argument names

    # General params
    expect_error(
        vwsetup(general_params = list(wrong_param_1=10, ting_size=10)),
        "Wrong argument names: wrong_param_1, ting_size",
        fixed = TRUE
    )
    # Feature params
    expect_error(
        vwsetup(feature_params = list(bit_precision=25, featurelimit=100)),
        "Wrong argument names: featurelimit",
        fixed = TRUE
    )
    # Optimization params
    expect_error(
        vwsetup(optimization_params = list(initial_p=0.1, l1=1E-7)),
        "Wrong argument names: initial_p",
        fixed = TRUE
    )
    # Option
    expect_error(
        vwsetup(option = "nn", num_hidden = 10, inpas = TRUE),
        "Wrong argument names: inpas",
        fixed = TRUE
    )


    # Wrong argument values

    # General params
    expect_error(
        vwsetup(general_params = list(random_seed="10", ring_size=10)),
        "Wrong argument values: random_seed",
        fixed = TRUE
    )
    # Feature params
    # This test should be changed in future, because we want to accept both real and integer numbers
    expect_error(
        vwsetup(feature_params = list(bit_precision=25L, noconstant="foo")),
        "Wrong argument values: bit_precision, noconstant",
        fixed = TRUE
    )
    # Optimization params
    expect_error(
        vwsetup(optimization_params = list(no_bias_regularization="on", feature_mask=1E-7)),
        "Wrong argument values: feature_mask",
        fixed = TRUE
    )
    # Option
    expect_error(
        vwsetup(option = "nn", num_hidden = "10", inpass = "TRUE"),
        "Wrong argument values: num_hidden, inpass",
        fixed = TRUE
    )


    # Missing first argument value in option parameters
    expect_error(
        vwsetup(option = "nn"),
        "Missing value for argument: num_hidden",
        fixed = TRUE
    )
})

test_that("vwsetup raises correct errors", {

    # Whitespace characters in dir path
    expect_error(
        vwsetup(dir = "./some folder/"),
        "Whitespace characters are not allowed in `dir` path",
        fixed = TRUE
    )

    # Whitespace characters in model path
    expect_error(
        vwsetup(model = "./some folder/mdl.vw"),
        "Whitespace characters are not allowed in `model` path",
        fixed = TRUE
    )

    # Forbidden flags in cmd line parameters
    expect_error(
        vwsetup(params_str = "--passes 10"),
        "Following cmd line parameters are defined in other functions:",
        fixed = TRUE
    )
})

test_that("add_option raises correct errors", {

    # vwmodel should be of class vw
    expect_error(
        add_option(fake_vwmodel, option = "nn", num_hidden = 10),
        "vwmodel should be of class vw",
        fixed = TRUE
    )

    # add_option can't be used with direct cmd line parameters
    test_vwmodel <- vwsetup(params_str = "--bit_precision 25")

    expect_error(
        add_option(test_vwmodel, option = "nn", num_hidden = 10),
        "add_option can't be used when cmd line parameters are used",
        fixed = TRUE
    )

    # Overwrite option
    test_vwmodel <- vwsetup(option = "nn", num_hidden = 5)

    expect_error(
        add_option(test_vwmodel, option = "nn", num_hidden = 10),
        "Trying to overwrite option",
        fixed = TRUE
    )
})

test_that("vwparams raises correct errors", {

    # vwmodel should be of class vw
    expect_error(
        vwparams(fake_vwmodel, name = "bit_precision"),
        "vwmodel should be of class vw",
        fixed = TRUE
    )

    # add_option can't be used with direct cmd line parameters
    test_vwmodel <- vwsetup(params_str = "--bit_precision 25")

    expect_error(
        vwparams(test_vwmodel, name = "bit_precision"),
        "vwparams can't be used when cmd line parameters are used",
        fixed = TRUE
    )
})
--------------------------------------------------------------------------------
/tests/testthat/test-parser.R:
--------------------------------------------------------------------------------
context("Check df2vw parser")
library(rvw)

# Switch to temporary directory
curr_dir <- getwd()
setwd(tempdir())

test_that("df2vw correctly parses data", {
    df2vw_path <- "df2vw.vw"
    ref_path <- "ref.vw"

    test_df <- data.frame(
        num_v1 = c(0.00005, 0.333333334, 10, 100000.314),
        fact_v2 = factor(c("a", "a", "b", "c")),
        text_v3 = rep("Et harum| (quid)em: rerum facilis!", 4),
        text_v4 = rep(" Et harum| (quid)em: rerum facilis!", 4),
        regular_label = c(1, 1.2, 4, 5.4),
        base = c(1, 1, 1, 1),
        multiline_label = c(0, 1, 1, 0),
        multilabel_1 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_2 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_3 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_4 = c(0.25, 0.25, 0.25, 0.25),
        na_multilabel_1 = c(NA,
0.25, NA, 0.25), 25 | na_multilabel_2 = c(NA, NA, 0.25, 0.25), 26 | na_multilabel_3 = c(NA, NA, NA, NA), 27 | tag = c("ex1", "ex2", "ex3", "ex4"), 28 | importance = c("10", "0.5", "0.5", "4") 29 | ) 30 | 31 | ref_df = data.frame( 32 | features = c(" |NS1 num_v1:5e-05 fact_v2^a |NS2 fact_v2^a Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 33 | " |NS1 num_v1:0.333333334 fact_v2^a |NS2 fact_v2^a Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 34 | " |NS1 num_v1:10 fact_v2^b |NS2 fact_v2^b Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 35 | " |NS1 num_v1:100000.314 fact_v2^c |NS2 fact_v2^c Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!"), 36 | regular_labels = c("1 10 1 'ex1", "1.2 0.5 1 'ex2", "4 0.5 1 'ex3", "5.4 4 1 'ex4"), 37 | csoaa_labels = c("1:0.25 2:0.25 3:0.25 4:0.25 'ex1", "1:0.25 2:0.25 3:0.25 4:0.25 'ex2", 38 | "1:0.25 2:0.25 3:0.25 4:0.25 'ex3", "1:0.25 2:0.25 3:0.25 4:0.25 'ex4"), 39 | cb_labels = c("1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex1", 40 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex2", 41 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex3", 42 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex4"), 43 | na_labels = c(" 'ex1", 44 | "1:0.25:0.25 'ex2", 45 | "2:0.25:0.25 'ex3", 46 | "1:0.25:0.25 2:0.25:0.25 'ex4"), 47 | multiline_labels = c("1:0", "2:1", "1:1", "2:0") 48 | ) 49 | 50 | # Regular labels 51 | cat("Regular labels\n") 52 | ref_file <- file(ref_path,"w") 53 | apply(ref_df, MARGIN = 1, function(x) { 54 | writeLines(text = paste0(x[["regular_labels"]], x[["features"]]), con = ref_file) 55 | }) 56 | close(ref_file) 57 | regular_ref_checksum <- unname(tools::md5sum(ref_path)) 58 | 59 | df2vw(data = test_df, file_path = df2vw_path, 60 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 61 | NS2 = c("fact_v2", "text_v3"), 62 | NS3 = c("text_v4")), 63 | keep_space = "text_v3", base = "base", 64 | fixed = 
"text_v4", 65 | targets = "regular_label", tag = "tag", weight = "importance") 66 | regular_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 67 | 68 | # CSOAA labels 69 | cat("CSOAA labels\n") 70 | ref_file <- file(ref_path,"w") 71 | apply(ref_df, MARGIN = 1, function(x) { 72 | writeLines(text = paste0(x[["csoaa_labels"]], x[["features"]]), con = ref_file) 73 | }) 74 | close(ref_file) 75 | csoaa_ref_checksum <- unname(tools::md5sum(ref_path)) 76 | 77 | df2vw(data = test_df, file_path = df2vw_path, 78 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 79 | NS2 = c("fact_v2", "text_v3"), 80 | NS3 = c("text_v4")), 81 | keep_space = "text_v3", 82 | fixed = "text_v4", 83 | targets = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 84 | tag = "tag", weight = "importance") 85 | csoaa_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 86 | 87 | # Context Bandit labels 88 | cat("Context Bandit labels\n") 89 | ref_file <- file(ref_path,"w") 90 | apply(ref_df, MARGIN = 1, function(x) { 91 | writeLines(text = paste0(x[["cb_labels"]], x[["features"]]), con = ref_file) 92 | }) 93 | close(ref_file) 94 | cb_ref_checksum <- unname(tools::md5sum(ref_path)) 95 | 96 | df2vw(data = test_df, file_path = df2vw_path, 97 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 98 | NS2 = c("fact_v2", "text_v3"), 99 | NS3 = c("text_v4")), 100 | keep_space = "text_v3", 101 | fixed = "text_v4", 102 | targets = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 103 | probabilities = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 104 | tag = "tag", weight = "importance") 105 | cb_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 106 | 107 | # CSOAA when not all labels are allowed 108 | cat("CSOAA when not all labels are allowed\n") 109 | ref_file <- file(ref_path,"w") 110 | apply(ref_df, MARGIN = 1, function(x) { 111 | writeLines(text = paste0(x[["na_labels"]], x[["features"]]), con = ref_file) 112 | }) 113 | close(ref_file) 114 | 
na_ref_checksum <- unname(tools::md5sum(ref_path)) 115 | 116 | df2vw(data = test_df, file_path = df2vw_path, 117 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 118 | NS2 = c("fact_v2", "text_v3"), 119 | NS3 = c("text_v4")), 120 | keep_space = "text_v3", 121 | fixed = "text_v4", 122 | targets = c("na_multilabel_1", "na_multilabel_2", "na_multilabel_3"), 123 | probabilities = c("multilabel_1", "multilabel_2", "multilabel_3"), 124 | tag = "tag", weight = "importance") 125 | na_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 126 | 127 | # Multiline CSOAA 128 | cat("Multiline CSOAA\n") 129 | ref_file <- file(ref_path,"w") 130 | ref_df$lines <- apply(ref_df, MARGIN = 1, function(x) { 131 | paste0(x[["multiline_labels"]], x[["features"]]) 132 | }) 133 | writeLines(text = paste0(ref_df$lines, c("", "\n"), collapse = "\n"), con = ref_file) 134 | close(ref_file) 135 | mult_ref_checksum <- unname(tools::md5sum(ref_path)) 136 | 137 | df2vw(data = test_df, file_path = df2vw_path, 138 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 139 | NS2 = c("fact_v2", "text_v3"), 140 | NS3 = c("text_v4")), 141 | keep_space = "text_v3", 142 | fixed = "text_v4", 143 | targets = "multiline_label", 144 | multiline = 2) 145 | mult_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 146 | 147 | 148 | file.remove(ref_path, df2vw_path) 149 | 150 | # Results comparison 151 | expect_equal(regular_df2vw_checksum, regular_ref_checksum) 152 | expect_equal(csoaa_df2vw_checksum, csoaa_ref_checksum) 153 | expect_equal(cb_df2vw_checksum, cb_ref_checksum) 154 | expect_equal(na_df2vw_checksum, na_ref_checksum) 155 | expect_equal(mult_df2vw_checksum, mult_ref_checksum) 156 | 157 | }) 158 | 159 | # Return back 160 | setwd(curr_dir) 161 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | context("Check auxiliary functionality") 2 | library(rvw) 3 | 4 | # 
# --- tests/testthat/test-utils.R ---

context("Check auxiliary functionality")
library(rvw)

# Run inside a temporary directory so model files never pollute the
# package source tree.
curr_dir <- getwd()
setwd(tempdir())

# Example data shipped with the package.
ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw")
ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw")

test_that("vwtrain and vwtest output correct readable model", {
  # Package session: train/test through the rvw wrappers and checksum the
  # readable model files they produce.
  test_vwmodel <- vwsetup(dir = "./", model = "pk_mdl.vw")
  # vwtrain
  vwtrain(test_vwmodel, data = ext_train_data, readable_model = "hashed", quiet = TRUE)
  vw_pk_train_hashed_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  test_vwmodel <- vwsetup(dir = "./", model = "pk_mdl.vw")
  vwtrain(test_vwmodel, data = ext_train_data, readable_model = "inverted", quiet = TRUE)
  vw_pk_train_inverted_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  # vwtest
  vwtest(test_vwmodel, data = ext_test_data, readable_model = "hashed", quiet = TRUE)
  vw_pk_test_hashed_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  vwtest(test_vwmodel, data = ext_test_data, readable_model = "inverted", quiet = TRUE)
  vw_pk_test_inverted_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))

  file.remove("pk_mdl.vw", "readable_pk_mdl.vw")

  # Command line session: the same runs through the vw binary directly.
  # train
  system(
    paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw --readable_model ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_train_hashed_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  system(
    paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw --invert_hash ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_train_inverted_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  # test
  system(
    paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw --readable_model ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_test_hashed_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  system(
    paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw --invert_hash ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_test_inverted_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  file.remove("cl_mdl.vw", "readable_cl_mdl.vw")

  # Results comparison: package output must match command line output.
  expect_equal(vw_pk_train_hashed_mdl_checksum, vw_cl_train_hashed_mdl_checksum)
  expect_equal(vw_pk_test_hashed_mdl_checksum, vw_cl_test_hashed_mdl_checksum)
  expect_equal(vw_pk_train_inverted_mdl_checksum, vw_cl_train_inverted_mdl_checksum)
  expect_equal(vw_pk_test_inverted_mdl_checksum, vw_cl_test_inverted_mdl_checksum)
})

test_that("vwaudit outputs correct audit data.frame", {
  # Expected audit output for a model trained on the bundled binary data.
  ref_df <- data.frame(Names = c("A^carat", "A^depth", "A^table", "A^price", "A^x",
                                 "A^z", "A^cut_Very_Good", "A^color_G", "A^clarity_SI1",
                                 "Constant", "A^cut_Premium", "A^color_I", "A^clarity_SI2",
                                 "A^cut_Good", "A^color_E", "A^clarity_VS2", "A^cut_Ideal",
                                 "A^color_D", "A^color_H", "A^color_F", "A^clarity_IF",
                                 "A^clarity_VS1", "A^cut_Fair", "A^color_J", "A^clarity_VVS2",
                                 "A^clarity_VVS1", "A^clarity_I1"),
                       Hashes = c(161523, 255131, 191106, 174484, 157305, 71870,
                                  197774, 147043, 202990, 116060, 179903, 131053,
                                  102042, 1176, 164360, 113391, 116290, 58595, 87066,
                                  240073, 1556, 114685, 151473, 32836, 101424, 80982, 141904),
                       V1 = c(-0.345847010612488, 0.00200122990645468,
                              0.00143359997309744, -5.60191983822733e-05,
                              -0.0254513993859291, -0.0392897985875607, 0.159768000245094,
                              0.130854994058609, 0.0361801981925964, 0.119374997913837,
                              0.0958541035652161, -0.0784583017230034, -0.191651001572609,
                              0.144849002361298, 0.349283009767532, 0.0694333985447884,
                              0.00745435990393162, -0.0727915987372398, -0.0811441987752914,
                              0.273036986589432, 0.126379996538162, 0.171755000948906,
                              -0.108182996511459, -0.328087002038956, 0.246926993131638,
                              0.451092004776001, -0.148938998579979))

  test_vwmodel <- vwsetup()
  vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
  aud_df <- vwaudit(test_vwmodel, quiet = TRUE)

  expect_equal(aud_df, ref_df)
})

test_that("vwparams correctly returns and sets parameter values", {

  test_vwmodel <- vwsetup(general_params = list(link = "identity", holdout_off = FALSE),
                          feature_params = list(bit_precision = 10),
                          option = "nn", num_hidden = 5)

  # Character value
  vwparams(test_vwmodel, name = "link") <- "logistic"
  expect_equal(vwparams(test_vwmodel, name = "link"), "logistic")

  # Numerical value
  vwparams(test_vwmodel, name = "bit_precision") <- 25
  expect_equal(vwparams(test_vwmodel, name = "bit_precision"), 25)

  # Logical value
  vwparams(test_vwmodel, name = "holdout_off") <- TRUE
  expect_equal(vwparams(test_vwmodel, name = "holdout_off"), TRUE)

  # Option value
  vwparams(test_vwmodel, name = "num_hidden") <- 10
  expect_equal(vwparams(test_vwmodel, name = "num_hidden"), 10)
})

# Return to the original working directory
setwd(curr_dir)
# --- tests/testthat/test-vwsetup.R ---

context("vwsetup")
library(rvw)

# Reference model object: the exact structure vwsetup() is expected to
# return for an empty setup. The mode-specific tests below clone and
# tweak this baseline.
test_model <- list(params = list(algorithm = "sgd",
                                 general_params = list(random_seed = 0,
                                                       ring_size = NA_real_,
                                                       holdout_off = FALSE,
                                                       holdout_period = 10,
                                                       holdout_after = 0,
                                                       early_terminate = 3,
                                                       loss_function = NA_character_,
                                                       link = NA_character_,
                                                       quantile_tau = 0.5),
                                 feature_params = list(bit_precision = 18,
                                                       quadratic = NA_character_,
                                                       cubic = NA_character_,
                                                       interactions = NA_character_,
                                                       permutations = FALSE,
                                                       leave_duplicate_interactions = FALSE,
                                                       noconstant = FALSE,
                                                       feature_limit = NA_character_,
                                                       ngram = NA_character_,
                                                       skips = NA_character_,
                                                       hash = NA_character_,
                                                       affix = NA_character_,
                                                       spelling = NA_character_,
                                                       interact = NA_character_),
                                 optimization_params = list(adaptive = TRUE,
                                                            normalized = TRUE,
                                                            invariant = TRUE,
                                                            adax = FALSE,
                                                            sparse_l2 = 0,
                                                            l1_state = 0,
                                                            l2_state = 1,
                                                            learning_rate = 0.5,
                                                            initial_pass_length = NA_real_,
                                                            l1 = 0,
                                                            l2 = 0,
                                                            no_bias_regularization = NA_character_,
                                                            feature_mask = NA_character_,
                                                            decay_learning_rate = 1,
                                                            initial_t = 0,
                                                            power_t = 0.5,
                                                            initial_weight = 0,
                                                            random_weights = "off",
                                                            normal_weights = "off",
                                                            truncated_normal_weights = "off",
                                                            sparse_weights = FALSE,
                                                            input_feature_regularizer = NA_character_),
                                 options = list()
                                 ),
                   dir = "../my_tmp",
                   model = "mdl.vw",
                   params_str = "",
                   is_cl = FALSE,
                   data_md5sum = list(train = "",
                                      test = ""),
                   train_file = "",
                   eval = list(
                     train = list(
                       num_examples = NA_real_,
                       weighted_example_sum = NA_real_,
                       weighted_label_sum = NA_real_,
                       avg_loss = NA_real_,
                       avg_multiclass_log_loss = NA_real_,
                       best_const = NA_real_,
                       best_const_loss = NA_real_,
                       total_feature = NA_real_
                     ),
                     test = list(
                       num_examples = NA_real_,
                       weighted_example_sum = NA_real_,
                       weighted_label_sum = NA_real_,
                       avg_loss = NA_real_,
                       avg_multiclass_log_loss = NA_real_,
                       best_const = NA_real_,
                       best_const_loss = NA_real_,
                       total_feature = NA_real_
                     )
                   ),
                   parser_opts = NA
)
class(test_model) <- "vw"

test_that("vwsetup correctly setup model with different learning modes", {
  # Empty setup
  expect_equal(vwsetup(dir = "../my_tmp/", model = "mdl.vw"),
               test_model)

  # CMD line setup: raw parameter string bypasses structured params.
  cmd_test_model <- test_model
  cmd_test_model$params_str <- "-b 25 --link glm"
  cmd_test_model$is_cl <- TRUE
  cmd_test_model$params$general_params <- list()
  cmd_test_model$params$feature_params <- list()
  cmd_test_model$params$optimization_params <- list()
  cmd_test_model$params$algorithm <- NA
  expect_equal(vwsetup(dir = "../my_tmp/", model = "mdl.vw", params_str = "-b 25 --link glm"),
               cmd_test_model)

  # Reference test model for nn mode
  nn_test_model <- test_model
  nn_test_model$params$options <- list(nn = list(num_hidden = 3,
                                                 inpass = FALSE,
                                                 multitask = FALSE,
                                                 dropout = FALSE,
                                                 meanfield = FALSE))
  nn_test_model$params_str <- "--nn 3"
  expect_equal(
    vwsetup(dir = "../my_tmp/", model = "mdl.vw", option = "nn", num_hidden = 3),
    nn_test_model
  )

  # Reference test model for lda mode
  lda_test_model <- test_model
  lda_test_model$params$options <- list(lda = list(num_topics = 5,
                                                   lda_alpha = 0.100000001,
                                                   lda_rho = 0.100000001,
                                                   lda_D = 10000,
                                                   lda_epsilon = 0.00100000005,
                                                   math_mode = NA_character_,
                                                   minibatch = 1,
                                                   metrics = 0))
  lda_test_model$params_str <- "--lda 5"

  expect_equal(
    vwsetup(dir = "../my_tmp/", model = "mdl.vw", option = "lda", num_topics = 5),
    lda_test_model
  )

  # Reference test model with custom parameters
  custom_test_model <- test_model
  custom_test_model$params$optimization_params$adaptive <- FALSE
  custom_test_model$params$options <- list(binary = list(binary = TRUE))
  custom_test_model$params_str <- "--binary"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    optimization_params = list(adaptive = FALSE),
    option = "binary"
  )
  expect_equal(test_vwmodel, custom_test_model)

  # Reference test model with Experience Replay
  replay_test_model <- test_model
  replay_test_model$params$options <- list(replay = list(level = "m",
                                                         buffer = 200,
                                                         count = 1))
  replay_test_model$params_str <- "--replay_m 200 --replay_m_count 1"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    option = "replay",
    level = "m",
    buffer = 200
  )
  expect_equal(test_vwmodel, replay_test_model)

  # Reference test model with Contextual Bandit Exploration with Action
  # Dependent Features
  cb_explore_test_model <- test_model
  cb_explore_test_model$params$options <- list(cb_explore = list(num_actions = 0,
                                                                 explore_type = "bag",
                                                                 explore_arg = 10,
                                                                 psi = 1,
                                                                 nounif = FALSE,
                                                                 mellowness = 0.1,
                                                                 greedify = FALSE,
                                                                 lambda = -1,
                                                                 cb_min_cost = 0,
                                                                 cb_max_cost = 1,
                                                                 first_only = FALSE))
  cb_explore_test_model$params_str <- "--cb_explore_adf --bag 10"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    option = "cb_explore",
    num_actions = 0,
    explore_type = "bag",
    explore_arg = 10
  )
  expect_equal(test_vwmodel, cb_explore_test_model)
})
# --- tools/r_configure.R ---
# Locate the installed Vowpal Wabbit headers and generate src/Makevars
# from src/Makevars.in, prepending the discovered include path.

# For VW v8.6.1 (full header list, kept for reference):
# headers <- c('action_score.h', 'allreduce.h', 'array_parameters_dense.h',
#              'array_parameters.h', 'cb_explore.h', 'cb.h', 'comp_io.h',
#              'config.h', 'constant.h', 'cost_sensitive.h', 'crossplat_compat.h',
#              'error_reporting.h', 'example_predict.h', 'example.h', 'ezexample.h',
#              'feature_group.h', 'floatbits.h', 'global_data.h', 'hash.h',
#              'io_buf.h', 'label_parser.h', 'learner.h', 'loss_functions.h',
#              'memory.h', 'multiclass.h', 'multilabel.h', 'no_label.h',
#              'parse_example.h', 'parse_primitives.h', 'parser_helper.h',
#              'parser.h', 'simple_label.h', 'v_array.h', 'v_hashmap.h',
#              'vw_exception.h', 'vw_validate.h', 'vw.h', 'vwdll.h')

# For VW v8.6.1 without missing headers: every one of these must exist in
# a candidate directory for it to be accepted.
headers <- c('action_score.h', 'allreduce.h', 'cb_explore.h', 'cb.h', 'comp_io.h',
             'config.h', 'constant.h', 'cost_sensitive.h', 'crossplat_compat.h',
             'example.h', 'ezexample.h', 'feature_group.h', 'floatbits.h',
             'global_data.h', 'io_buf.h', 'label_parser.h', 'learner.h',
             'loss_functions.h', 'memory.h', 'multiclass.h', 'multilabel.h',
             'parse_example.h', 'parse_primitives.h', 'parser.h', 'simple_label.h',
             'v_array.h', 'v_hashmap.h', 'vw_exception.h', 'vw_validate.h', 'vw.h', 'vwdll.h')

# Candidate install locations: every prefix/include/suffix combination.
path_prefix_list <- c("/usr/local", "/usr", "/opt")
path_suffix_list <- c("vw", "vowpalwabbit", "")

search_path_list <- unlist(lapply(path_suffix_list, FUN = function(x) file.path(path_prefix_list, "include", x)))

# Scan the candidates; valid_path stays NULL unless one contains every header.
valid_path <- NULL
for (search_path in search_path_list) {
  headers_in_path <- headers %in% list.files(search_path)

  if (all(headers_in_path)) {
    # All headers are found
    valid_path <- search_path
    break
  } else if (any(headers_in_path)) {
    # Some (but not all) headers are found: report what is missing.
    # NOTE(review): this stops the search at the first partial match
    # instead of trying the remaining candidates -- confirm intended.
    cat(paste0("Missing headers in ", search_path, "\n"))
    cat(paste0(headers[!headers_in_path], collapse = ", "), "\n")
    break
  }
  # No headers found here: keep looking.
}

if (is.null(valid_path)) {
  stop("Can't find the proper 'include/vowpalwabbit' directory containing Vowpal Wabbit header files.", call. = FALSE)
}

# Valid path found
cat(paste0("Valid path: ", valid_path, "\n"))

if (!file.exists(file.path("src", "Makevars.in"))) {
  stop("No 'Makevars.in' file", call. = FALSE)
}

# Emit src/Makevars: the include line first, then Makevars.in verbatim.
makevars_in_lines <- readLines(file.path("src", "Makevars.in"))
include_line <- paste0("PKG_CPPFLAGS = -Iextra/ -I", valid_path)

makevars_out <- file(file.path("src", "Makevars"), "w")
writeLines(c(include_line, makevars_in_lines), con = makevars_out)
close(makevars_out)