├── .Rbuildignore ├── .gitignore ├── .travis.yml ├── ChangeLog ├── DESCRIPTION ├── NAMESPACE ├── R ├── RcppExports.R ├── functions.R ├── init.R └── rhelpers.R ├── README.md ├── cleanup ├── configure ├── demo ├── 00Index ├── rvw_bin.R ├── rvw_df.R ├── rvw_lda.R └── rvw_overview.R ├── docker ├── ci │ └── Dockerfile └── run │ └── Dockerfile ├── inst └── extdata │ ├── binary_train.vw │ ├── binary_valid.vw │ ├── lda_data.vw │ ├── multiclass_train.vw │ ├── multiclass_valid.vw │ └── ref_print.out ├── man ├── add_option.Rd ├── df2vw.Rd ├── print.vw.Rd ├── rvwgsoc-package.Rd ├── vwaudit.Rd ├── vwparams.Rd ├── vwsetup.Rd ├── vwtest.Rd └── vwtrain.Rd ├── rvw.Rproj ├── src ├── Makevars.in ├── RcppExports.cpp ├── extra │ ├── array_parameters.h │ ├── array_parameters_dense.h │ ├── error_reporting.h │ ├── example_predict.h │ ├── hash.h │ ├── no_label.h │ └── parser_helper.h ├── helpers.cpp ├── helpers.h ├── md5.c ├── md5.h └── rvw.cpp ├── tests ├── testthat.R └── testthat │ ├── test-cmdline.R │ ├── test-err.R │ ├── test-parser.R │ ├── test-utils.R │ └── test-vwsetup.R ├── tools └── r_configure.R └── vignettes └── introduction.Rmd /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | docker 4 | .travis.yml 5 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | src/*.o 6 | src/*.so 7 | src/*.dll 8 | src/Makevars 9 | test_dir/* 10 | inst/doc 11 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | # Run Travis CI for R via Docker 2 | # 3 | # Made by Dirk Eddelbuettel in August 2018 and released under GPL (>=2) 4 | 5 | os: linux 6 | dist: trusty 7 | sudo: required 8 | services: docker 9 | 10 | 
env: 11 | global: 12 | - DOCKER_OPTS="--rm -ti -v $(pwd):/mnt -w /mnt" 13 | DOCKER_CNTR="rvowpalwabbit/ci" 14 | R_BLD_CHK_OPTS="--no-build-vignettes --no-manual" 15 | 16 | before_install: 17 | - docker pull ${DOCKER_CNTR} 18 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} r -p -e 'sessionInfo()' 19 | 20 | install: 21 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} R CMD build ${R_BLD_CHK_OPTS} . 22 | 23 | script: 24 | - docker run ${DOCKER_OPTS} ${DOCKER_CNTR} R CMD check ${R_BLD_CHK_OPTS} rvw*.tar.gz 25 | 26 | after_failure: 27 | - ./run.sh dump_logs 28 | 29 | notifications: 30 | email: 31 | on_success: change 32 | on_failure: change 33 | 34 | -------------------------------------------------------------------------------- /ChangeLog: -------------------------------------------------------------------------------- 1 | 2018-10-13 Dirk Eddelbuettel 2 | 3 | * docker/ci/Dockerfile: Install mltools (and r-cran-matrix) 4 | 5 | 2018-09-12 Dirk Eddelbuettel 6 | 7 | * README.md: Added brief Docker documentation 8 | 9 | * .travis.yml: Update container name 10 | 11 | 2018-09-11 Dirk Eddelbuettel 12 | 13 | * docker/run/Dockerfile: Added for deployment 14 | 15 | 2018-09-10 Dirk Eddelbuettel 16 | 17 | * .travis.yml: Updated container reference 18 | 19 | 2018-09-09 Dirk Eddelbuettel 20 | 21 | * docker/ci/Dockerfile: Add Dockerfile for Travis 22 | * .travis.yml: Use Docker container in tests 23 | 24 | ---- a lot omitted here, history from rvw-legacy below --- 25 | 26 | 2017-03-19 Dirk Eddelbuettel 27 | 28 | * DESCRIPTION (Version, Date): Roll minor version 29 | 30 | * R/dt2vw.R (dt2vw): Make a copy of the dataset; 31 | test for NA values in input data set 32 | 33 | 2017-03-12 Selim Raboudi 34 | 35 | * R/dt2vw.R (dt2vw): Allow for escaped variable names 36 | 37 | 2016-09-02 Dirk Eddelbuettel 38 | 39 | * demo/vw_example_4.R: Added Rborist 40 | 41 | 2016-08-30 Dirk Eddelbuettel 42 | 43 | * demo/vw_example_4.R: Added ranger 44 | 45 | 2016-08-29 Dirk Eddelbuettel 46 | 47 | * README.md: 
Added 48 | 49 | 2016-08-23 Dirk Eddelbuettel 50 | 51 | * demo/vw_example_4.R: Rewritten/extended, now with ctree and gbm 52 | 53 | * demo/vw_example_5.R: Plot predicted vs actual for regression example 54 | 55 | * R/vw.R (vw): Set an AUC fallback value 56 | 57 | 2016-08-22 Dirk Eddelbuettel 58 | 59 | * DESCRIPTION (License): Use dual license with GPL (>= 2) for my code, 60 | and BSD 3-clause for existing code 61 | 62 | * NAMESPACE: All importing now via importFrom() 63 | 64 | * demo/vw_example_4.R: Add library(pROC), add a legend to ROC plot 65 | * demo/00Index: Add vw_example4 66 | 67 | * R/plot.R (plotDensity): add utils::globalVariables() for R CMD check 68 | 69 | * R/dt2vw.R (dt2vw): If dependent variable is numeric, do not 70 | check factor levels 71 | 72 | * demo/vw_example_5.R: New regression example 73 | 74 | 2016-08-21 Dirk Eddelbuettel 75 | 76 | * DESCRIPTION (Suggests): Added caret (confusion matrix), ggplot2 77 | and earth (etitanic data) 78 | 79 | * R/vw.R (vw): Steps towards richer return objects, more arguments 80 | documented, all temp files now in current directory, new option to 81 | keep files 82 | * man/vw.Rd: Updated accordingly 83 | 84 | * R/plot.R: New simple density plot function 85 | * man/plotDensity.Rd: Documentation 86 | 87 | * demo/vw_example_4.R: Now plots ROC curve with (up to) three models 88 | 89 | 2016-08-20 Dirk Eddelbuettel 90 | 91 | * R/init.R (.getVW,.getPerf): Helper accessors for vw and perf 92 | binaries 93 | 94 | * R/vw.R (vw): Use vw and perf binaries stored in package 95 | environment, output simplification 96 | 97 | 2016-08-19 Dirk Eddelbuettel 98 | 99 | * R/vw.R (vw): Test for vw and perf binaries 100 | 101 | 2016-08-18 Dirk Eddelbuettel 102 | 103 | * R/init.R (.onAttach): Startup code to look for 'vw' and 'perf' 104 | 105 | * R/vw.R: Reindented and other whitespace changes, now uses '<-' 106 | assignments and TRUE/FALSE not T/F, removed @export/@import 107 | * R/dt2vw.R: Ditto 108 | 109 | 2016-08-17 Dirk Eddelbuettel 110 
| 111 | * DESCRIPTION (License): Changed to BSD_3_clause which is what 112 | vowpal wabbit itself uses 113 | * LICENSE: Ditto 114 | 115 | * demo/vw_example.R: Updated (old) example, no longer uses system, 116 | files in R's temp directory 117 | 118 | * demo/vw_example_2.R: Updated (old) example_2 119 | 120 | * inst/examples/bostonHousing/: New (simple) regression example 121 | 122 | 2016-08-16 Dirk Eddelbuettel 123 | 124 | * DESCRIPTION (Version): 0.1.1 now passes R CMD check 125 | 126 | * DESCRIPTION: Rewritten / adapted to current standards 127 | * NAMESPACE: Updated 128 | * LICENSE: Added per BSD_2_clause requirements 129 | * R/vw.R: Roxygen documentation update, minimal changes 130 | * R/dt2vw.R: Ditto 131 | * man/vw.Rd: Updated 132 | * man/dt2vw.Rd: Ditto 133 | 134 | * DESCRIPTION (Version): 0.1.0 committed 'as is' as initial commit 135 | 136 | 137 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: rvw 2 | Type: Package 3 | Title: R Interface for Vowpal Wabbit 4 | Version: 0.6.0 5 | Date: 2018-08-20 6 | Author: Ivan Pavlov, Dirk Eddelbuettel, James J Balamuta 7 | Maintainer: Ivan Pavlov 8 | Description: R interface for Vowpal Wabbit using 'Rcpp' and the 'Vowpal Wabbit' library 'libvw' for the Google Summer of Code 2018. 
9 | License: GPL (>= 2) 10 | Imports: Rcpp (>= 0.12.16), tools, RApiSerialize, data.table 11 | LinkingTo: Rcpp, RApiSerialize 12 | Suggests: testthat, yaml, utils, 13 | knitr, 14 | rmarkdown, 15 | mltools, 16 | magrittr 17 | SystemRequirements: C++11, Vowpal Wabbit library 18 | RoxygenNote: 6.1.0 19 | VignetteBuilder: knitr 20 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | useDynLib(rvw, .registration=TRUE) 2 | exportPattern("^[[:alpha:]]+") 3 | importFrom(Rcpp, evalCpp) 4 | importFrom(stats, setNames) 5 | import(data.table) 6 | import(RApiSerialize) 7 | importFrom(tools, md5sum) 8 | S3method(print, vw) 9 | S3method(predict, vw) 10 | -------------------------------------------------------------------------------- /R/RcppExports.R: -------------------------------------------------------------------------------- 1 | # Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | # Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | .get_vw_version <- function() { 5 | .Call(`_rvw_get_vw_version`) 6 | } 7 | 8 | #'Train Vowpal Wabbit model 9 | #' 10 | #'vwtrain is an interface to train VW model from \code{\link{vwsetup}} 11 | #' 12 | #'@param vwmodel [vw] Model of vw class to train 13 | #'@param data [string or data.frame] Path to training data in .vw plain text format or data.frame. 14 | #'If \code{[data.frame]} then will be parsed using \code{df2vw} function. 15 | #'@param readable_model [string] Print trained model in human readable format ("hashed") 16 | #'and also with human readable features ("inverted") 17 | #'@param readable_model_path [string] Path to file where to save readable model. 18 | #'@param quiet [logical] Do not print anything to the console 19 | #'@param update_model [logical] Update an existing model, when training with new data. \code{FALSE} by default. 
20 | #'@param passes [int] Number of times the algorithm will cycle over the data (epochs). 21 | #'@param cache [bool] Use a cache for a data file. 22 | #'@param progress [int/real] Progress update frequency. int: additive, real: multiplicative 23 | #'@param namespaces [list or yaml file] For \code{df2vw}. Name of each namespace and 24 | #' each variable for each namespace can be an R list, or a YAML 25 | #' file example namespace with the IRIS database: namespaces = 26 | #' list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 27 | #' 'Petal.Width') this creates 2 namespaces (sepal 28 | #' and petal) containing the features defined by elements of these lists. 29 | #'@param keep_space [string vector] For \code{df2vw}. Keep spaces for these features 30 | #'Example:"FERRARI 4Si" 31 | #'With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 32 | #'Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature 33 | #'@param fixed [string vector] fixed parsing for these features 34 | #'Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 35 | #'Can be used for LDA ("word_1:2 word_2:3" will stay the same), 36 | #'but should be used carefully, because special characters can ruin final VW format file. 37 | #'@param targets [string or string vector] For \code{df2vw}. 38 | #'If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 39 | #'If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 40 | #'multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm. 41 | #'@param probabilities [string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm. 42 | #'@param weight [string] For \code{df2vw}. Weight (importance) of each line of the dataset. 
43 | #'@param base [string] For \code{df2vw}. base of each line of the dataset. Used for residual regression. 44 | #'@param tag [string] For \code{df2vw}. Tag of each line of the dataset. 45 | #'@param multiline [integer] number of labels (separate lines) for multilines examle 46 | #'@import tools 47 | #'@examples 48 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 49 | #'test_vwmodel <- vwsetup() 50 | #'vwtrain(test_vwmodel, data = ext_train_data) 51 | vwtrain <- function(vwmodel, data, readable_model = NULL, readable_model_path = "", quiet = FALSE, update_model = FALSE, passes = 1L, cache = FALSE, progress = NULL, namespaces = NULL, keep_space = NULL, fixed = NULL, targets = NULL, probabilities = NULL, weight = NULL, base = NULL, tag = NULL, multiline = NULL) { 52 | invisible(.Call(`_rvw_vwtrain`, vwmodel, data, readable_model, readable_model_path, quiet, update_model, passes, cache, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline)) 53 | } 54 | 55 | #'Compute predictions using Vowpal Wabbit model 56 | #' 57 | #'\code{vwtest} computes predictions using VW model from \code{\link{vwsetup}} 58 | #'\code{predict.vw} compute predictions using parser settings from \code{\link{vwtrain}} 59 | #' 60 | #'@param vwmodel [vw] Model of vw class to train. 61 | #'@param object Model of vw class to train for \code{predict.vw} 62 | #'@param data [string or data.frame] Path to training data in .vw plain text format or data.frame. 63 | #'If \code{[data.frame]} then will be parsed using \code{df2vw} function. 64 | #'@param probs_path [string] Path to file where to save predictions. 65 | #'@param full_probs [bool] Output full predictions in data.frame format. If not, force predictions into a single vector (default). 66 | #'@param readable_model [string] Print trained model in human readable format ("hashed") 67 | #'and also with human readable features ("inverted"). 
68 | #'@param readable_model_path [string] Path to file where to save readable model. 69 | #'@param quiet [bool] Do not print anything to the console. 70 | #'@param passes [int] Number of times the algorithm will cycle over the data (epochs). 71 | #'@param cache [bool] Use a cache for a data file. 72 | #'@param raw [bool] Output unnormalized predictions. Default is FALSE. 73 | #'@param progress [int/real] Progress update frequency. int: additive, real: multiplicative 74 | #'@param namespaces [list or yaml file] For \code{df2vw}. Name of each namespace and 75 | #' each variable for each namespace can be an R list, or a YAML 76 | #' file example namespace with the IRIS database: namespaces = 77 | #' list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 78 | #' 'Petal.Width') this creates 2 namespaces (sepal 79 | #' and petal) containing the features defined by elements of these lists. 80 | #'@param keep_space [string vector] For \code{df2vw}. Keep spaces for these features 81 | #'Example:"FERRARI 4Si" 82 | #'With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 83 | #'Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature 84 | #'@param fixed [string vector] fixed parsing for these features 85 | #'Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 86 | #'Can be used for LDA ("word_1:2 word_2:3" will stay the same), 87 | #'but should be used carefully, because special characters can ruin final VW format file. 88 | #'@param targets [string or string vector] For \code{df2vw}. 89 | #'If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 90 | #'If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 91 | #'multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm. 
92 | #'@param probabilities [string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm. 93 | #'@param weight [string] For \code{df2vw}. Weight (importance) of each line of the dataset. 94 | #'@param base [string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression. 95 | #'@param tag [string] For \code{df2vw}. Tag of each line of the dataset. 96 | #'@param multiline [integer] Number of labels (separate lines) for multilines example 97 | #'@param ... Parameters passed to \code{predict.vw} 98 | #'@return Numerical vector containing predictions 99 | #'@import tools 100 | #'@examples 101 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 102 | #'ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 103 | #'test_vwmodel <- vwsetup() 104 | #'vwtrain(test_vwmodel, data = ext_train_data) 105 | #'vwtest(test_vwmodel, data = ext_test_data) 106 | #'@rdname vwtest 107 | vwtest <- function(vwmodel, data, probs_path = "", full_probs = FALSE, readable_model = NULL, readable_model_path = "", quiet = FALSE, passes = 1L, cache = FALSE, raw = FALSE, progress = NULL, namespaces = NULL, keep_space = NULL, fixed = NULL, targets = NULL, probabilities = NULL, weight = NULL, base = NULL, tag = NULL, multiline = NULL) { 108 | .Call(`_rvw_vwtest`, vwmodel, data, probs_path, full_probs, readable_model, readable_model_path, quiet, passes, cache, raw, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline) 109 | } 110 | 111 | #'Audit Vowpal Wabbit model 112 | #' 113 | #'Get feature names and their model values. 114 | #' 115 | #'@param vwmodel Model of vw class to train 116 | #'@param quiet [bool] Do not print anything to the console. 
117 | #'@return Data.frame containing feature names, feature hashes and model values 118 | #'@examples 119 | #'ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 120 | #'test_vwmodel <- vwsetup() 121 | #'vwtrain(test_vwmodel, data = ext_train_data) 122 | #'vwaudit(test_vwmodel) 123 | vwaudit <- function(vwmodel, quiet = FALSE) { 124 | .Call(`_rvw_vwaudit`, vwmodel, quiet) 125 | } 126 | 127 | -------------------------------------------------------------------------------- /R/init.R: -------------------------------------------------------------------------------- 1 | 2 | .rvw_global <- new.env(parent=emptyenv()) 3 | 4 | .onAttach <- function(libname, pkgname) { 5 | # Initialise default/check lists 6 | general_check <- list(random_seed=0, 7 | ring_size=NA_real_, 8 | holdout_off=FALSE, 9 | holdout_period=10, 10 | holdout_after=0, 11 | early_terminate=3, 12 | loss_function=NA_character_, 13 | link=NA_character_, 14 | quantile_tau=0.5) 15 | feature_check <- list(bit_precision=18, 16 | quadratic=NA_character_, 17 | cubic=NA_character_, 18 | interactions=NA_character_, 19 | permutations=FALSE, 20 | leave_duplicate_interactions=FALSE, 21 | noconstant=FALSE, 22 | feature_limit=NA_character_, 23 | ngram=NA_character_, 24 | skips=NA_character_, 25 | hash=NA_character_, 26 | affix=NA_character_, 27 | spelling=NA_character_, 28 | interact=NA_character_) 29 | 30 | optimization_check <- list(learning_rate=0.5, 31 | initial_pass_length=NA_real_, 32 | l1=0, 33 | l2=0, 34 | no_bias_regularization=NA_character_, 35 | feature_mask=NA_character_, 36 | decay_learning_rate=1, 37 | initial_t=0, 38 | power_t=0.5, 39 | initial_weight=0, 40 | random_weights="off", 41 | normal_weights="off", 42 | truncated_normal_weights="off", 43 | sparse_weights=FALSE, 44 | input_feature_regularizer=NA_character_) 45 | 46 | if (.get_vw_version() == "8.6.1") { 47 | # Learning algorithm default/check lists 48 | sgd_check <- list(adaptive=TRUE, 49 | normalized=TRUE, 50 | 
invariant=TRUE, 51 | adax=FALSE, 52 | sparse_l2=0, 53 | l1_state=0, 54 | l2_state=1) 55 | bfgs_check <- list(conjugate_gradient=FALSE, 56 | hessian_on=FALSE, 57 | mem=15, 58 | termination=0.00100000005) 59 | ftrl_check <- list(ftrl_alpha=0.005, 60 | ftrl_beta=0.1) 61 | pistol_check <- list(ftrl_alpha=0.005, 62 | ftrl_beta=0.1) 63 | ksvm_check <- list(reprocess=1, 64 | kernel="linear", 65 | bandwidth=1.0, 66 | degree=2, 67 | lambda=-1) 68 | OjaNewton_check <- list(sketch_size=10, 69 | epoch_size=1, 70 | alpha=1, 71 | alpha_inverse=NA_real_, 72 | learning_rate_cnt=2, 73 | normalize="on", 74 | random_init="on") 75 | svrg_check <- list(stage_size=1) 76 | 77 | # Learning parameters/reductions default/check lists 78 | binary_check <- list(binary=TRUE) 79 | oaa_check <- list(num_classes=NA_real_, 80 | oaa_subsample=NA_real_ 81 | # probabilities=FALSE, 82 | # scores=FALSE 83 | ) 84 | ect_check <- list(num_classes=NA_real_) 85 | csoaa_check <- list(num_classes=NA_real_, 86 | csoaa_ldf="" 87 | # csoaa_rank=FALSE, 88 | # probabilities=FALSE 89 | ) 90 | wap_check <- list(num_classes=NA_real_, 91 | wap_ldf="" 92 | # csoaa_rank=FALSE, 93 | # probabilities=FALSE 94 | ) 95 | log_multi_check <- list(num_classes=NA_real_, 96 | no_progress=FALSE, 97 | swap_resistance=4) 98 | recall_tree_check <- list(num_classes=NA_real_, 99 | max_candidates=NA_real_, 100 | bern_hyper=1, 101 | max_depth=NA_real_, 102 | node_only=0, 103 | randomized_routing=0) 104 | lda_check <- list(num_topics=NA_real_, 105 | lda_alpha=0.100000001, 106 | lda_rho=0.100000001, 107 | lda_D=10000, 108 | lda_epsilon=0.00100000005, 109 | math_mode=NA_character_, 110 | minibatch=1, 111 | metrics=0) 112 | multilabel_oaa_check <- list(num_labels=NA_real_) 113 | classweight_check <- list(class_multiplier=NA_real_) 114 | new_mf_check <- list(rank=NA_real_) 115 | lrq_check <- list(features=NA_character_, 116 | lrqdropout=FALSE) 117 | stage_poly_check <- list(sched_exponent = 1.0, 118 | batch_sz = 1000, 119 | batch_sz_no_doubling 
= TRUE) 120 | bootstrap_check <- list(num_rounds=NA_real_, 121 | bs_type="mean") 122 | autolink_check <- list(degree=2) 123 | replay_check <- list(level="b", 124 | buffer=100, 125 | count=1) 126 | explore_eval_check <- list(explore_eval=TRUE, 127 | multiplier=NA_real_) 128 | cb_check<- list(num_costs=NA_real_, 129 | cb_type="dr", 130 | eval=FALSE, 131 | rank_all=FALSE, 132 | no_predict=FALSE) 133 | cb_explore_check <- list(num_actions=NA_real_, 134 | explore_type="epsilon", 135 | explore_arg=0.05, 136 | psi=1, 137 | nounif=FALSE, 138 | mellowness=0.1, 139 | greedify=FALSE, 140 | lambda=-1, 141 | cb_min_cost=0, 142 | cb_max_cost=1, 143 | first_only=FALSE) 144 | cbify_check <- list(num_classes=NA_real_, 145 | cbify_cs=FALSE, 146 | loss0=0, 147 | loss1=1) 148 | multiworld_test_check <- list(features=NA_character_, 149 | learn=NA_real_, 150 | exclude_eval=FALSE) 151 | nn_check <- list(num_hidden=NA_real_, 152 | inpass=FALSE, 153 | multitask=FALSE, 154 | dropout=FALSE, 155 | meanfield=FALSE) 156 | topk_check <- list(num_k=NA_real_) 157 | search_check <- list(id=NA_real_, 158 | search_task=NA_character_, 159 | search_interpolation=NA_character_, 160 | search_rollout=NA_character_, 161 | search_rollin=NA_character_, 162 | search_passes_per_policy=1, 163 | search_beta=0.5, 164 | search_alpha=1e-10, 165 | search_total_nb_policies=NA_real_, 166 | search_trained_nb_policies=NA_real_, 167 | search_allowed_transitions=NA_character_, 168 | search_subsample_time=NA_real_, 169 | search_neighbor_features=NA_character_, 170 | search_rollout_num_steps=NA_real_, 171 | search_history_length=1, 172 | search_no_caching=FALSE, 173 | search_xv=FALSE, 174 | search_perturb_oracle=0, 175 | search_linear_ordering=FALSE, 176 | search_active_verify=NA_real_, 177 | search_save_every_k_runs=NA_real_) 178 | boosting_check <- list(num_learners=NA_real_, 179 | gamma=0.100000001, 180 | alg="BBM") 181 | marginal_check <- list(ids=NA_character_, 182 | initial_denominator=1, 183 | initial_numerator=0.5, 
184 | compete=FALSE, 185 | update_before_learn=0, 186 | unweighted_marginals=0, 187 | decay=0) 188 | check_lists <- list(general_check=general_check, feature_check=feature_check, optimization_check=optimization_check, 189 | sgd_check=sgd_check, bfgs_check=bfgs_check, ftrl_check=ftrl_check, pistol_check=pistol_check, ksvm_check=ksvm_check, 190 | OjaNewton_check=OjaNewton_check, svrg_check=svrg_check, 191 | binary_check=binary_check, oaa_check=oaa_check, ect_check=ect_check, csoaa_check=csoaa_check, wap_check=wap_check, 192 | log_multi_check=log_multi_check, recall_tree_check=recall_tree_check, lda_check=lda_check, multilabel_oaa_check=multilabel_oaa_check, 193 | new_mf_check=new_mf_check, classweight_check=classweight_check, lrq_check=lrq_check, stage_poly_check=stage_poly_check, 194 | bootstrap_check=bootstrap_check, autolink_check=autolink_check, replay_check=replay_check, 195 | cb_check=cb_check, explore_eval_check=explore_eval_check, cb_explore_check=cb_explore_check, cbify_check=cbify_check, 196 | multiworld_test_check=multiworld_test_check, nn_check=nn_check, topk_check=topk_check, search_check=search_check, 197 | boosting_check=boosting_check, marginal_check=marginal_check) 198 | } else { 199 | stop("Vowpal Wabbit v8.6.1 or newer required") 200 | } 201 | 202 | flatten_check_lists <- .flatten(check_lists) 203 | 204 | assign("check_lists", check_lists, envir=.rvw_global) 205 | assign("flatten_check_lists", flatten_check_lists, envir=.rvw_global) 206 | } 207 | -------------------------------------------------------------------------------- /R/rhelpers.R: -------------------------------------------------------------------------------- 1 | # Helper functions 2 | 3 | .sprintf2 <- function(fmt, ...) { 4 | MAX_NVAL <- 99L 5 | args <- c(...) 
6 | if (length(args) <= MAX_NVAL) 7 | return(do.call(sprintf, c(list(fmt), args))) 8 | stopifnot(length(fmt) == 1L) 9 | not_a_spec_at <- gregexpr("%%", fmt, fixed=TRUE)[[1L]] 10 | not_a_spec_at <- c(not_a_spec_at, not_a_spec_at + 1L) 11 | spec_at <- setdiff(gregexpr("%", fmt, fixed=TRUE)[[1L]], not_a_spec_at) 12 | nspec <- length(spec_at) 13 | if (length(args) < nspec) 14 | stop("too few arguments") 15 | if (nspec <= MAX_NVAL) { 16 | break_points <- integer(0) 17 | } else { 18 | break_points <- seq(MAX_NVAL + 1L, nspec, by=MAX_NVAL) 19 | } 20 | break_from <- c(1L, break_points) 21 | break_to <- c(break_points - 1L, nspec) 22 | fmt_break_at <- spec_at[break_points] 23 | fmt_chunks <- substr(rep.int(fmt, length(fmt_break_at) + 1L), 24 | c(1L, fmt_break_at), 25 | c(fmt_break_at - 1L, nchar(fmt))) 26 | ans_chunks <- mapply( 27 | function(fmt_chunk, from, to) 28 | do.call(sprintf, c(list(fmt_chunk), args[from:to])), 29 | fmt_chunks, 30 | break_from, 31 | break_to 32 | ) 33 | paste(apply(ans_chunks,1, paste, collapse = ""), collapse = "\n") 34 | } 35 | 36 | .check_parameters <- function(params) { 37 | # Helper function to check parameters 38 | check_param_values <- function(input, check) { 39 | bool_check_names <- names(input) %in% names(check) 40 | if(!all(bool_check_names)) { 41 | error_msg <- paste0("Wrong argument names: ", 42 | paste0(names(input)[!bool_check_names], collapse = ", ")) 43 | stop(error_msg, call. 
= FALSE) 44 | } 45 | 46 | valid_input <- check 47 | bool_check_values <- sapply(names(input), FUN = function(i) { 48 | # First check if types of input argument values are correct (same as of check lists) 49 | bool_check <- (typeof(input[[i]]) == typeof(check[[i]])) | (is.na(input[[i]])) 50 | # Replace default/check values with values from input 51 | valid_input[[i]] <<- input[[i]] 52 | # And return bool values to raise errors 53 | bool_check 54 | }) 55 | if(!all(bool_check_values)) { 56 | error_msg <- paste0("Wrong argument values: ", 57 | paste0(names(input)[!bool_check_values], collapse = ", ")) 58 | stop(error_msg, call. = FALSE) 59 | } 60 | 61 | # Return check with modified values 62 | return(valid_input) 63 | } 64 | 65 | # Create default parameters list if no parameters provided 66 | # Else check parameters and return validated parameters 67 | if(length(params$options) != 0) { 68 | valid_options <- list() 69 | params$options <- sapply(names(params$options), function(option_name) { 70 | option_check_type <- .rvw_global[["check_lists"]][[paste0(option_name, "_check")]] 71 | valid_option <- check_param_values( 72 | input = params$options[[option_name]], 73 | check = option_check_type 74 | ) 75 | # Check for missing first argument value in option parameters 76 | if(is.na(valid_option[[1]])){ 77 | error_msg <- paste0("Missing value for argument: ", 78 | names(valid_option)[[1]], 79 | "\nFor option: ", 80 | option_name, "\n" 81 | ) 82 | stop(error_msg, call. 
= FALSE) 83 | } 84 | valid_option <- setNames(list(valid_option), option_name) 85 | valid_options <<- c(valid_options, valid_option) 86 | }) 87 | params$options <- valid_options 88 | 89 | } 90 | if(length(params$general_params) == 0) { 91 | params$general_params <- .rvw_global[["check_lists"]][["general_check"]] 92 | } else { 93 | params$general_params <- check_param_values( 94 | input = params$general_params, 95 | # input = c(list(cache=params$cache), params$general_params), 96 | check = .rvw_global[["check_lists"]][["general_check"]] 97 | ) 98 | } 99 | if(length(params$feature_params) == 0) { 100 | params$feature_params <- .rvw_global[["check_lists"]][["feature_check"]] 101 | } else { 102 | params$feature_params <- check_param_values( 103 | input = params$feature_params, 104 | check = .rvw_global[["check_lists"]][["feature_check"]] 105 | ) 106 | } 107 | if(length(params$optimization_params) == 0) { 108 | algorithm_parameters <- .rvw_global[["check_lists"]][[paste0(params$algorithm, "_check")]] 109 | params$optimization_params <- c(algorithm_parameters, .rvw_global[["check_lists"]][["optimization_check"]]) 110 | } else { 111 | algorithm_check_type <- .rvw_global[["check_lists"]][[paste0(params$algorithm, "_check")]] 112 | params$optimization_params <- check_param_values( 113 | input = params$optimization_params, 114 | check = c(algorithm_check_type, .rvw_global[["check_lists"]][["optimization_check"]]) 115 | ) 116 | } 117 | 118 | # # Cache should be created, if passes > 1 119 | # if(params$general_params$passes > 1) { 120 | # params$general_params$cache <- TRUE 121 | # } 122 | # Return validated parameters 123 | return(list(algorithm = params$algorithm, 124 | general_params = params$general_params, 125 | feature_params = params$feature_params, 126 | optimization_params = params$optimization_params, 127 | options = params$options)) 128 | } 129 | 130 | .create_parameters_string <- function(params) { 131 | params_to_strings <- function(i) { 132 | 
if(is.na(flat_params[[i]]) || isTRUE(flat_params[[i]] == .rvw_global[["flatten_check_lists"]][[i]])) { 133 | return("") 134 | }; 135 | if(is.logical(flat_params[[i]][[1]]) & flat_params[[i]][[1]] == TRUE) { 136 | return(paste0("--",i)) 137 | }; 138 | if(is.logical(flat_params[[i]][[1]]) & flat_params[[i]][[1]] == FALSE) { 139 | return("") 140 | } else { 141 | return(paste0("--",i," ",flat_params[[i]])) 142 | } 143 | } 144 | 145 | temp_params <- params 146 | 147 | # Options exceptions 148 | exceptions_params <- c() 149 | # cb exception 150 | if(isTRUE(temp_params$options[["cb"]][1] == 0)){ 151 | # Use --cb_adf if num_costs == 0 152 | exceptions_params <- c( 153 | exceptions_params, 154 | paste0("--cb_adf") 155 | ) 156 | temp_params$options[["cb"]][1] <- NA 157 | } 158 | # cb_explore exception 159 | if("cb_explore" %in% names(temp_params$options)) { 160 | # Use --cb_explore_adf if num_actions == 0 161 | if(isTRUE(temp_params$options[["cb_explore"]][1] == 0)){ 162 | exceptions_params <- c( 163 | exceptions_params, 164 | paste0("--cb_explore_adf") 165 | ) 166 | temp_params$options[["cb_explore"]][1] <- NA 167 | } 168 | # create exploration type string like "--first arg" 169 | exceptions_params <- c( 170 | exceptions_params, 171 | paste0("--", temp_params$options[["cb_explore"]][2], " ", temp_params$options[["cb_explore"]][3]) 172 | ) 173 | temp_params$options[["cb_explore"]][2] <- NA 174 | temp_params$options[["cb_explore"]][3] <- NA 175 | } 176 | # Experience Replay exception 177 | if("replay" %in% names(temp_params$options)) { 178 | # --replay_c 100, --replay_m 100, --replay_b 100 like exception 179 | exceptions_params <- c( 180 | exceptions_params, 181 | paste0("--replay_", temp_params$options[["replay"]][1], " ", temp_params$options[["replay"]][2]), 182 | paste0("--replay_", temp_params$options[["replay"]][1], "_count ", temp_params$options[["replay"]][3]) 183 | ) 184 | temp_params$options[["replay"]][1] <- NA 185 | temp_params$options[["replay"]][2] <- NA 186 | 
temp_params$options[["replay"]][3] <- NA 187 | } 188 | exceptions_params <- Filter(exceptions_params, f = function(x) nchar(x) > 0) 189 | exceptions_string <- paste0(exceptions_params, collapse = " ") 190 | 191 | 192 | # Convert different options into string with CL arguments 193 | options_params <- sapply(names(temp_params$options), function(option_name) { 194 | if(is.na(temp_params$options[[option_name]][1])) { 195 | tmp <- "" 196 | } else if (option_name == names(temp_params$options[[option_name]])[1]) { 197 | tmp <- paste0("--", option_name) 198 | } else { 199 | tmp <- paste0("--", option_name, " ", temp_params$options[[option_name]][1]) 200 | } 201 | temp_params$options[[option_name]][1] <<- NA 202 | tmp 203 | }) 204 | 205 | # temp_params$options <- list() 206 | # Filter empty strings 207 | options_params <- Filter(options_params, f = function(x) nchar(x) > 0) 208 | options_string <- paste0(options_params, collapse = " ") 209 | 210 | # Flatten option 211 | flat_params <- .flatten(temp_params$options) 212 | 213 | # Exception for "--math_mode" -> "--math-mode" 214 | if("math_mode" %in% names(flat_params)) { 215 | names(flat_params) <- gsub(pattern = "math_mode", replacement = "math-mode", x = names(flat_params)) 216 | } 217 | 218 | # Convert option parameters list to "--arg _" list 219 | flat_option_params <- sapply(names(flat_params), FUN = params_to_strings) 220 | # Filter empty strings 221 | flat_option_params <- Filter(flat_option_params, f = function(x) nchar(x) > 0) 222 | # Create string "--passes 0 --bit_precision 18" for parser 223 | option_params_string <- paste0(flat_option_params, collapse = " ") 224 | 225 | temp_params$options <- list() 226 | 227 | #Set learning mode string argument 228 | algorithm_string <- switch (temp_params$algorithm, 229 | sgd = {tmp <- ""; tmp}, 230 | bfgs = {tmp <- "--bfgs"; tmp}, 231 | ftrl = {tmp <- "--ftrl"; tmp}, 232 | pistol = {tmp <- "--pistol"; tmp}, 233 | ksvm = {tmp <- "--ksvm"; tmp}, 234 | OjaNewton = {tmp <- 
"--OjaNewton"; tmp}, 235 | svrg = {tmp <- "--svrg"; tmp} 236 | ) 237 | # # Disable cache here, because it's checked in vwtrain and vwtest 238 | # if (temp_params$general_params$cache) { 239 | # temp_params$general_params$cache <- NA 240 | # } 241 | # Flatten list 242 | flat_params <- .flatten(temp_params[-c(1)]) 243 | # Convert parameters list to "--arg _" list 244 | flat_params <- sapply(names(flat_params), FUN = params_to_strings) 245 | # Filter empty strings 246 | flat_params <- Filter(flat_params, f = function(x) nchar(x) > 0) 247 | # Create string "--passes 0 --bit_precision 18" for parser 248 | parameters_string <- paste0(flat_params, collapse = " ") 249 | final_params <- c(algorithm_string, parameters_string, exceptions_string, options_string, option_params_string) 250 | # parameters_string <- paste(algorithm_string, parameters_string, exceptions_string, options_string, option_params_string, sep = " ") 251 | parameters_string <- paste0( 252 | Filter(final_params,f = function(x) nchar(x) > 0), 253 | collapse = " " 254 | ) 255 | 256 | return(parameters_string) 257 | } 258 | 259 | # Flatten parameters list 260 | .flatten <- function(x) { 261 | repeat { 262 | if(!any(vapply(x,is.list, logical(1)))) return(x) 263 | x <- Reduce(c, x) 264 | } 265 | } 266 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [![Build Status](https://travis-ci.org/rvw-org/rvw.svg?branch=master)](https://travis-ci.org/rvw-org/rvw) 2 | 3 | ## rvw 4 | 5 | Development of **rvw** package started as R Vowpal Wabbit (Google Summer of Code 2018) [project](https://summerofcode.withgoogle.com/projects/#5511455416254464). 6 | 7 | **Vowpal Wabbit** is an online machine learning system that is known for its speed and scalability and is widely used in research and industry. 8 | 9 | This package aims to bring its functionality to **R**. 
10 | 11 | ## Installation 12 | 13 | ### From Source 14 | 15 | First, you have to install **Vowpal Wabbit** itself [here](https://github.com/JohnLangford/vowpal_wabbit#getting-the-code). 16 | 17 | Next, once the required library is installed, you can install the **rvw** package using `remotes`: 18 | 19 | ```r 20 | install.packages("remotes") ## or devtools 21 | remotes::install_github("rvw-org/rvw") 22 | ``` 23 | 24 | or (in case you have the package sources) via a standard `R CMD INSTALL .`. 25 | 26 | This installation from source currently works best on Linux; on macOS you 27 | have to locally compile using the R-compatible toolchain (and not the 28 | brew-based one as the Vowpal Wabbit documentation suggests). 29 | 30 | There is one possible shortcut: you can use the Debian/Ubuntu package as our 31 | Docker container does: `sudo apt-get install libvw-dev vowpal-wabbit 32 | libboost-program-options-dev`. 33 | 34 | 35 | ### Using Docker 36 | 37 | We use [Docker](https://www.docker.com) for the [Travis CI](https://www.travis-ci.org) tests, and also provide a container 38 | for deployment. Do 39 | 40 | ```sh 41 | docker pull rvowpalwabbit/run ## one time 42 | docker run --rm -ti rvowpalwabbit/run bash ## launch container 43 | ``` 44 | 45 | to start the container with `rvw` installed. See the 46 | [Boettiger and Eddelbuettel RJournal paper](https://journal.r-project.org/archive/2017/RJ-2017-065/index.html) 47 | for more on Docker for R, and the [Rocker Project](https://www.rocker-project.org) used here. 
48 | 49 | ## Getting Started 50 | [Introduction](https://github.com/rvw-org/rvw/wiki/Introduction) 51 | 52 | Examples: 53 | 54 | * [Binary classification](https://github.com/rvw-org/rvw/wiki/Binary-classification) 55 | * [CSOAA multiclass classification](https://github.com/rvw-org/rvw/wiki/CSOAA-multiclass-classification) 56 | * [Topic modeling with Latent Dirichlet Allocation (LDA)](https://github.com/rvw-org/rvw/wiki/Topic-modeling-with-Latent-Dirichlet-Allocation-(LDA)) 57 | 58 | 59 | ## Example 60 | 61 | In this example we will try to predict age groups (based on number of abalone shell rings) from physical measurements. We will use Abalone Data Set from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/Abalone). 62 | 63 | First we prepare our data: 64 | 65 | ```r 66 | library(mltools) 67 | library(rvw) 68 | 69 | set.seed(1) 70 | aburl = 'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' 71 | abnames = c('sex','length','diameter','height','weight.w','weight.s','weight.v','weight.sh','rings') 72 | abalone = read.table(aburl, header = F , sep = ',', col.names = abnames) 73 | data_full <- abalone 74 | 75 | # Split number of rings into groups with equal (as possible) number of observations 76 | data_full$group <- bin_data(data_full$rings, bins=3, binType = "quantile") 77 | group_lvls <- levels(data_full$group) 78 | levels(data_full$group) <- c(1, 2, 3) 79 | 80 | # Prepare indices to split data 81 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 82 | # Split data into train and test subsets 83 | df_train <- data_full[ind_train,] 84 | df_test <- data_full[-ind_train,] 85 | ``` 86 | 87 | Then we set up a *Vowpal Wabbit* model: 88 | ```r 89 | vwmodel <- vwsetup(option = "ect", num_classes = 3) 90 | ``` 91 | 92 | * *option = "ect"* - we will use [Error Correcting Tournament](https://github.com/JohnLangford/vowpal_wabbit/wiki/Error-Correcting-Tournament-(ect)-multi-class-example) algorithm to train 
multiclass classification model; 93 | * *num_classes = 3* - number of classes in our data; 94 | 95 | Now we start training: 96 | 97 | ```r 98 | vwtrain(vwmodel, data = df_train, 99 | namespaces = list(NS1 = list("sex", "rings"), 100 | NS2 = list("weight.w","weight.s","weight.v","weight.sh", "diameter", "length", "height")), 101 | targets = "group" 102 | ) 103 | ``` 104 | And we get: `average loss = 0.278060` 105 | 106 | * *namespaces* - We will split our features into two namespaces `NS1` and `NS2`; 107 | * *targets = "group"* - ground truth labels; 108 | 109 | 110 | And finally compute predictions using trained model: 111 | 112 | ```r 113 | predict.vw(vwmodel, data = df_test) 114 | ``` 115 | Here we get: `average loss = 0.221292` 116 | 117 | We can add more learning algorithms to our model. For example we want to use *boosting* algorithm with 100 "weak" learners. Then we will just add this option to our model and train again: 118 | 119 | ```r 120 | vwmodel <- add_option(vwmodel, option = "boosting", num_learners=100) 121 | 122 | vwtrain(vwmodel, data = df_train, 123 | namespaces = list(NS1 = list("sex", "rings"), 124 | NS2 = list("weight.w","weight.s","weight.v","weight.sh", "diameter", "length", "height")), 125 | targets = "group" 126 | ) 127 | ``` 128 | We get: `average loss = 0.229273` 129 | 130 | And compute predictions: 131 | 132 | ```r 133 | predict.vw(vwmodel, data = df_test) 134 | ``` 135 | Finally we get: `average loss = 0.081340` 136 | 137 | In order to inspect parameters of our model we can simply print it: 138 | 139 | ```r 140 | vwmodel 141 | ``` 142 | 143 | ``` 144 | Vowpal Wabbit model 145 | Learning algorithm: sgd 146 | Working directory: /var/folders/yx/6949djdd3yb4qsw7x_95wfjr0000gn/T//RtmpjO3DD1 147 | Model file: /var/folders/yx/6949djdd3yb4qsw7x_95wfjr0000gn/T//RtmpjO3DD1/vw_1534253637_mdl.vw 148 | General parameters: 149 | random_seed : 0 150 | ring_size : Not defined 151 | holdout_off : FALSE 152 | holdout_period : 10 153 | holdout_after : 0 
154 | early_terminate : 3 155 | loss_function : squared 156 | link : identity 157 | quantile_tau : 0.5 158 | Feature parameters: 159 | bit_precision : 18 160 | quadratic : Not defined 161 | cubic : Not defined 162 | interactions : Not defined 163 | permutations : FALSE 164 | leave_duplicate_interactions : FALSE 165 | noconstant : FALSE 166 | feature_limit : Not defined 167 | ngram : Not defined 168 | skips : Not defined 169 | hash : Not defined 170 | affix : Not defined 171 | spelling : Not defined 172 | Learning algorithms / Reductions: 173 | ect : 174 | num_classes : 3 175 | boosting : 176 | num_learners : 100 177 | gamma : 0.1 178 | alg : BBM 179 | Optimization parameters: 180 | adaptive : TRUE 181 | normalized : TRUE 182 | invariant : TRUE 183 | adax : FALSE 184 | sparse_l2 : 0 185 | l1_state : 0 186 | l2_state : 1 187 | learning_rate : 0.5 188 | initial_pass_length : Not defined 189 | l1 : 0 190 | l2 : 0 191 | no_bias_regularization : Not defined 192 | feature_mask : Not defined 193 | decay_learning_rate : 1 194 | initial_t : 0 195 | power_t : 0.5 196 | initial_weight : 0 197 | random_weights : Not defined 198 | normal_weights : Not defined 199 | truncated_normal_weights : Not defined 200 | sparse_weights : FALSE 201 | input_feature_regularizer : Not defined 202 | Model evaluation. Training: 203 | num_examples : 3341 204 | weighted_example_sum : 3341 205 | weighted_label_sum : 0 206 | avg_loss : 0.2292727 207 | total_feature : 33408 208 | Model evaluation. 
Testing: 209 | num_examples : 836 210 | weighted_example_sum : 836 211 | weighted_label_sum : 0 212 | avg_loss : 0.08133971 213 | total_feature : 8360 214 | ``` 215 | -------------------------------------------------------------------------------- /cleanup: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | rm -f src/Makevars src/*.o src/*.so src/*.dylib 4 | -------------------------------------------------------------------------------- /configure: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | 3 | : ${R_HOME:=`R RHOME`} 4 | 5 | cmd="${R_HOME}/bin/Rscript ${PWD}/tools/r_configure.R" 6 | 7 | $cmd 8 | -------------------------------------------------------------------------------- /demo/00Index: -------------------------------------------------------------------------------- 1 | rvw_overview Basic package usage 2 | rvw_df Using data.frame as input data for CSOAA multiclass classification 3 | rvw_lda LDA example 4 | rvw_bin Simple binary classification 5 | -------------------------------------------------------------------------------- /demo/rvw_bin.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | library(mlbench) # For a dataset 3 | 4 | # First, switch to a temporary directory 5 | curr_dir <- getwd() 6 | setwd(tempdir()) 7 | 8 | set.seed(42) 9 | 10 | # We will try to identify benign or malignant class of a tumour using its histology characteristics. 
11 | data("BreastCancer", package = "mlbench") 12 | data_full <- BreastCancer 13 | 14 | # First, start with data preprocessing 15 | data_full <- data_full[complete.cases(data_full),] 16 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 17 | 18 | str(data_full) 19 | summary(data_full) 20 | # We can see that "benign" cases appear more often in our dataset 21 | # This will be used to set up a baseline model 22 | 23 | data_full <- data_full[,-1] 24 | data_full$Class <- ifelse(data_full$Class == "malignant", 1, -1) 25 | 26 | data_train <- data_full[ind_train,] 27 | data_test <- data_full[-ind_train,] 28 | 29 | # Our baseline model simply reports every tumour class as benign 30 | baseline_pred <- rep(-1, length(data_test$Class)) 31 | 32 | # Accuracy for binary classification case 33 | acc_prc <- function(y_pred, y_true){sum(y_pred == y_true) / length(y_pred) * 100} 34 | 35 | acc_prc(data_test$Class, baseline_pred) 36 | # With our baseline model, we get an accuracy of around 65% 37 | 38 | # Now we a ready to use Vowpal Wabbit models 39 | # Setup model 40 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 41 | option = "binary") # Convert predictions to {-1,+1} 42 | 43 | # Basic training and testing 44 | vwtrain(vwmodel = test_vwmodel, 45 | data = data_train, 46 | passes = 10, 47 | targets = "Class") 48 | 49 | vw_output <- vwtest(vwmodel = test_vwmodel, data = data_test) 50 | 51 | acc_prc(data_test$Class, vw_output) 52 | # Now we get much better results with an accuracy of around 97% 53 | 54 | # Switch back 55 | setwd(curr_dir) 56 | -------------------------------------------------------------------------------- /demo/rvw_df.R: -------------------------------------------------------------------------------- 1 | library(mltools) 2 | library(rvw) 3 | 4 | curr_dir <- getwd() 5 | setwd(tempdir()) 6 | # We will use abalone dataset and will try to predict age groups (based on number of abalone shell rings) from physical measurements 7 | aburl = 
'http://archive.ics.uci.edu/ml/machine-learning-databases/abalone/abalone.data' 8 | abnames = c('sex','length','diameter','height','weight.w','weight.s','weight.v','weight.sh','rings') 9 | abalone = read.table(aburl, header = F , sep = ',', col.names = abnames) 10 | data_full <- abalone 11 | 12 | # Split number of rings into groups with equal (as possible) number of observations 13 | data_full$group <- bin_data(data_full$rings, bins=3, binType = "quantile") 14 | group_lvls <- levels(data_full$group) 15 | levels(data_full$group) <- c(1, 2, 3) 16 | # Prepare variables for CSOAA algorithm 17 | data_full$cost_class_1 <- ifelse(data_full$group == 1, 0.8, 0.1) 18 | data_full$cost_class_2 <- ifelse(data_full$group == 2, 0.8, 0.1) 19 | data_full$cost_class_3 <- ifelse(data_full$group == 3, 0.8, 0.1) 20 | data_full$rings <- factor(data_full$rings) 21 | data_full$tag <- sapply(1:nrow(data_full), function(x) paste0("ex",x)) 22 | # Prepare indices to split data 23 | ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full)) 24 | # Split data into train and test subsets 25 | df_train <- data_full[ind_train,] 26 | df_test <- data_full[-ind_train,] 27 | 28 | vwmodel <- vwsetup(dir = "./", 29 | option = "csoaa", num_classes = 3) 30 | 31 | vwtrain(vwmodel, data = df_train, 32 | namespaces = list(NS1 = list("sex", "rings"), NS2 = list("diameter", "length", "height")), 33 | targets = c("cost_class_1", "cost_class_2", "cost_class_3"), tag = "tag" 34 | ) 35 | vwpreds <- predict(vwmodel, data = df_test, full_probs = T) 36 | 37 | head(vwpreds) 38 | 39 | setwd(curr_dir) 40 | 41 | -------------------------------------------------------------------------------- /demo/rvw_lda.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | 3 | # In this demo, we will take a look at the topic modeling problem. 4 | # For this, we will use Latent Dirichlet Allocation (LDA) method implemented in Vowpal Wabbit (VW). 
5 | 6 | # First, switch to a temporary directory. 7 | curr_dir <- getwd() 8 | setwd(tempdir()) 9 | 10 | # Here we prepare our dataset. We consider the WebKB dataset. 11 | # It consists of web pages collected from various Universities and manually classified into seven different classes (topics). 12 | # Original reference: The 4 Universities Data Set 13 | # http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/ 14 | # We use a preprocessed version of this dataset from Ana Cardoso-Cachopo PhD thesis: 15 | # http://ana.cachopo.org/datasets-for-single-label-text-categorization 16 | data_url <- "http://ana.cachopo.org/datasets-for-single-label-text-categorization/webkb-test-stemmed.txt" 17 | lda_data <- read.delim(file = data_url, header = F, stringsAsFactors = F) 18 | names(lda_data) <- c("topic", "text") 19 | 20 | # Clear out empty lines. 21 | lda_data <- lda_data[!(lda_data$text == ""), ] 22 | # Prepare a vocabulary from all documents. 23 | lda_vocab <- sort(unique(unlist(strsplit(lda_data$text, " ")))) 24 | 25 | 26 | # In order to use VW LDA algorithm, we have to convert plain text to "word:word_count word:word_count ..." format. 27 | # Also, we replace the words with their indexes in the vocabulary. 28 | # This is needed if we want to easily decode feature hashes later and show topics in a human-readable format. 29 | lda_data$features <- sapply(lda_data$text, function(x) { 30 | splitted_words <- unlist(strsplit(x, " ")) 31 | counted_words <- aggregate(data.frame(count=splitted_words), list(word=splitted_words), length) 32 | res_str <- paste0(apply(counted_words, 1, function(x){ 33 | paste0( (which(lda_vocab == x[["word"]]) - 1) , ":", as.numeric(x[["count"]])) 34 | # Or use this if no replacement with index is needed: 35 | # paste0(x[["word"]], ":", as.numeric(x[["count"]])) 36 | }), 37 | collapse = " ") 38 | res_str 39 | }) 40 | 41 | # Calculate required number of bits (b) for feature hashes range: [0, 2^(b) - 1]. 
42 | bits <- ceiling(log2(length(lda_vocab))) 43 | # Total number of unique documents in data 44 | num_docs <- as.numeric(nrow(lda_data)) 45 | 46 | # Now we can set up a LDA model. 47 | lda_model <- vwsetup(feature_params = list(bit_precision=bits), 48 | optimization_params = list(initial_t=1, power_t=0.5), # Parameters for learning rate schedule 49 | option = "lda", # Enable LDA algorithm 50 | num_topics = 7, # Specify the number of topics to learn (the same as were manually classified) 51 | lda_D = num_docs, 52 | minibatch = 16) # Analyze 16 documents at a time 53 | 54 | # And start learning a set of topics. 55 | vwtrain(vwmodel = lda_model, 56 | data = lda_data, 57 | namespaces = list(" " = "features"), 58 | fixed = "features") 59 | 60 | # Here we get our topic predictions for each word from regressor values. 61 | vwout <- vwaudit(vwmodel = lda_model) 62 | # Each line of vwout corresponds to a single feature (a single word in our case) 63 | # Output contains following columns: 64 | # Names - feature names 65 | # Hashes - feature hashes 66 | # V1-V7 - Regressor values for each topic 67 | 68 | 69 | # Now we need to post-process this output to get final word - topic correspondences. 70 | # First, filter out zero valued features. 71 | selected_rows <- apply(vwout[, 3:9], 1, function(x) { 72 | !all(x == 0) 73 | }) 74 | vwout<- vwout[selected_rows,] 75 | 76 | # And finaly: 77 | # 1) Connect words from prepared vocabulary with feature hashes from our model. 78 | # 2) Connect words with a maximum valued topic prediction. 79 | lda_results <- data.frame( 80 | word = lda_vocab, 81 | topic = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) { 82 | which.max(x) 83 | }), 84 | value = apply(vwout[order(vwout$Hashes), 3:9], 1, function(x) { 85 | max(x) 86 | }) 87 | ) 88 | 89 | head(lda_results) 90 | 91 | # Switch back. 
92 | setwd(curr_dir) 93 | -------------------------------------------------------------------------------- /demo/rvw_overview.R: -------------------------------------------------------------------------------- 1 | library(rvw) 2 | 3 | curr_dir <- getwd() 4 | setwd(tempdir()) 5 | 6 | # Get VW format datafiles 7 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 8 | ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 9 | multiclass_train_data <- system.file("extdata", "multiclass_train.vw", package = "rvw") 10 | multiclass_test_data <- system.file("extdata", "multiclass_valid.vw", package = "rvw") 11 | 12 | # Setup model 13 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 14 | feature_params = list(hash="all", bit_precision=25), 15 | optimization_params = list(adaptive=FALSE, learning_rate=0.1)) 16 | # Basic training and testing 17 | vwtrain(test_vwmodel, data = ext_train_data) 18 | vw_output <- vwtest(test_vwmodel, data = ext_test_data, probs_path = "./probs.vw") 19 | 20 | # Printing readable model 21 | test_vwmodel <- vwsetup() 22 | vwtrain(test_vwmodel, data = ext_train_data, readable_model = "hashed") 23 | vwtest(test_vwmodel, data = ext_test_data, readable_model = "inverted") 24 | # Model audit 25 | vwaudit(test_vwmodel) 26 | # No console output 27 | vwtrain(test_vwmodel, data = ext_train_data, quiet = T) 28 | vwtest(test_vwmodel, data = ext_train_data, quiet = T) 29 | 30 | # Add learning options 31 | library(magrittr) 32 | test_vwmodel <- vwsetup(dir = "./", model = "mdl.vw", 33 | option = "ect", num_classes=3) %>% 34 | add_option(option = "boosting", num_learners=10) 35 | # Print vwmodel contents 36 | test_vwmodel 37 | # Access vw parameters 38 | vwparams(test_vwmodel, "num_classes") 39 | # Modify parameters 40 | vwparams(test_vwmodel, "num_learners") <- 100 41 | 42 | vwtrain(test_vwmodel, data = multiclass_train_data) 43 | vwtest(test_vwmodel, data = multiclass_test_data) 44 | 45 | setwd(curr_dir) 46 
| -------------------------------------------------------------------------------- /docker/ci/Dockerfile: -------------------------------------------------------------------------------- 1 | ## Emacs, make this -*- mode: sh; -*- 2 | 3 | FROM r-base:latest 4 | 5 | LABEL org.label-schema.license="GPL-2.0" \ 6 | org.label-schema.vcs-url="https://github.com/rvw-org" \ 7 | maintainer="Dirk Eddelbuettel " 8 | 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends \ 11 | r-cran-rcpp \ 12 | r-cran-testthat \ 13 | r-cran-runit \ 14 | r-cran-data.table \ 15 | r-cran-knitr \ 16 | r-cran-rmarkdown \ 17 | libvw-dev \ 18 | vowpal-wabbit \ 19 | libboost-program-options-dev \ 20 | r-cran-matrix \ 21 | && install.r RApiSerialize mltools \ 22 | && mkdir ~/.R \ 23 | && echo _R_CHECK_FORCE_SUGGESTS_=FALSE > ~/.R/check.Renviron 24 | 25 | CMD ["bash"] 26 | -------------------------------------------------------------------------------- /docker/run/Dockerfile: -------------------------------------------------------------------------------- 1 | ## Emacs, make this -*- mode: sh; -*- 2 | 3 | FROM rvowpalwabbit/ci 4 | 5 | LABEL org.label-schema.license="GPL-2.0" \ 6 | org.label-schema.vcs-url="https://github.com/rvw-org/rvw" \ 7 | maintainer="Dirk Eddelbuettel " 8 | 9 | ## If on CRAN, install the latest version from CRAN 10 | #RUN install.r ... 
11 | 12 | ## Alternatively, install from Github (after first installing remotes) 13 | RUN install.r remotes && installGithub.r rvw-org/rvw 14 | 15 | CMD ["bash"] 16 | -------------------------------------------------------------------------------- /inst/extdata/binary_train.vw: -------------------------------------------------------------------------------- 1 | -1.000000 |A carat:1.590000 depth:62.300000 table:60.000000 price:11613.000000 x:7.420000 z:4.640000 cut_Very_Good color_G clarity_SI1 2 | 1.000000 |A carat:0.420000 depth:61.500000 table:59.000000 price:709.000000 x:4.840000 z:2.960000 cut_Premium color_I clarity_SI2 3 | -1.000000 |A carat:1.040000 depth:61.600000 table:56.000000 price:3960.000000 x:6.560000 z:4.020000 cut_Premium color_I clarity_SI2 4 | 1.000000 |A carat:0.510000 depth:63.300000 table:56.000000 price:1656.000000 x:5.080000 z:3.230000 cut_Good color_E clarity_VS2 5 | 1.000000 |A carat:0.550000 depth:62.100000 table:56.000000 price:1117.000000 x:5.250000 z:3.250000 cut_Ideal color_G clarity_SI2 6 | -1.000000 |A carat:0.900000 depth:61.200000 table:60.000000 price:4304.000000 x:6.190000 z:3.770000 cut_Premium color_D clarity_SI1 7 | -1.000000 |A carat:0.700000 depth:61.800000 table:60.000000 price:2330.000000 x:5.680000 z:3.520000 cut_Premium color_H clarity_VS2 8 | 1.000000 |A carat:0.230000 depth:59.500000 table:58.000000 price:530.000000 x:4.000000 z:2.390000 cut_Very_Good color_F clarity_IF 9 | -1.000000 |A carat:2.040000 depth:61.500000 table:57.000000 price:16800.000000 x:8.150000 z:5.020000 cut_Ideal color_H clarity_SI2 10 | 1.000000 |A carat:0.310000 depth:61.600000 table:54.000000 price:891.000000 x:4.400000 z:2.720000 cut_Very_Good color_G clarity_IF 11 | 1.000000 |A carat:0.410000 depth:62.800000 table:60.000000 price:834.000000 x:4.720000 z:2.980000 cut_Very_Good color_E clarity_VS1 12 | -1.000000 |A carat:1.290000 depth:61.600000 table:57.000000 price:6588.000000 x:7.000000 z:4.300000 cut_Ideal color_H clarity_SI1 13 | 1.000000 
|A carat:0.500000 depth:62.100000 table:55.000000 price:1080.000000 x:5.090000 z:3.170000 cut_Ideal color_G clarity_SI2 14 | -1.000000 |A carat:0.710000 depth:55.600000 table:73.000000 price:2368.000000 x:6.010000 z:3.330000 cut_Fair color_D clarity_VS2 15 | -1.000000 |A carat:1.210000 depth:61.500000 table:58.000000 price:5211.000000 x:6.850000 z:4.230000 cut_Very_Good color_J clarity_VS2 16 | 1.000000 |A carat:0.410000 depth:61.400000 table:57.000000 price:1079.000000 x:4.790000 z:2.950000 cut_Ideal color_E clarity_VS1 17 | 1.000000 |A carat:0.500000 depth:62.900000 table:61.000000 price:1845.000000 x:5.030000 z:3.140000 cut_Premium color_D clarity_VS2 18 | -1.000000 |A carat:1.050000 depth:61.900000 table:56.000000 price:4586.000000 x:6.560000 z:4.040000 cut_Ideal color_H clarity_SI2 19 | -1.000000 |A carat:1.060000 depth:61.800000 table:55.000000 price:5697.000000 x:6.540000 z:4.040000 cut_Ideal color_I clarity_SI1 20 | 1.000000 |A carat:0.270000 depth:62.300000 table:54.000000 price:500.000000 x:4.160000 z:2.600000 cut_Ideal color_H clarity_VS1 21 | 1.000000 |A carat:0.410000 depth:61.700000 table:56.000000 price:1115.000000 x:4.780000 z:2.960000 cut_Ideal color_F clarity_VVS2 22 | -1.000000 |A carat:1.160000 depth:62.400000 table:55.000000 price:3800.000000 x:6.760000 z:4.200000 cut_Ideal color_I clarity_SI2 23 | 1.000000 |A carat:0.720000 depth:63.900000 table:62.000000 price:2188.000000 x:5.700000 z:3.620000 cut_Good color_F clarity_SI1 24 | -1.000000 |A carat:0.910000 depth:62.700000 table:59.000000 price:4720.000000 x:6.130000 z:3.850000 cut_Very_Good color_D clarity_VS2 25 | 1.000000 |A carat:0.300000 depth:62.200000 table:54.000000 price:819.000000 x:4.330000 z:2.680000 cut_Ideal color_G clarity_VVS2 26 | 1.000000 |A carat:0.330000 depth:60.100000 table:58.000000 price:1052.000000 x:4.510000 z:2.700000 cut_Premium color_E clarity_VVS1 27 | 1.000000 |A carat:0.550000 depth:61.200000 table:56.400000 price:1975.000000 x:5.280000 z:3.250000 cut_Very_Good 
color_F clarity_VS1 28 | -1.000000 |A carat:1.730000 depth:61.700000 table:56.000000 price:12998.000000 x:7.640000 z:4.730000 cut_Very_Good color_H clarity_VS2 29 | 1.000000 |A carat:0.400000 depth:59.200000 table:61.000000 price:900.000000 x:4.810000 z:2.840000 cut_Premium color_G clarity_SI1 30 | -1.000000 |A carat:0.900000 depth:61.600000 table:55.000000 price:3587.000000 x:6.240000 z:3.850000 cut_Ideal color_J clarity_VS1 31 | 1.000000 |A carat:0.580000 depth:61.100000 table:56.000000 price:1719.000000 x:5.400000 z:3.310000 cut_Ideal color_G clarity_VS2 32 | -1.000000 |A carat:1.120000 depth:59.600000 table:56.000000 price:8973.000000 x:6.800000 z:4.070000 cut_Good color_G clarity_IF 33 | -1.000000 |A carat:0.760000 depth:61.700000 table:55.000000 price:2553.000000 x:5.870000 z:3.630000 cut_Ideal color_G clarity_SI2 34 | -1.000000 |A carat:1.010000 depth:62.000000 table:58.000000 price:10688.000000 x:6.410000 z:3.990000 cut_Ideal color_F clarity_IF 35 | -1.000000 |A carat:0.700000 depth:62.400000 table:53.000000 price:2839.000000 x:5.730000 z:3.570000 cut_Ideal color_F clarity_VS1 36 | -1.000000 |A carat:0.910000 depth:62.200000 table:57.000000 price:3884.000000 x:6.120000 z:3.820000 cut_Very_Good color_G clarity_SI1 37 | -1.000000 |A carat:1.070000 depth:62.300000 table:53.000000 price:6049.000000 x:6.570000 z:4.100000 cut_Ideal color_F clarity_SI1 38 | 1.000000 |A carat:0.300000 depth:63.100000 table:56.000000 price:675.000000 x:4.280000 z:2.690000 cut_Very_Good color_G clarity_VS2 39 | -1.000000 |A carat:1.060000 depth:61.000000 table:57.000000 price:7900.000000 x:6.550000 z:4.000000 cut_Ideal color_G clarity_VVS2 40 | -1.000000 |A carat:0.930000 depth:61.900000 table:55.000000 price:4511.000000 x:6.260000 z:3.890000 cut_Ideal color_E clarity_SI1 41 | 1.000000 |A carat:0.310000 depth:61.100000 table:55.000000 price:840.000000 x:4.380000 z:2.680000 cut_Ideal color_F clarity_VVS1 42 | -1.000000 |A carat:1.210000 depth:61.000000 table:60.000000 
price:4675.000000 x:6.880000 z:4.180000 cut_Premium color_J clarity_VS1 43 | 1.000000 |A carat:0.300000 depth:62.200000 table:60.000000 price:526.000000 x:4.240000 z:2.640000 cut_Very_Good color_G clarity_VS2 44 | 1.000000 |A carat:0.330000 depth:61.100000 table:56.000000 price:539.000000 x:4.500000 z:2.760000 cut_Ideal color_H clarity_VS2 45 | -1.000000 |A carat:1.700000 depth:63.200000 table:58.000000 price:7730.000000 x:7.560000 z:4.760000 cut_Very_Good color_J clarity_VS2 46 | -1.000000 |A carat:0.900000 depth:62.300000 table:64.000000 price:3605.000000 x:6.100000 z:3.810000 cut_Very_Good color_I clarity_VS2 47 | 1.000000 |A carat:0.310000 depth:59.000000 table:57.000000 price:446.000000 x:4.400000 z:2.610000 cut_Very_Good color_H clarity_SI1 48 | -1.000000 |A carat:0.930000 depth:61.800000 table:55.000000 price:5375.000000 x:6.280000 z:3.880000 cut_Ideal color_H clarity_VS1 49 | 1.000000 |A carat:0.700000 depth:61.800000 table:57.000000 price:2058.000000 x:5.710000 z:3.520000 cut_Ideal color_E clarity_SI1 50 | -1.000000 |A carat:1.010000 depth:61.200000 table:61.000000 price:5294.000000 x:6.440000 z:3.930000 cut_Premium color_H clarity_VS1 51 | 1.000000 |A carat:0.500000 depth:61.600000 table:62.000000 price:2437.000000 x:5.040000 z:3.120000 cut_Good color_E clarity_VVS2 52 | -1.000000 |A carat:1.550000 depth:60.100000 table:58.000000 price:11871.000000 x:7.550000 z:4.520000 cut_Premium color_E clarity_SI1 53 | 1.000000 |A carat:0.260000 depth:61.700000 table:57.000000 price:599.000000 x:4.070000 z:2.520000 cut_Ideal color_E clarity_VVS1 54 | -1.000000 |A carat:1.080000 depth:61.800000 table:56.000000 price:6078.000000 x:6.620000 z:4.110000 cut_Ideal color_D clarity_SI1 55 | -1.000000 |A carat:0.920000 depth:62.500000 table:59.000000 price:3613.000000 x:6.200000 z:3.890000 cut_Very_Good color_D clarity_SI2 56 | 1.000000 |A carat:0.420000 depth:62.400000 table:55.000000 price:1031.000000 x:4.790000 z:3.000000 cut_Ideal color_G clarity_VVS2 57 | 1.000000 |A 
carat:0.320000 depth:60.700000 table:58.000000 price:720.000000 x:4.420000 z:2.670000 cut_Premium color_G clarity_VS2 58 | -1.000000 |A carat:1.000000 depth:61.400000 table:58.000000 price:8216.000000 x:6.360000 z:3.920000 cut_Very_Good color_F clarity_VVS2 59 | -1.000000 |A carat:0.770000 depth:62.000000 table:56.000000 price:3697.000000 x:5.870000 z:3.650000 cut_Ideal color_E clarity_VS1 60 | -1.000000 |A carat:1.010000 depth:63.600000 table:57.000000 price:5251.000000 x:6.270000 z:4.000000 cut_Good color_E clarity_SI1 61 | 1.000000 |A carat:0.300000 depth:63.500000 table:54.000000 price:608.000000 x:4.300000 z:2.720000 cut_Very_Good color_H clarity_VS2 62 | 1.000000 |A carat:0.260000 depth:60.900000 table:57.000000 price:580.000000 x:4.130000 z:2.510000 cut_Ideal color_F clarity_VS1 63 | -1.000000 |A carat:0.690000 depth:57.800000 table:66.000000 price:2070.000000 x:5.900000 z:3.390000 cut_Fair color_G clarity_VS1 64 | -1.000000 |A carat:1.100000 depth:61.600000 table:56.000000 price:9051.000000 x:6.620000 z:4.100000 cut_Ideal color_G clarity_VVS2 65 | -1.000000 |A carat:0.710000 depth:58.300000 table:61.000000 price:2131.000000 x:5.840000 z:3.390000 cut_Good color_H clarity_SI1 66 | 1.000000 |A carat:0.270000 depth:61.000000 table:61.000000 price:544.000000 x:4.140000 z:2.530000 cut_Very_Good color_D clarity_VVS1 67 | 1.000000 |A carat:0.340000 depth:63.900000 table:56.000000 price:765.000000 x:4.480000 z:2.850000 cut_Good color_E clarity_SI1 68 | -1.000000 |A carat:1.180000 depth:61.600000 table:56.000000 price:4229.000000 x:6.820000 z:4.190000 cut_Ideal color_E clarity_I1 69 | 1.000000 |A carat:0.670000 depth:62.500000 table:59.000000 price:2211.000000 x:5.580000 z:3.470000 cut_Premium color_F clarity_VS2 70 | -1.000000 |A carat:1.580000 depth:62.300000 table:55.000000 price:9457.000000 x:7.420000 z:4.640000 cut_Ideal color_I clarity_SI1 71 | 1.000000 |A carat:0.240000 depth:60.600000 table:62.000000 price:478.000000 x:4.030000 z:2.450000 cut_Very_Good 
color_E clarity_VVS2 72 | -1.000000 |A carat:0.920000 depth:58.500000 table:57.000000 price:2947.000000 x:6.370000 z:3.720000 cut_Ideal color_H clarity_SI2 73 | 1.000000 |A carat:0.310000 depth:61.900000 table:58.000000 price:625.000000 x:4.300000 z:2.670000 cut_Very_Good color_H clarity_VVS2 74 | -1.000000 |A carat:1.510000 depth:62.800000 table:59.000000 price:7553.000000 x:7.280000 z:4.550000 cut_Premium color_J clarity_VS2 75 | -1.000000 |A carat:1.220000 depth:62.300000 table:56.000000 price:10221.000000 x:6.840000 z:4.250000 cut_Ideal color_G clarity_VVS2 76 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:776.000000 x:4.330000 z:2.680000 cut_Ideal color_F clarity_VS2 77 | 1.000000 |A carat:0.380000 depth:61.900000 table:56.000000 price:1327.000000 x:4.670000 z:2.880000 cut_Ideal color_E clarity_VVS1 78 | -1.000000 |A carat:0.780000 depth:60.400000 table:58.000000 price:3531.000000 x:6.000000 z:3.610000 cut_Ideal color_F clarity_VS2 79 | 1.000000 |A carat:0.700000 depth:63.100000 table:58.000000 price:2643.000000 x:5.610000 z:3.550000 cut_Very_Good color_E clarity_SI1 80 | -1.000000 |A carat:1.020000 depth:63.900000 table:55.000000 price:4476.000000 x:6.370000 z:4.080000 cut_Good color_H clarity_SI1 81 | -1.000000 |A carat:0.840000 depth:62.800000 table:57.000000 price:2656.000000 x:5.990000 z:3.780000 cut_Very_Good color_F clarity_SI2 82 | 1.000000 |A carat:0.380000 depth:62.100000 table:55.000000 price:633.000000 x:4.620000 z:2.880000 cut_Very_Good color_D clarity_SI2 83 | 1.000000 |A carat:0.370000 depth:61.800000 table:54.000000 price:1082.000000 x:4.630000 z:2.850000 cut_Ideal color_E clarity_VVS2 84 | 1.000000 |A carat:0.310000 depth:63.800000 table:56.000000 price:489.000000 x:4.270000 z:2.740000 cut_Good color_I clarity_VS1 85 | -1.000000 |A carat:1.020000 depth:61.600000 table:56.000000 price:4547.000000 x:6.550000 z:3.990000 cut_Ideal color_E clarity_SI2 86 | -1.000000 |A carat:1.060000 depth:62.600000 table:58.000000 
price:5889.000000 x:6.540000 z:4.080000 cut_Premium color_H clarity_VS2 87 | -1.000000 |A carat:1.020000 depth:61.900000 table:53.000000 price:6169.000000 x:6.500000 z:4.020000 cut_Ideal color_G clarity_VS2 88 | -1.000000 |A carat:1.030000 depth:61.300000 table:57.000000 price:6981.000000 x:6.510000 z:4.000000 cut_Ideal color_G clarity_VS2 89 | -1.000000 |A carat:0.710000 depth:61.300000 table:56.000000 price:3406.000000 x:5.750000 z:3.530000 cut_Ideal color_D clarity_VS1 90 | 1.000000 |A carat:0.500000 depth:62.500000 table:58.000000 price:1746.000000 x:5.110000 z:3.180000 cut_Premium color_G clarity_VS1 91 | 1.000000 |A carat:0.320000 depth:61.900000 table:55.000000 price:915.000000 x:4.380000 z:2.730000 cut_Ideal color_F clarity_IF 92 | 1.000000 |A carat:0.310000 depth:60.500000 table:55.000000 price:877.000000 x:4.430000 z:2.670000 cut_Ideal color_D clarity_VS1 93 | -1.000000 |A carat:0.700000 depth:58.200000 table:59.000000 price:2513.000000 x:5.870000 z:3.440000 cut_Very_Good color_G clarity_VS2 94 | -1.000000 |A carat:1.080000 depth:62.600000 table:56.000000 price:4407.000000 x:6.550000 z:4.120000 cut_Very_Good color_F clarity_SI2 95 | -1.000000 |A carat:0.900000 depth:63.400000 table:57.000000 price:4447.000000 x:6.130000 z:3.880000 cut_Very_Good color_D clarity_SI1 96 | -1.000000 |A carat:1.060000 depth:63.100000 table:59.000000 price:6212.000000 x:6.450000 z:4.080000 cut_Good color_G clarity_VS2 97 | -1.000000 |A carat:1.560000 depth:61.300000 table:56.000000 price:14237.000000 x:7.480000 z:4.590000 cut_Ideal color_G clarity_VS2 98 | -1.000000 |A carat:1.000000 depth:59.500000 table:63.000000 price:4077.000000 x:6.470000 z:3.830000 cut_Very_Good color_E clarity_SI2 99 | 1.000000 |A carat:0.310000 depth:62.000000 table:55.200000 price:515.000000 x:4.320000 z:2.690000 cut_Ideal color_F clarity_SI1 100 | 1.000000 |A carat:0.700000 depth:64.700000 table:58.000000 price:2051.000000 x:5.590000 z:3.600000 cut_Fair color_H clarity_SI1 101 | 
-------------------------------------------------------------------------------- /inst/extdata/binary_valid.vw: -------------------------------------------------------------------------------- 1 | 1.000000 |A carat:0.330000 depth:62.200000 table:58.000000 price:854.000000 x:4.420000 z:2.740000 cut_Premium color_G clarity_VS1 2 | 1.000000 |A carat:0.330000 depth:61.600000 table:58.000000 price:854.000000 x:4.440000 z:2.720000 cut_Premium color_G clarity_VS1 3 | 1.000000 |A carat:0.330000 depth:60.100000 table:58.000000 price:854.000000 x:4.470000 z:2.680000 cut_Premium color_F clarity_VS2 4 | 1.000000 |A carat:0.330000 depth:62.300000 table:55.000000 price:854.000000 x:4.470000 z:2.770000 cut_Ideal color_F clarity_VS2 5 | 1.000000 |A carat:0.330000 depth:61.900000 table:57.000000 price:854.000000 x:4.420000 z:2.730000 cut_Ideal color_F clarity_VS2 6 | 1.000000 |A carat:0.330000 depth:61.900000 table:57.000000 price:854.000000 x:4.440000 z:2.740000 cut_Premium color_F clarity_VS2 7 | 1.000000 |A carat:0.390000 depth:61.700000 table:57.000000 price:854.000000 x:4.770000 z:2.920000 cut_Premium color_G clarity_SI1 8 | 1.000000 |A carat:0.330000 depth:61.600000 table:55.000000 price:854.000000 x:4.480000 z:2.750000 cut_Ideal color_F clarity_VS2 9 | 1.000000 |A carat:0.330000 depth:60.300000 table:55.000000 price:855.000000 x:4.520000 z:2.720000 cut_Ideal color_D clarity_VS2 10 | 1.000000 |A carat:0.390000 depth:62.600000 table:58.000000 price:855.000000 x:4.660000 z:2.930000 cut_Premium color_G clarity_VS1 11 | 1.000000 |A carat:0.390000 depth:62.100000 table:58.000000 price:855.000000 x:4.630000 z:2.880000 cut_Very_Good color_H clarity_VVS2 12 | 1.000000 |A carat:0.500000 depth:65.300000 table:56.000000 price:855.000000 x:4.910000 z:3.240000 cut_Good color_I clarity_SI2 13 | 1.000000 |A carat:0.380000 depth:62.400000 table:59.000000 price:855.000000 x:4.640000 z:2.880000 cut_Premium color_G clarity_SI1 14 | 1.000000 |A carat:0.400000 depth:61.400000 table:59.000000 
price:855.000000 x:4.750000 z:2.900000 cut_Premium color_E clarity_SI2 15 | 1.000000 |A carat:0.400000 depth:62.100000 table:60.000000 price:855.000000 x:4.750000 z:2.930000 cut_Premium color_D clarity_SI2 16 | 1.000000 |A carat:0.380000 depth:61.400000 table:61.000000 price:855.000000 x:4.660000 z:2.850000 cut_Premium color_G clarity_SI1 17 | 1.000000 |A carat:0.380000 depth:62.400000 table:57.000000 price:855.000000 x:4.650000 z:2.890000 cut_Ideal color_G clarity_SI1 18 | 1.000000 |A carat:0.380000 depth:60.900000 table:56.000000 price:855.000000 x:4.710000 z:2.860000 cut_Ideal color_G clarity_SI1 19 | 1.000000 |A carat:0.460000 depth:66.800000 table:55.000000 price:855.000000 x:4.820000 z:3.200000 cut_Fair color_I clarity_VS2 20 | 1.000000 |A carat:0.330000 depth:61.900000 table:56.000000 price:856.000000 x:4.440000 z:2.760000 cut_Very_Good color_D clarity_VVS2 21 | 1.000000 |A carat:0.530000 depth:61.900000 table:54.000000 price:856.000000 x:5.200000 z:3.230000 cut_Ideal color_J clarity_SI2 22 | 1.000000 |A carat:0.370000 depth:62.400000 table:56.000000 price:857.000000 x:4.560000 z:2.870000 cut_Ideal color_D clarity_VVS2 23 | 1.000000 |A carat:0.310000 depth:63.500000 table:55.000000 price:465.000000 x:4.330000 z:2.740000 cut_Very_Good color_J clarity_SI1 24 | 1.000000 |A carat:0.250000 depth:61.600000 table:56.000000 price:467.000000 x:4.070000 z:2.510000 cut_Very_Good color_H clarity_VVS1 25 | 1.000000 |A carat:0.280000 depth:61.900000 table:55.000000 price:467.000000 x:4.200000 z:2.610000 cut_Ideal color_H clarity_VS2 26 | 1.000000 |A carat:0.310000 depth:62.000000 table:56.100000 price:468.000000 x:4.310000 z:2.680000 cut_Ideal color_J clarity_VVS2 27 | 1.000000 |A carat:0.260000 depth:59.800000 table:59.000000 price:468.000000 x:4.120000 z:2.480000 cut_Premium color_H clarity_VVS1 28 | 1.000000 |A carat:0.260000 depth:63.300000 table:58.000000 price:468.000000 x:4.040000 z:2.570000 cut_Good color_H clarity_VVS2 29 | 1.000000 |A carat:0.300000 
depth:61.800000 table:55.000000 price:857.000000 x:4.320000 z:2.680000 cut_Ideal color_F clarity_VVS1 30 | 1.000000 |A carat:0.300000 depth:62.200000 table:53.000000 price:857.000000 x:4.300000 z:2.680000 cut_Ideal color_F clarity_VVS1 31 | 1.000000 |A carat:0.390000 depth:62.500000 table:54.000000 price:857.000000 x:4.670000 z:2.930000 cut_Ideal color_G clarity_VS2 32 | 1.000000 |A carat:0.300000 depth:60.900000 table:57.000000 price:857.000000 x:4.340000 z:2.650000 cut_Ideal color_D clarity_VS2 33 | 1.000000 |A carat:0.400000 depth:64.700000 table:58.000000 price:857.000000 x:4.670000 z:3.010000 cut_Fair color_F clarity_SI1 34 | 1.000000 |A carat:0.300000 depth:58.100000 table:61.000000 price:858.000000 x:4.390000 z:2.560000 cut_Very_Good color_F clarity_VVS1 35 | 1.000000 |A carat:0.400000 depth:62.900000 table:59.000000 price:858.000000 x:4.700000 z:2.970000 cut_Very_Good color_E clarity_VS2 36 | 1.000000 |A carat:0.360000 depth:62.400000 table:63.000000 price:858.000000 x:4.460000 z:2.800000 cut_Good color_F clarity_VVS1 37 | 1.000000 |A carat:0.290000 depth:62.700000 table:55.000000 price:858.000000 x:4.220000 z:2.640000 cut_Ideal color_E clarity_VVS1 38 | 1.000000 |A carat:0.430000 depth:62.100000 table:57.000000 price:858.000000 x:4.830000 z:2.980000 cut_Ideal color_H clarity_SI1 39 | 1.000000 |A carat:0.330000 depth:62.000000 table:56.000000 price:859.000000 x:4.410000 z:2.740000 cut_Very_Good color_F clarity_VVS1 40 | 1.000000 |A carat:0.350000 depth:61.800000 table:57.000000 price:859.000000 x:4.500000 z:2.790000 cut_Very_Good color_G clarity_VVS1 41 | 1.000000 |A carat:0.350000 depth:61.700000 table:54.000000 price:859.000000 x:4.530000 z:2.800000 cut_Ideal color_H clarity_IF 42 | 1.000000 |A carat:0.390000 depth:59.100000 table:59.000000 price:860.000000 x:4.810000 z:2.830000 cut_Premium color_E clarity_SI1 43 | 1.000000 |A carat:0.400000 depth:62.900000 table:54.800000 price:861.000000 x:4.710000 z:2.980000 cut_Very_Good color_G clarity_VS1 44 | 
1.000000 |A carat:0.410000 depth:62.200000 table:56.000000 price:861.000000 x:4.770000 z:2.960000 cut_Ideal color_G clarity_SI1 45 | 1.000000 |A carat:0.300000 depth:62.100000 table:57.000000 price:862.000000 x:4.270000 z:2.660000 cut_Ideal color_E clarity_VVS1 46 | 1.000000 |A carat:0.300000 depth:62.300000 table:55.000000 price:862.000000 x:4.310000 z:2.690000 cut_Ideal color_E clarity_VVS1 47 | 1.000000 |A carat:0.320000 depth:61.500000 table:56.000000 price:862.000000 x:4.410000 z:2.720000 cut_Ideal color_D clarity_VS1 48 | 1.000000 |A carat:0.420000 depth:62.000000 table:59.000000 price:862.000000 x:4.830000 z:2.980000 cut_Premium color_D clarity_SI2 49 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:863.000000 x:4.310000 z:2.680000 cut_Very_Good color_D clarity_VS1 50 | 1.000000 |A carat:0.410000 depth:63.200000 table:57.000000 price:863.000000 x:4.740000 z:3.000000 cut_Good color_F clarity_VS2 51 | 1.000000 |A carat:0.410000 depth:61.100000 table:56.000000 price:863.000000 x:4.800000 z:2.940000 cut_Ideal color_F clarity_VS2 52 | 1.000000 |A carat:0.410000 depth:62.300000 table:53.000000 price:863.000000 x:4.740000 z:2.960000 cut_Ideal color_E clarity_VS1 53 | 1.000000 |A carat:0.410000 depth:61.600000 table:59.000000 price:863.000000 x:4.740000 z:2.930000 cut_Premium color_F clarity_VS2 54 | 1.000000 |A carat:0.410000 depth:62.300000 table:57.000000 price:863.000000 x:4.750000 z:2.970000 cut_Ideal color_F clarity_VS2 55 | 1.000000 |A carat:0.410000 depth:60.000000 table:56.000000 price:863.000000 x:4.820000 z:2.910000 cut_Ideal color_F clarity_VS2 56 | 1.000000 |A carat:0.410000 depth:62.600000 table:58.000000 price:863.000000 x:4.750000 z:2.990000 cut_Very_Good color_F clarity_VS2 57 | 1.000000 |A carat:0.410000 depth:59.800000 table:61.000000 price:863.000000 x:4.790000 z:2.880000 cut_Very_Good color_I clarity_VVS1 58 | 1.000000 |A carat:0.410000 depth:62.600000 table:57.000000 price:863.000000 x:4.710000 z:2.960000 cut_Very_Good color_F 
clarity_VS2 59 | 1.000000 |A carat:0.410000 depth:60.300000 table:60.000000 price:863.000000 x:4.810000 z:2.910000 cut_Premium color_F clarity_VS2 60 | 1.000000 |A carat:0.300000 depth:60.500000 table:56.000000 price:863.000000 x:4.340000 z:2.660000 cut_Ideal color_F clarity_VVS1 61 | 1.000000 |A carat:0.310000 depth:61.100000 table:56.000000 price:863.000000 x:4.360000 z:2.670000 cut_Ideal color_E clarity_VVS1 62 | 1.000000 |A carat:0.340000 depth:61.600000 table:54.000000 price:863.000000 x:4.490000 z:2.780000 cut_Ideal color_H clarity_IF 63 | 1.000000 |A carat:0.340000 depth:61.800000 table:54.000000 price:863.000000 x:4.510000 z:2.800000 cut_Ideal color_H clarity_IF 64 | 1.000000 |A carat:0.340000 depth:61.700000 table:55.000000 price:863.000000 x:4.500000 z:2.790000 cut_Ideal color_H clarity_IF 65 | 1.000000 |A carat:0.300000 depth:61.300000 table:55.000000 price:863.000000 x:4.330000 z:2.660000 cut_Ideal color_G clarity_IF 66 | 1.000000 |A carat:0.300000 depth:62.300000 table:56.000000 price:863.000000 x:4.310000 z:2.690000 cut_Ideal color_G clarity_IF 67 | 1.000000 |A carat:0.300000 depth:62.400000 table:54.000000 price:863.000000 x:4.310000 z:2.700000 cut_Ideal color_G clarity_IF 68 | 1.000000 |A carat:0.300000 depth:60.700000 table:57.000000 price:863.000000 x:4.340000 z:2.650000 cut_Ideal color_G clarity_IF 69 | 1.000000 |A carat:0.300000 depth:62.000000 table:55.000000 price:863.000000 x:4.330000 z:2.690000 cut_Ideal color_G clarity_IF 70 | 1.000000 |A carat:0.300000 depth:61.200000 table:57.000000 price:863.000000 x:4.350000 z:2.670000 cut_Ideal color_G clarity_IF 71 | 1.000000 |A carat:0.300000 depth:62.100000 table:55.000000 price:863.000000 x:4.320000 z:2.690000 cut_Ideal color_G clarity_IF 72 | 1.000000 |A carat:0.300000 depth:60.900000 table:58.000000 price:863.000000 x:4.320000 z:2.640000 cut_Ideal color_G clarity_IF 73 | 1.000000 |A carat:0.300000 depth:61.600000 table:56.000000 price:863.000000 x:4.340000 z:2.680000 cut_Ideal color_G clarity_IF 
74 | 1.000000 |A carat:0.300000 depth:61.800000 table:55.000000 price:863.000000 x:4.320000 z:2.680000 cut_Ideal color_G clarity_IF 75 | 1.000000 |A carat:0.300000 depth:60.900000 table:56.000000 price:863.000000 x:4.340000 z:2.650000 cut_Ideal color_G clarity_IF 76 | 1.000000 |A carat:0.300000 depth:62.300000 table:56.000000 price:863.000000 x:4.290000 z:2.680000 cut_Ideal color_G clarity_IF 77 | 1.000000 |A carat:0.500000 depth:62.400000 table:61.000000 price:863.000000 x:5.020000 z:3.120000 cut_Premium color_G clarity_SI2 78 | 1.000000 |A carat:0.320000 depth:62.400000 table:56.000000 price:864.000000 x:4.390000 z:2.750000 cut_Ideal color_E clarity_VVS2 79 | 1.000000 |A carat:0.310000 depth:60.900000 table:55.000000 price:864.000000 x:4.420000 z:2.700000 cut_Ideal color_E clarity_VVS2 80 | 1.000000 |A carat:0.310000 depth:62.000000 table:54.000000 price:864.000000 x:4.380000 z:2.720000 cut_Ideal color_E clarity_VVS2 81 | 1.000000 |A carat:0.320000 depth:61.900000 table:59.000000 price:864.000000 x:4.390000 z:2.710000 cut_Premium color_I clarity_IF 82 | 1.000000 |A carat:0.360000 depth:62.300000 table:56.000000 price:865.000000 x:4.550000 z:2.850000 cut_Very_Good color_H clarity_IF 83 | 1.000000 |A carat:0.340000 depth:60.900000 table:56.000000 price:865.000000 x:4.510000 z:2.760000 cut_Ideal color_D clarity_VS1 84 | 1.000000 |A carat:0.420000 depth:60.500000 table:57.000000 price:865.000000 x:4.840000 z:2.940000 cut_Ideal color_I clarity_VVS1 85 | 1.000000 |A carat:0.310000 depth:61.500000 table:56.000000 price:865.000000 x:4.360000 z:2.690000 cut_Ideal color_E clarity_VVS1 86 | 1.000000 |A carat:0.310000 depth:61.300000 table:56.000000 price:865.000000 x:4.380000 z:2.690000 cut_Ideal color_E clarity_VVS1 87 | 1.000000 |A carat:0.310000 depth:60.200000 table:61.000000 price:865.000000 x:4.400000 z:2.660000 cut_Ideal color_E clarity_VVS1 88 | 1.000000 |A carat:0.380000 depth:61.300000 table:56.000000 price:865.000000 x:4.670000 z:2.870000 cut_Ideal color_F 
clarity_VS1 89 | 1.000000 |A carat:0.410000 depth:59.700000 table:58.000000 price:866.000000 x:4.790000 z:2.870000 cut_Very_Good color_E clarity_VS2 90 | 1.000000 |A carat:0.380000 depth:59.200000 table:60.000000 price:866.000000 x:4.730000 z:2.810000 cut_Very_Good color_E clarity_VS2 91 | 1.000000 |A carat:0.380000 depth:62.200000 table:59.000000 price:866.000000 x:4.630000 z:2.890000 cut_Premium color_E clarity_VS2 92 | 1.000000 |A carat:0.380000 depth:60.500000 table:58.000000 price:866.000000 x:4.660000 z:2.830000 cut_Very_Good color_E clarity_VS2 93 | 1.000000 |A carat:0.380000 depth:60.100000 table:56.000000 price:866.000000 x:4.690000 z:2.830000 cut_Ideal color_E clarity_VS2 94 | 1.000000 |A carat:0.380000 depth:61.800000 table:58.000000 price:866.000000 x:4.610000 z:2.860000 cut_Premium color_E clarity_VS2 95 | 1.000000 |A carat:0.330000 depth:60.600000 table:58.000000 price:866.000000 x:4.490000 z:2.710000 cut_Premium color_E clarity_VS2 96 | 1.000000 |A carat:0.430000 depth:60.800000 table:59.000000 price:867.000000 x:4.860000 z:2.970000 cut_Premium color_G clarity_VS2 97 | 1.000000 |A carat:0.430000 depth:59.100000 table:60.000000 price:867.000000 x:4.880000 z:2.900000 cut_Very_Good color_G clarity_VS2 98 | 1.000000 |A carat:0.430000 depth:62.300000 table:58.000000 price:867.000000 x:4.750000 z:2.980000 cut_Premium color_H clarity_VS1 99 | 1.000000 |A carat:0.430000 depth:61.100000 table:59.000000 price:867.000000 x:4.830000 z:2.960000 cut_Premium color_G clarity_VS2 100 | 1.000000 |A carat:0.320000 depth:61.500000 table:56.000000 price:867.000000 x:4.410000 z:2.720000 cut_Ideal color_F clarity_VVS1 101 | -------------------------------------------------------------------------------- /inst/extdata/ref_print.out: -------------------------------------------------------------------------------- 1 | Vowpal Wabbit model 2 | Working directory: . 
3 | Model file: ./pk_mdl.vw 4 | Learning algorithm: sgd 5 | General parameters: 6 | random_seed : 0 7 | ring_size : Not defined 8 | holdout_off : FALSE 9 | holdout_period : 10 10 | holdout_after : 0 11 | early_terminate : 3 12 | loss_function : Not defined 13 | link : Not defined 14 | quantile_tau : 0.5 15 | Feature parameters: 16 | bit_precision : 18 17 | quadratic : Not defined 18 | cubic : Not defined 19 | interactions : Not defined 20 | permutations : FALSE 21 | leave_duplicate_interactions : FALSE 22 | noconstant : FALSE 23 | feature_limit : Not defined 24 | ngram : Not defined 25 | skips : Not defined 26 | hash : Not defined 27 | affix : Not defined 28 | spelling : Not defined 29 | interact : Not defined 30 | Learning algorithms / Reductions: 31 | boosting : 32 | num_learners : 10 33 | gamma : 0.1 34 | alg : BBM 35 | Optimization parameters: 36 | adaptive : TRUE 37 | normalized : TRUE 38 | invariant : TRUE 39 | adax : FALSE 40 | sparse_l2 : 0 41 | l1_state : 0 42 | l2_state : 1 43 | learning_rate : 0.5 44 | initial_pass_length : Not defined 45 | l1 : 0 46 | l2 : 0 47 | no_bias_regularization : Not defined 48 | feature_mask : Not defined 49 | decay_learning_rate : 1 50 | initial_t : 0 51 | power_t : 0.5 52 | initial_weight : 0 53 | random_weights : off 54 | normal_weights : off 55 | truncated_normal_weights : off 56 | sparse_weights : FALSE 57 | input_feature_regularizer : Not defined 58 | Model evaluation. Training: 59 | num_examples : 100 60 | weighted_example_sum : 100 61 | weighted_label_sum : -10 62 | avg_loss : 0.32 63 | best_const : -0.1 64 | best_const_loss : 0.99 65 | total_feature : 1000 66 | Model evaluation. 
Testing: 67 | num_examples : 100 68 | weighted_example_sum : 100 69 | weighted_label_sum : 100 70 | avg_loss : 0.12 71 | best_const : 1 72 | best_const_loss : 0 73 | total_feature : 1000 74 | -------------------------------------------------------------------------------- /man/add_option.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{add_option} 4 | \alias{add_option} 5 | \title{Add option to the model} 6 | \usage{ 7 | add_option(vwmodel, option = c("binary", "oaa", "ect", "csoaa", "wap", 8 | "log_multi", "recall_tree", "lda", "multilabel_oaa", "classweight", 9 | "new_mf", "lrq", "stage_poly", "bootstrap", "autolink", "replay", 10 | "explore_eval", "cb", "cb_explore", "cbify", "multiworld_test_check", 11 | "nn", "topk", "search", "boosting", "marginal"), ...) 12 | } 13 | \arguments{ 14 | \item{vwmodel}{[vw] Model of vw class} 15 | 16 | \item{option}{[string] Name of an option} 17 | 18 | \item{...}{Additional options for a learning algorithm / reduction} 19 | } 20 | \description{ 21 | Add a learning algorithm / reduction to the option stack inside model 22 | } 23 | -------------------------------------------------------------------------------- /man/df2vw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{df2vw} 4 | \alias{df2vw} 5 | \title{Create a VW data file from a R data.frame object} 6 | \usage{ 7 | df2vw(data, file_path, namespaces = NULL, keep_space = NULL, 8 | fixed = NULL, targets = NULL, probabilities = NULL, 9 | weight = NULL, base = NULL, tag = NULL, multiline = NULL, 10 | append = FALSE) 11 | } 12 | \arguments{ 13 | \item{data}{[data.frame] data.frame object to be converted} 14 | 15 | \item{file_path}{[string] file name of the resulting data in 16 | VW-friendly format} 
17 | 18 | \item{namespaces}{[list or yaml file] name of each namespace and 19 | each variable for each namespace can be a R list, or a YAML 20 | file example namespace with the IRIS database: namespaces = 21 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 22 | 'Petal.Width') this creates 2 namespaces (sepal 23 | and petal) containing the features defined by elements of these lists.} 24 | 25 | \item{keep_space}{[string vector] keep spaces for these features 26 | Example:"FERRARI 4Si" 27 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 28 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 29 | 30 | \item{fixed}{[string vector] fixed parsing for these features 31 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 32 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 33 | but should be used carefully, because special characters can ruin final VW format file.} 34 | 35 | \item{targets}{[string or string vector] 36 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 37 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 38 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 39 | 40 | \item{probabilities}{[string vector] vectors with action probabilities for Contextual Bandit algorithm.} 41 | 42 | \item{weight}{[string] weight (importance) of each line of the dataset.} 43 | 44 | \item{base}{[string] base of each line of the dataset.
Used for residual regression.} 45 | 46 | \item{tag}{[string] tag of each line of the dataset.} 47 | 48 | \item{multiline}{[integer] number of labels (separate lines) for multilines example} 49 | 50 | \item{append}{[bool] data to be appended to the result file} 51 | } 52 | \description{ 53 | Create a VW data file from a R data.frame object 54 | } 55 | -------------------------------------------------------------------------------- /man/print.vw.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{print.vw} 4 | \alias{print.vw} 5 | \title{Print VW model} 6 | \usage{ 7 | \method{print}{vw}(x, ...) 8 | } 9 | \arguments{ 10 | \item{x}{[vw] Model of vw class} 11 | 12 | \item{...}{Not used currently} 13 | } 14 | \description{ 15 | Print information about Vowpal Wabbit model 16 | } 17 | \examples{ 18 | vwmodel <- vwsetup() 19 | print(vwmodel) 20 | 21 | } 22 | -------------------------------------------------------------------------------- /man/rvwgsoc-package.Rd: -------------------------------------------------------------------------------- 1 | \name{rvw-package} 2 | \alias{rvw-package} 3 | \alias{rvw} 4 | \docType{package} 5 | \title{ 6 | R interface for Vowpal Wabbit 7 | } 8 | \description{ 9 | R interface for Vowpal Wabbit using Rcpp and libvw for GSoC 2018. 10 | } 11 | \details{ 12 | This section should provide a more detailed overview of how to use the 13 | package, including the most important functions. 14 | } 15 | \author{ 16 | Ivan Pavlov, Dirk Eddelbuettel, James J Balamuta 17 | 18 | Maintainer: Ivan Pavlov 19 | } 20 | \references{ 21 | This optional section can contain literature or other references for 22 | background information. 
23 | } 24 | \keyword{ package } 25 | \seealso{ 26 | Optional links to other man pages 27 | } 28 | \examples{ 29 | \dontrun{ 30 | ## Optional simple examples of the most important functions 31 | ## These can be in \dontrun{} and \donttest{} blocks. 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /man/vwaudit.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{vwaudit} 4 | \alias{vwaudit} 5 | \title{Audit Vowpal Wabbit model} 6 | \usage{ 7 | vwaudit(vwmodel, quiet = FALSE) 8 | } 9 | \arguments{ 10 | \item{vwmodel}{Model of vw class to train} 11 | 12 | \item{quiet}{[bool] Do not print anything to the console.} 13 | } 14 | \value{ 15 | Data.frame containing feature names, feature hashes and model values 16 | } 17 | \description{ 18 | Get feature names and their model values. 19 | } 20 | \examples{ 21 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 22 | test_vwmodel <- vwsetup() 23 | vwtrain(test_vwmodel, data = ext_train_data) 24 | vwaudit(test_vwmodel) 25 | } 26 | -------------------------------------------------------------------------------- /man/vwparams.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{vwparams} 4 | \alias{vwparams} 5 | \alias{vwparams<-} 6 | \title{Access and modify parameters of VW model} 7 | \usage{ 8 | vwparams(vwmodel, name) 9 | 10 | vwparams(vwmodel, name) <- value 11 | } 12 | \arguments{ 13 | \item{vwmodel}{[vw] Model of vw class} 14 | 15 | \item{name}{[string] Name of VW parameter} 16 | 17 | \item{value}{[string/int/real/bool] Replacement value of a parameter} 18 | } 19 | \value{ 20 | Value of a parameter 21 | } 22 | \description{ 23 | These functions allow access to VW model 
parameters by name and correctly modify them 24 | } 25 | \examples{ 26 | vwmodel <- vwsetup() 27 | # Access parameter 28 | vwparams(vwmodel, "bit_precision") 29 | # Modify parameter 30 | vwparams(vwmodel, "bit_precision") <- 25 31 | 32 | } 33 | -------------------------------------------------------------------------------- /man/vwsetup.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/functions.R 3 | \name{vwsetup} 4 | \alias{vwsetup} 5 | \title{Create Vowpal Wabbit model, setup model parameters and data} 6 | \usage{ 7 | vwsetup(algorithm = c("sgd", "bfgs", "ftrl", "pistol", "ksvm", 8 | "OjaNewton", "svrg"), general_params = list(), 9 | feature_params = list(), optimization_params = list(), 10 | dir = tempdir(), model = NULL, params_str = NULL, option = c("", 11 | "binary", "oaa", "ect", "csoaa", "wap", "log_multi", "recall_tree", 12 | "lda", "multilabel_oaa", "classweight", "new_mf", "lrq", "stage_poly", 13 | "bootstrap", "autolink", "replay", "explore_eval", "cb", "cb_explore", 14 | "cbify", "multiworld_test_check", "nn", "topk", "search", "boosting", 15 | "marginal"), ...) 
16 | } 17 | \arguments{ 18 | \item{algorithm}{[string] Optimization algorithm 19 | \itemize{ 20 | \item \code{sgd} - adaptive, normalized, invariant stochastic gradient descent 21 | \item \code{bfgs} - Limited-memory Broyden-Fletcher-Goldfarb-Shanno optimization algorithm 22 | \item \code{ftrl} - FTRL: Follow the Regularized Leader optimization algorithm 23 | \item \code{pistol} - FTRL: Parameter-free Stochastic Learning 24 | \item \code{ksvm} - Kernel svm 25 | \item \code{OjaNewton} - Online Newton with Oja's Sketch 26 | \item \code{svrg} - Stochastic Variance Reduced Gradient 27 | }} 28 | 29 | \item{general_params}{List of parameters: 30 | \itemize{ 31 | \item \code{random_seed} [int] - Seed random number generator (default: 0) 32 | \item \code{ring_size} [int] - Size of example ring 33 | \item \code{holdout_off} [bool] - No holdout data in multiple passes (default: FALSE) 34 | \item \code{holdout_period} [int] - Holdout period for test only (default: 10) 35 | \item \code{holdout_after} [int] - Holdout after n training examples, default off (disables holdout_period) (default: 0) 36 | \item \code{early_terminate} [int] - Specify the number of passes tolerated when holdout loss doesn't decrease before early termination (default: 3) 37 | \item \code{loss_function} [string] - Specify the loss function to be used, uses squared by default. Currently available ones are: squared, classic, hinge, logistic, quantile and poisson. (default: squared) 38 | \item \code{link} [string] - Specify the link function: identity, logistic, glf1 or poisson. (default: identity) 39 | \item \code{quantile_tau} [real] - Parameter "tau" associated with Quantile loss. 
(default: 0.5) 40 | }} 41 | 42 | \item{feature_params}{List of parameters: 43 | More information about "interactions" option (also "quadratic", "cubic") available here \url{https://github.com/VowpalWabbit/vowpal_wabbit/wiki/Command-line-arguments#example-manipulation-options} 44 | \itemize{ 45 | \item \code{bit_precision} [int] - Number of bits in the feature table (default: 18) 46 | \item \code{quadratic} [string] - Create and use quadratic features (Specify 2 namespaces) 47 | \item \code{cubic} [string] - Create and use cubic features (Specify 3 namespaces) 48 | \item \code{interactions} [string] - Create feature interactions of any level between namespaces (Specify several namespaces) 49 | \item \code{permutations} [bool] - Use permutations instead of combinations for feature interactions of same namespace (default: FALSE) 50 | \item \code{leave_duplicate_interactions} [bool] - Don't remove interactions with duplicate combinations of namespaces. For ex. this is a duplicate: 'quadratic="ab", quadratic="ba"' and a lot more in 'quadratic="::"'. (default: FALSE) 51 | \item \code{noconstant} [bool] - Don't add a constant feature (default: FALSE) 52 | \item \code{feature_limit} [string] - limit to N features. To apply to a single namespace 'foo', arg should be "fN" 53 | \item \code{ngram} [string] - Generate N grams. To generate N grams for a single namespace 'foo', arg should be "fN". 54 | \item \code{skips} [string] - Generate skips in N grams. This in conjunction with the ngram tag can be used to generate generalized n-skip-k-gram. To generate n-skips for a single namespace 'foo', arg should be "fN". 55 | \item \code{hash} [string] - How to hash the features. 
Available options: "strings", "all" (default: "strings") 56 | \item \code{affix} [string] - Generate prefixes/suffixes of features; argument "+2a,-3b,+1" means generate 2-char prefixes for namespace a, 3-char suffixes for b and 1 char prefixes for default namespace 57 | \item \code{spelling} [string] - Compute spelling features for a given namespace (use '_' for default namespace) 58 | \item \code{interact} [string] - Put weights on feature products from two namespaces 59 | }} 60 | 61 | \item{optimization_params}{List of parameters: 62 | \itemize{ 63 | \item \code{learning_rate} [real] - Set initial learning Rate (default: 0.5) 64 | \item \code{initial_pass_length} [int] - Initial number of examples per pass 65 | \item \code{l1} [real] - L1 regularization (default: 0) 66 | \item \code{l2} [real] - L2 regularization (default: 0) 67 | \item \code{no_bias_regularization} [string] - no bias in regularization (Available options: "on", "off") 68 | \item \code{feature_mask} [string] - Use existing regressor to determine which parameters may be updated. If no initial_regressor given, also used for initial weights. 69 | \item \code{decay_learning_rate} [real] - Set Decay factor for learning_rate between passes (default: 1) 70 | \item \code{initial_t} [real] - initial t value (default: 0) 71 | \item \code{power_t} [real] - t power value (default: 0.5) 72 | \item \code{initial_weight} [int] - Set all weights to an initial value of arg (default: 0) 73 | \item \code{random_weights} [string] - Make initial weights random (Available options: "on", "off") (default: "off") 74 | \item \code{normal_weights} [string] - Make initial weights normal (Available options: "on", "off") (default: "off") 75 | \item \code{truncated_normal_weights} [string] - Make initial weights truncated normal (Available options: "on", "off") (default: "off") 76 | \item \code{sparse_weights} [bool] - Use a sparse datastructure for weights. 
77 | \item \code{input_feature_regularizer} [string] - Per feature regularization input file. 78 | } 79 | Additional parameters depending on \code{algorithm} choice: 80 | \itemize{ 81 | \item \code{sgd}: 82 | \itemize{ 83 | \item \code{adaptive} [bool] - Use adaptive, individual learning rates (default: TRUE) 84 | \item \code{normalized} [bool] - Use per feature normalized updates (default: TRUE) 85 | \item \code{invariant} [bool] - Use safe/importance aware updates (default: TRUE) 86 | \item \code{adax} [bool] - Use adaptive learning rates with x^2 instead of g^2x^2 (default: FALSE) 87 | \item \code{sparse_l2} [real] - use per feature normalized updates (default: 0) 88 | \item \code{l1_state} [real] - use per feature normalized updates (default: 0) 89 | \item \code{l2_state} [real] - use per feature normalized updates (default: 1) 90 | } 91 | \item \code{bfgs}: 92 | \itemize{ 93 | \item \code{conjugate_gradient} [bool] - Use conjugate gradient based optimization (default: FALSE) 94 | \item \code{hessian_on} [bool] - Use second derivative in line search (default: FALSE) 95 | \item \code{mem} [int] - Memory in bfgs. (default: 15) 96 | \item \code{termination} [real] - Termination threshold. 
(default: 0.00100000005) 97 | } 98 | \item \code{ftrl}: 99 | \itemize{ 100 | \item \code{ftrl_alpha} [real] - Learning rate for FTRL optimization (default: 0.005) 101 | \item \code{ftrl_beta} [real] - FTRL beta parameter (default: 0.1) 102 | } 103 | \item \code{pistol}: 104 | \itemize{ 105 | \item \code{ftrl_alpha} [real] - Learning rate for FTRL optimization (default: 0.005) 106 | \item \code{ftrl_beta} [real] - FTRL beta parameter (default: 0.1) 107 | } 108 | \item \code{ksvm}: 109 | \itemize{ 110 | \item \code{reprocess} [int] - number of reprocess steps for LASVM (default: 1) 111 | \item \code{kernel} [string] - type of kernel (rbf or linear) (default: "linear") 112 | \item \code{bandwidth} [real] - bandwidth of rbf kernel (default: 1.0) 113 | \item \code{degree} [int] - degree of poly kernel (default: 2) 114 | \item \code{lambda} [real] - saving regularization for test time (default: -1) 115 | } 116 | \item \code{OjaNewton}: 117 | \itemize{ 118 | \item \code{sketch_size} [int] - size of sketch (default: 10) 119 | \item \code{epoch_size} [int] - size of epoch (default: 1) 120 | \item \code{alpha} [real] - multiplicative constant for identity (default: 1) 121 | \item \code{alpha_inverse} [real] - one over alpha, similar to learning rate 122 | \item \code{learning_rate_cnt} - constant for the learning rate 1/t (default: 2) 123 | \item \code{normalize} [string] - normalize the features or not (Available options: "on", "off") (default: "on") 124 | \item \code{random_init} [string] - randomize initialization of Oja or not (Available options: "on", "off") (default: "on") 125 | } 126 | \item \code{svrg}: 127 | \itemize{ 128 | \item \code{stage_size} [int] - Number of passes per SVRG stage (default: 1) 129 | } 130 | }} 131 | 132 | \item{dir}{[string] Working directory path, default is tempdir()} 133 | 134 | \item{model}{[string] File name for model weights or path to existng model file.} 135 | 136 | \item{params_str}{[string] Pass cmd line parameters directly, 
bypassing the default approach. 137 | For compatibility, parameters from vwtrain,vwtest, predict.vw can't be used here and functions add_option, vwparams aren't supported.} 138 | 139 | \item{option}{[string] Add Learning algorithm / reduction option: 140 | \itemize{ 141 | \item \code{binary} - Reports loss as binary classification with -1,1 labels 142 | \item \code{oaa} - One-against-all multiclass learning with labels 143 | \item \code{ect} - Error correcting tournament with labels 144 | \item \code{csoaa} - One-against-all multiclass learning with costs 145 | \item \code{wap} - Weighted all-pairs multiclass learning with costs 146 | \item \code{multilabel_oaa} - One-against-all multilabel with multiple labels 147 | \item \code{log_multi} - Online (decision) trees for classes 148 | \item \code{classweight} - Importance weight classes 149 | \item \code{lda} - Latent Dirichlet Allocation 150 | \item \code{recall_tree} - Use online tree for multiclass 151 | \item \code{new_mf} - Matrix factorization mode 152 | \item \code{lrq} - Low rank quadratic features 153 | \item \code{stage_poly} - Stagewise polynomial features 154 | \item \code{bootstrap} - bootstrap with K rounds by online importance resampling 155 | \item \code{autolink} - Create link function with polynomial N 156 | \item \code{replay} - Experience Replay 157 | \item \code{explore_eval} - Explore evaluation 158 | \item \code{cb} - Contextual bandit learning 159 | \item \code{cb_explore} - Contextual Bandit Exploration 160 | \item \code{cbify} - Convert multiclass on K classes into a contextual bandit problem 161 | \item \code{multiworld_test} - Multiworld Testing 162 | \item \code{nn} - Sigmoidal feedforward network 163 | \item \code{topk} - Top K recommendation 164 | \item \code{struct_search} - Search-based structured prediction (SEARN or DAgger) 165 | \item \code{boosting} - Online boosting with weak learners 166 | \item \code{marginal} - Substitute marginal label estimates for ids 167 | }} 168 | 169 | 
\item{...}{Additional options for a learning algorithm / reduction 170 | \itemize{ 171 | \item \code{oaa} or \code{ect}: 172 | \itemize{ 173 | \item \code{num_classes} [int] - Number of classes 174 | \item \code{oaa_subsample} [int] - Subsample this number of negative examples when learning 175 | } 176 | \item \code{multilabel_oaa}: 177 | \itemize{ 178 | \item \code{num_labels} [int] - Number of labels 179 | } 180 | \item \code{csoaa} or \code{wap}: 181 | \itemize{ 182 | \item \code{num_classes} [int] - Number of classes 183 | \item \code{csoaa_ldf} or \code{wap_ldf} - \code{singleline} (Default) or \code{multiline} label dependent features 184 | } 185 | \item \code{log_multi}: 186 | \itemize{ 187 | \item \code{num_classes} [int] - Number of classes 188 | \item \code{no_progress} [bool] - Disable progressive validation (default: FALSE) 189 | \item \code{swap_resistance} [int] - Higher = more resistance to swap, (default: 4) 190 | } 191 | \item \code{classweight}: 192 | \itemize{ 193 | \item \code{class_multiplier} [real] - importance weight multiplier for class 194 | } 195 | \item \code{recall_tree}: 196 | \itemize{ 197 | \item \code{num_classes} [int] - Number of classes 198 | \item \code{max_candidates} [int] - Maximum number of labels per leaf in the tree 199 | \item \code{bern_hyper} [real] - Recall tree depth penalty (default: 1) 200 | \item \code{max_depth} [int] - Maximum depth of the tree, (default: log_2(number of classes) ) 201 | \item \code{node_only} [string] - Only use node features, not full path (Available options: "on", "off") (default: "off") 202 | \item \code{randomized_routing} [string] - Randomized routing (Available options: "on", "off") (default: "off") 203 | } 204 | \item \code{lda}: 205 | \itemize{ 206 | \item \code{num_topics} [int] - Number of topics 207 | \item \code{lda_alpha} [real] - Prior on sparsity of per-document topic weights (default: 0.100000001) 208 | \item \code{lda_rho} [real] - Prior on sparsity of topic distributions 
(default: 0.100000001) 209 | \item \code{lda_D} [int] - Number of documents (default: 10000) 210 | \item \code{lda_epsilon} [real] - Loop convergence threshold (default: 0.00100000005) 211 | \item \code{math-mode} [string] - Math mode: simd, accuracy, fast-approx 212 | \item \code{minibatch} [int] - Minibatch size (default: 1) 213 | \item \code{metrics} [string] - Compute metrics (Available options: "on", "off") (default: "off") 214 | } 215 | \item \code{new_mf}: 216 | \itemize{ 217 | \item \code{rank} [int] - rank for matrix factorization 218 | } 219 | \item \code{lrq}: 220 | \itemize{ 221 | \item \code{features} [string] - low rank quadratic features 222 | \item \code{lrqdropout} [bool] - use dropout training for low rank quadratic features (default: FALSE) 223 | } 224 | \item \code{stage_poly}: 225 | \itemize{ 226 | \item \code{sched_exponent} [real] - exponent controlling quantity of included features (default: 1.0) 227 | \item \code{batch_sz} [int] - multiplier on batch size before including more features (default: 1000) 228 | \item \code{batch_sz_no_doubling} [bool] - batch_sz does not double (default: TRUE) 229 | } 230 | \item \code{bootstrap}: 231 | \itemize{ 232 | \item \code{num_rounds} [int] - number of rounds 233 | \item \code{bs_type} [string] - the bootstrap mode: 'mean' or 'vote' (default: "mean") 234 | } 235 | \item \code{autolink}: 236 | \itemize{ 237 | \item \code{degree} [int] - polynomial degree (default: 2) 238 | } 239 | \item \code{replay}: 240 | \itemize{ 241 | \item \code{level} [string] - Use experience replay at a specified level (b=classification/regression, m=multiclass, c=cost sensitive) 242 | \item \code{buffer} [int] - Buffer size (default: 100) 243 | \item \code{count} [int] - how many times (in expectation) should each example be played (default: 1 = permuting) 244 | } 245 | \item \code{explore_eval}: 246 | \itemize{ 247 | \item \code{multiplier} [real] - Multiplier used to make all rejection sample probabilities <= 1 248 | } 249 | 
\item \code{cb}: 250 | \itemize{ 251 | \item \code{num_costs} [int] - number of num_costs If costs=0, contextual bandit learning 252 | with multiline action dependent features (ADF) is triggered ("--cb_adf"). 253 | \item \code{cb_type} [string] - contextual bandit method to use in {ips,dm,dr, mtr (for ADF)} (default: "dr") 254 | \item \code{eval} [bool] - Evaluate a policy rather than optimizing (default: FALSE) 255 | \item \code{rank_all} [bool] - Return actions sorted by score order. (for ADF) (default: FALSE) 256 | \item \code{no_predict} [bool] - Do not do a prediction when training. (for ADF) (default: FALSE) 257 | } 258 | \item \code{cb_explore}: 259 | \itemize{ 260 | \item \code{num_actions} [bool] - number of actions in online explore-exploit for a action contextual bandit problem. 261 | If num_actions=0, online explore-exploit for a contextual bandit problem with multiline action dependent features (ADF) is triggered ("--cb_explore_adf"). 262 | \item \code{explore_type} [string] - Type of exploration to use: "epsilon" (epsilon-greedy exploration) (default), 263 | "first" (tau-first exploration), "bag" (bagging-based exploration), "cover" (Online cover based exploration), "softmax" (softmax exploration), 264 | "regcb" (RegCB-elim exploration), "regcbopt" (RegCB optimistic exploration). "softmax", "regcb" and "regcbopt" types are only avaliable for exploration with ADF. (default: "epsilon") 265 | \item \code{explore_arg} [real] - Parameter for exploration algorithm. Applicable for "epsilon", "first", "bag" and "cover" types of exploration. (default: 0.05) 266 | \item \code{psi} [real] - Disagreement parameter for "cover" algorithm. (default: 1) 267 | \item \code{nounif} [bool] - Do not explore uniformly on zero-probability actions in "cover" algorithm. (default: FALSE) 268 | \item \code{mellowness} [real] - "RegCB" mellowness parameter c_0. 
(default: 0.1) 269 | \item \code{greedify} [bool] - Always update first policy once in "bag" (default: FALSE) 270 | \item \code{lambda} [real] - Parameter for "softmax". (default: -1) 271 | \item \code{cb_min_cost} [real] - Lower bound on cost. (default: 0) For ADF only 272 | \item \code{cb_max_cost} [real] - Upper bound on cost. (default: 1) For ADF only 273 | \item \code{first_only} [bool] - Only explore the first action in a tie-breaking event. For ADF only (default: FALSE) 274 | } 275 | \item \code{cbify}: 276 | \itemize{ 277 | \item \code{num_classes} [int] - number of classes 278 | \item \code{cbify_cs} [bool] - consume cost-sensitive classification examples instead of multiclass (default: FALSE) 279 | \item \code{loss0} [real] - loss for correct label (default: 0) 280 | \item \code{loss1} [real] - loss for incorrect label (default: 1) 281 | } 282 | \item \code{multiworld_test}: 283 | \itemize{ 284 | \item \code{features} [string] - Evaluate features as policies 285 | \item \code{learn} [int] - Do Contextual Bandit learning on classes. 
286 | \item \code{num_classes} [bool] - Discard mwt policy features before learning (default: FALSE) 287 | } 288 | \item \code{nn}: 289 | \itemize{ 290 | \item \code{num_hidden} [int] - number of hidden units 291 | \item \code{inpass} [bool] - Train or test sigmoidal feedforward network with input passthrough (default: FALSE) 292 | \item \code{multitask} [bool] - Share hidden layer across all reduced tasks (default: FALSE) 293 | \item \code{dropout} [bool] - Train or test sigmoidal feedforward network using dropout (default: FALSE) 294 | \item \code{meanfield} [bool] - Train or test sigmoidal feedforward network using mean field (default: FALSE) 295 | } 296 | \item \code{topk}: 297 | \itemize{ 298 | \item \code{num_k} [int] - number of top k recommendations 299 | } 300 | \item \code{struct_search}: 301 | \itemize{ 302 | \item \code{id} [int] - maximum action id or 0 for LDF 303 | \item \code{search_task} [string] - search task: sequence, sequencespan, sequence_ctg, argmax, sequence_demoldf, multiclasstask, dep_parser, entity_relation, hook, graph 304 | \item \code{search_interpolation} [string] - at what level should interpolation happen? (data or policy) 305 | \item \code{search_rollout} [string] - how should rollouts be executed? (policy, oracle, mix_per_state, mix_per_roll, none) 306 | \item \code{search_rollin} [string] - how should past trajectories be generated? (policy, oracle, mix_per_state, mix_per_roll) 307 | \item \code{search_passes_per_policy} [int] - number of passes per policy (only valid for search_interpolation=policy). (default: 1) 308 | \item \code{search_beta} [real] - interpolation rate for policies (only valid for search_interpolation=policy). (default: 0.5) 309 | \item \code{search_alpha} [real] - annealed beta = 1-(1-alpha)^t (only valid for search_interpolation=data). 
(default: 1e-10) 310 | \item \code{search_total_nb_policies} [int] - if we are going to train the policies through multiple separate calls to vw, we need to specify this parameter and tell vw how many policies are eventually going to be trained 311 | \item \code{search_trained_nb_policies} [int] - the number of trained policies in a file 312 | \item \code{search_allowed_transitions} [string] - read file of allowed transitions. default: all transitions are allowed 313 | \item \code{search_subsample_time} [real] - instead of training at all timesteps, use a subset. if value in (0,1), train on a random v%. if v>=1, train on precisely v steps per example, if v<=-1, use active learning 314 | \item \code{search_neighbor_features} [string] - copy features from neighboring lines. argument looks like: '-1:a,+2' meaning copy previous line from namespace "a" and next line from namespace "unnamed", where ',' separates them 315 | \item \code{search_rollout_num_steps} [int] - how many calls of "loss" before we stop really predicting on rollouts and switch to oracle (default means "infinite") 316 | \item \code{search_history_length} [int] - some tasks allow you to specify how much history their depend on; specify that here. (default: 1) 317 | \item \code{search_no_caching} [bool] - turn off the built-in caching ability (makes things slower, but technically more safe) (default: FALSE) 318 | \item \code{search_xv} [bool] - train two separate policies, alternating prediction/learning. (default: FALSE) 319 | \item \code{search_perturb_oracle} [real] - perturb the oracle on rollin with this probability. (default: 0) 320 | \item \code{search_linear_ordering} [bool] - insist on generating examples in linear order. 
(default: FALSE and using hoopla permutation) 321 | \item \code{search_active_verify} [real] - verify that active learning is doing the right thing (arg = multiplier, should be = cost_range * range_c) 322 | \item \code{search_save_every_k_runs} [int] - save model every k runs 323 | } 324 | \item \code{boosting}: 325 | \itemize{ 326 | \item \code{num_learners} [int] - number of weak learners 327 | \item \code{gamma} [real] - weak learner's edge (=0.1), used only by online BBM (default: 0.100000001) 328 | \item \code{alg} - specify the boosting algorithm: BBM (default), logistic (AdaBoost.OL.W), adaptive (AdaBoost.OL) (default: "BBM") 329 | } 330 | \item \code{marginal}: 331 | \itemize{ 332 | \item \code{ids} [string] - Substitute marginal label estimates for ids 333 | \item \code{initial_denominator} [real] - Initial denominator (default: 1) 334 | \item \code{initial_numerator} [real] - Initial numerator (default: 0.5) 335 | \item \code{compete} [bool] - Enable competition with marginal features (default: FALSE) 336 | \item \code{update_before_learn} [string] - Update marginal values before learning (Available options: "on", "off") (default: "off") 337 | \item \code{unweighted_marginals} [string] - Ignore importance weights when computing marginals (Available options: "on", "off") (default: "off") 338 | \item \code{decay} [real] - Decay multiplier per event (1e-3 for example) (default=0) 339 | } 340 | }} 341 | } 342 | \value{ 343 | vwmodel list class 344 | } 345 | \description{ 346 | Sets up VW model together with parameters and data 347 | } 348 | \examples{ 349 | vwsetup( 350 | dir = tempdir(), 351 | model = "pk_mdl.vw", 352 | general_params = list(loss_function="logistic", link="logistic"), 353 | optimization_params = list(adaptive=FALSE), 354 | option = "binary" 355 | ) 356 | 357 | } 358 | -------------------------------------------------------------------------------- /man/vwtest.Rd: 
-------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R, R/functions.R 3 | \name{vwtest} 4 | \alias{vwtest} 5 | \alias{predict.vw} 6 | \title{Compute predictions using Vowpal Wabbit model} 7 | \usage{ 8 | vwtest(vwmodel, data, probs_path = "", full_probs = FALSE, 9 | readable_model = NULL, readable_model_path = "", quiet = FALSE, 10 | passes = 1L, cache = FALSE, raw = FALSE, progress = NULL, 11 | namespaces = NULL, keep_space = NULL, fixed = NULL, 12 | targets = NULL, probabilities = NULL, weight = NULL, base = NULL, 13 | tag = NULL, multiline = NULL) 14 | 15 | \method{predict}{vw}(object, data, probs_path = "", full_probs = FALSE, 16 | readable_model = NULL, quiet = FALSE, ...) 17 | } 18 | \arguments{ 19 | \item{vwmodel}{[vw] Model of vw class to train.} 20 | 21 | \item{data}{[string or data.frame] Path to training data in .vw plain text format or data.frame. 22 | If \code{[data.frame]} then will be parsed using \code{df2vw} function.} 23 | 24 | \item{probs_path}{[string] Path to file where to save predictions.} 25 | 26 | \item{full_probs}{[bool] Output full predictions in data.frame format. If not, force predictions into a single vector (default).} 27 | 28 | \item{readable_model}{[string] Print trained model in human readable format ("hashed") 29 | and also with human readable features ("inverted").} 30 | 31 | \item{readable_model_path}{[string] Path to file where to save readable model.} 32 | 33 | \item{quiet}{[bool] Do not print anything to the console.} 34 | 35 | \item{passes}{[int] Number of times the algorithm will cycle over the data (epochs).} 36 | 37 | \item{cache}{[bool] Use a cache for a data file.} 38 | 39 | \item{raw}{[bool] Output unnormalized predictions. Default is FALSE.} 40 | 41 | \item{progress}{[int/real] Progress update frequency. 
int: additive, real: multiplicative} 42 | 43 | \item{namespaces}{[list or yaml file] For \code{df2vw}. Name of each namespace and 44 | each variable for each namespace can be a R list, or a YAML 45 | file example namespace with the IRIS database: namespaces = 46 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 47 | 'Petal.Width') this creates 2 namespaces (sepal 48 | and petal) containing the features defined by elements of this lists.} 49 | 50 | \item{keep_space}{[string vector] For \code{df2vw}. Keep spaces for this features 51 | Example:"FERRARI 4Si" 52 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 53 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 54 | 55 | \item{fixed}{[string vector] fixed parsing for this features 56 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 57 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 58 | but should be used carefully, because special characters can ruin final VW format file.} 59 | 60 | \item{targets}{[string or string vector] For \code{df2vw}. 61 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 62 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 63 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 64 | 65 | \item{probabilities}{[string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm.} 66 | 67 | \item{weight}{[string] For \code{df2vw}. Weight (importance) of each line of the dataset.} 68 | 69 | \item{base}{[string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression.} 70 | 71 | \item{tag}{[string] For \code{df2vw}. 
Tag of each line of the dataset.} 72 | 73 | \item{multiline}{[integer] Number of labels (separate lines) for multilines example} 74 | 75 | \item{object}{Model of vw class to train for \code{predict.vw}} 76 | 77 | \item{...}{Parameters passed to \code{predict.vw}} 78 | } 79 | \value{ 80 | Numerical vector containing predictions 81 | } 82 | \description{ 83 | \code{vwtest} computes predictions using VW model from \code{\link{vwsetup}} 84 | \code{predict.vw} compute predictions using parser settings from \code{\link{vwtrain}} 85 | } 86 | \examples{ 87 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 88 | ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw") 89 | test_vwmodel <- vwsetup() 90 | vwtrain(test_vwmodel, data = ext_train_data) 91 | vwtest(test_vwmodel, data = ext_test_data) 92 | } 93 | -------------------------------------------------------------------------------- /man/vwtrain.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/RcppExports.R 3 | \name{vwtrain} 4 | \alias{vwtrain} 5 | \title{Train Vowpal Wabbit model} 6 | \usage{ 7 | vwtrain(vwmodel, data, readable_model = NULL, readable_model_path = "", 8 | quiet = FALSE, update_model = FALSE, passes = 1L, cache = FALSE, 9 | progress = NULL, namespaces = NULL, keep_space = NULL, 10 | fixed = NULL, targets = NULL, probabilities = NULL, 11 | weight = NULL, base = NULL, tag = NULL, multiline = NULL) 12 | } 13 | \arguments{ 14 | \item{vwmodel}{[vw] Model of vw class to train} 15 | 16 | \item{data}{[string or data.frame] Path to training data in .vw plain text format or data.frame. 
17 | If \code{[data.frame]} then will be parsed using \code{df2vw} function.} 18 | 19 | \item{readable_model}{[string] Print trained model in human readable format ("hashed") 20 | and also with human readable features ("inverted")} 21 | 22 | \item{readable_model_path}{[string] Path to file where to save readable model.} 23 | 24 | \item{quiet}{[logical] Do not print anything to the console} 25 | 26 | \item{update_model}{[logical] Update an existing model, when training with new data. \code{FALSE} by default.} 27 | 28 | \item{passes}{[int] Number of times the algorithm will cycle over the data (epochs).} 29 | 30 | \item{cache}{[bool] Use a cache for a data file.} 31 | 32 | \item{progress}{[int/real] Progress update frequency. int: additive, real: multiplicative} 33 | 34 | \item{namespaces}{[list or yaml file] For \code{df2vw}. Name of each namespace and 35 | each variable for each namespace can be a R list, or a YAML 36 | file example namespace with the IRIS database: namespaces = 37 | list(sepal = list('Sepal.Length', 'Sepal.Width'), petal = list('Petal.Length', 38 | 'Petal.Width') this creates 2 namespaces (sepal 39 | and petal) containing the features defined by elements of this lists.} 40 | 41 | \item{keep_space}{[string vector] For \code{df2vw}. Keep spaces for this features 42 | Example:"FERRARI 4Si" 43 | With \code{keep_space} will be "FERRARI 4Si" and will be treated as two features 44 | Without \code{keep_space} will be "FERRARI_4Si" and will be treated as one feature} 45 | 46 | \item{fixed}{[string vector] fixed parsing for this features 47 | Similar to \code{keep_space}, but parse features exactly without replacement of special characters ("(", ")", "|", ":", "'"). 48 | Can be used for LDA ("word_1:2 word_2:3" will stay the same), 49 | but should be used carefully, because special characters can ruin final VW format file.} 50 | 51 | \item{targets}{[string or string vector] For \code{df2vw}. 
52 | If \code{[string]} then will be treated as vector with real number labels for regular VW input format. 53 | If \code{[string vector]} then will be treated as vectors with class costs for wap and csoaa 54 | multi-class classification algorithms or as vectors with actions for Contextual Bandit algorithm.} 55 | 56 | \item{probabilities}{[string vector] For \code{df2vw}. Vectors with action probabilities for Contextual Bandit algorithm.} 57 | 58 | \item{weight}{[string] For \code{df2vw}. Weight (importance) of each line of the dataset.} 59 | 60 | \item{base}{[string] For \code{df2vw}. Base of each line of the dataset. Used for residual regression.} 61 | 62 | \item{tag}{[string] For \code{df2vw}. Tag of each line of the dataset.} 63 | 64 | \item{multiline}{[integer] Number of labels (separate lines) for multilines example} 65 | } 66 | \description{ 67 | vwtrain is an interface to train VW model from \code{\link{vwsetup}} 68 | } 69 | \examples{ 70 | ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw") 71 | test_vwmodel <- vwsetup() 72 | vwtrain(test_vwmodel, data = ext_train_data) 73 | } 74 | -------------------------------------------------------------------------------- /rvw.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageInstallArgs: --no-multiarch --with-keep.source --clean 17 | PackageCheckArgs: --as-cran 18 | PackageRoxygenize: rd 19 | -------------------------------------------------------------------------------- /src/Makevars.in: -------------------------------------------------------------------------------- 1 | PKG_LIBS = -lvw 2 | CXX_STD = CXX11 3 | 
-------------------------------------------------------------------------------- /src/RcppExports.cpp: -------------------------------------------------------------------------------- 1 | // Generated by using Rcpp::compileAttributes() -> do not edit by hand 2 | // Generator token: 10BE3573-1514-4C36-9D1C-5A225CD40393 3 | 4 | #include 5 | 6 | using namespace Rcpp; 7 | 8 | // get_vw_version 9 | std::string get_vw_version(); 10 | RcppExport SEXP _rvw_get_vw_version() { 11 | BEGIN_RCPP 12 | Rcpp::RObject rcpp_result_gen; 13 | Rcpp::RNGScope rcpp_rngScope_gen; 14 | rcpp_result_gen = Rcpp::wrap(get_vw_version()); 15 | return rcpp_result_gen; 16 | END_RCPP 17 | } 18 | // vwtrain 19 | void vwtrain(Rcpp::List& vwmodel, SEXP data, Rcpp::Nullable readable_model, std::string readable_model_path, bool quiet, bool update_model, int passes, bool cache, Rcpp::Nullable progress, Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, Rcpp::Nullable fixed, Rcpp::Nullable targets, Rcpp::Nullable probabilities, Rcpp::Nullable weight, Rcpp::Nullable base, Rcpp::Nullable tag, Rcpp::Nullable multiline); 20 | RcppExport SEXP _rvw_vwtrain(SEXP vwmodelSEXP, SEXP dataSEXP, SEXP readable_modelSEXP, SEXP readable_model_pathSEXP, SEXP quietSEXP, SEXP update_modelSEXP, SEXP passesSEXP, SEXP cacheSEXP, SEXP progressSEXP, SEXP namespacesSEXP, SEXP keep_spaceSEXP, SEXP fixedSEXP, SEXP targetsSEXP, SEXP probabilitiesSEXP, SEXP weightSEXP, SEXP baseSEXP, SEXP tagSEXP, SEXP multilineSEXP) { 21 | BEGIN_RCPP 22 | Rcpp::RNGScope rcpp_rngScope_gen; 23 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 24 | Rcpp::traits::input_parameter< SEXP >::type data(dataSEXP); 25 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type readable_model(readable_modelSEXP); 26 | Rcpp::traits::input_parameter< std::string >::type readable_model_path(readable_model_pathSEXP); 27 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 28 | Rcpp::traits::input_parameter< bool >::type 
update_model(update_modelSEXP); 29 | Rcpp::traits::input_parameter< int >::type passes(passesSEXP); 30 | Rcpp::traits::input_parameter< bool >::type cache(cacheSEXP); 31 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type progress(progressSEXP); 32 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type namespaces(namespacesSEXP); 33 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type keep_space(keep_spaceSEXP); 34 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type fixed(fixedSEXP); 35 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type targets(targetsSEXP); 36 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type probabilities(probabilitiesSEXP); 37 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type weight(weightSEXP); 38 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type base(baseSEXP); 39 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type tag(tagSEXP); 40 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type multiline(multilineSEXP); 41 | vwtrain(vwmodel, data, readable_model, readable_model_path, quiet, update_model, passes, cache, progress, namespaces, keep_space, fixed, targets, probabilities, weight, base, tag, multiline); 42 | return R_NilValue; 43 | END_RCPP 44 | } 45 | // vwtest 46 | SEXP vwtest(Rcpp::List& vwmodel, SEXP data, std::string probs_path, bool full_probs, Rcpp::Nullable readable_model, std::string readable_model_path, bool quiet, int passes, bool cache, bool raw, Rcpp::Nullable progress, Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, Rcpp::Nullable fixed, Rcpp::Nullable targets, Rcpp::Nullable probabilities, Rcpp::Nullable weight, Rcpp::Nullable base, Rcpp::Nullable tag, Rcpp::Nullable multiline); 47 | RcppExport SEXP _rvw_vwtest(SEXP vwmodelSEXP, SEXP dataSEXP, SEXP probs_pathSEXP, SEXP full_probsSEXP, SEXP readable_modelSEXP, SEXP readable_model_pathSEXP, SEXP quietSEXP, SEXP passesSEXP, SEXP cacheSEXP, SEXP rawSEXP, SEXP progressSEXP, SEXP namespacesSEXP, SEXP keep_spaceSEXP, SEXP fixedSEXP, SEXP 
targetsSEXP, SEXP probabilitiesSEXP, SEXP weightSEXP, SEXP baseSEXP, SEXP tagSEXP, SEXP multilineSEXP) { 48 | BEGIN_RCPP 49 | Rcpp::RObject rcpp_result_gen; 50 | Rcpp::RNGScope rcpp_rngScope_gen; 51 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 52 | Rcpp::traits::input_parameter< SEXP >::type data(dataSEXP); 53 | Rcpp::traits::input_parameter< std::string >::type probs_path(probs_pathSEXP); 54 | Rcpp::traits::input_parameter< bool >::type full_probs(full_probsSEXP); 55 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type readable_model(readable_modelSEXP); 56 | Rcpp::traits::input_parameter< std::string >::type readable_model_path(readable_model_pathSEXP); 57 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 58 | Rcpp::traits::input_parameter< int >::type passes(passesSEXP); 59 | Rcpp::traits::input_parameter< bool >::type cache(cacheSEXP); 60 | Rcpp::traits::input_parameter< bool >::type raw(rawSEXP); 61 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type progress(progressSEXP); 62 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type namespaces(namespacesSEXP); 63 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type keep_space(keep_spaceSEXP); 64 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type fixed(fixedSEXP); 65 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type targets(targetsSEXP); 66 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type probabilities(probabilitiesSEXP); 67 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type weight(weightSEXP); 68 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type base(baseSEXP); 69 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type tag(tagSEXP); 70 | Rcpp::traits::input_parameter< Rcpp::Nullable >::type multiline(multilineSEXP); 71 | rcpp_result_gen = Rcpp::wrap(vwtest(vwmodel, data, probs_path, full_probs, readable_model, readable_model_path, quiet, passes, cache, raw, progress, namespaces, keep_space, fixed, targets, probabilities, weight, 
base, tag, multiline)); 72 | return rcpp_result_gen; 73 | END_RCPP 74 | } 75 | // vwaudit 76 | Rcpp::DataFrame vwaudit(Rcpp::List& vwmodel, bool quiet); 77 | RcppExport SEXP _rvw_vwaudit(SEXP vwmodelSEXP, SEXP quietSEXP) { 78 | BEGIN_RCPP 79 | Rcpp::RObject rcpp_result_gen; 80 | Rcpp::RNGScope rcpp_rngScope_gen; 81 | Rcpp::traits::input_parameter< Rcpp::List& >::type vwmodel(vwmodelSEXP); 82 | Rcpp::traits::input_parameter< bool >::type quiet(quietSEXP); 83 | rcpp_result_gen = Rcpp::wrap(vwaudit(vwmodel, quiet)); 84 | return rcpp_result_gen; 85 | END_RCPP 86 | } 87 | 88 | static const R_CallMethodDef CallEntries[] = { 89 | {"_rvw_get_vw_version", (DL_FUNC) &_rvw_get_vw_version, 0}, 90 | {"_rvw_vwtrain", (DL_FUNC) &_rvw_vwtrain, 18}, 91 | {"_rvw_vwtest", (DL_FUNC) &_rvw_vwtest, 20}, 92 | {"_rvw_vwaudit", (DL_FUNC) &_rvw_vwaudit, 2}, 93 | {NULL, NULL, 0} 94 | }; 95 | 96 | RcppExport void R_init_rvw(DllInfo *dll) { 97 | R_registerRoutines(dll, NULL, CallEntries, NULL, NULL); 98 | R_useDynamicSymbols(dll, FALSE); 99 | } 100 | -------------------------------------------------------------------------------- /src/extra/array_parameters.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #ifndef _WIN32 5 | #include 6 | #endif 7 | 8 | // It appears that on OSX MAP_ANONYMOUS is mapped to MAP_ANON 9 | // https://github.com/leftmike/foment/issues/4 10 | #ifdef __APPLE__ 11 | #define MAP_ANONYMOUS MAP_ANON 12 | #endif 13 | 14 | #include "array_parameters_dense.h" 15 | 16 | class sparse_parameters; 17 | typedef std::unordered_map weight_map; 18 | 19 | template 20 | class sparse_iterator 21 | { 22 | private: 23 | weight_map::iterator _iter; 24 | uint32_t _stride; 25 | 26 | public: 27 | typedef std::forward_iterator_tag iterator_category; 28 | typedef T value_type; 29 | typedef ptrdiff_t difference_type; 30 | typedef T* pointer; 31 | typedef T& reference; 32 | 33 | sparse_iterator(weight_map::iterator& 
iter, uint32_t stride) 34 | : _iter(iter), _stride(stride) 35 | { } 36 | 37 | sparse_iterator& operator=(const sparse_iterator& other) 38 | { 39 | _iter = other._iter; 40 | _stride = other._stride; 41 | return *this; 42 | 43 | } 44 | uint64_t index() { return _iter->first; } 45 | 46 | T& operator*() { return *(_iter->second); } 47 | 48 | sparse_iterator& operator++() 49 | { 50 | _iter++; 51 | return *this; 52 | } 53 | 54 | bool operator==(const sparse_iterator& rhs) const { return _iter == rhs._iter; } 55 | bool operator!=(const sparse_iterator& rhs) const { return _iter != rhs._iter; } 56 | }; 57 | 58 | 59 | class sparse_parameters 60 | { 61 | private: 62 | weight_map _map; 63 | uint64_t _weight_mask; // (stride*(1 << num_bits) -1) 64 | uint32_t _stride_shift; 65 | bool _seeded; // whether the instance is sharing model state with others 66 | bool _delete; 67 | void* default_data; 68 | float* default_value; 69 | public: 70 | typedef sparse_iterator iterator; 71 | typedef sparse_iterator const_iterator; 72 | private: 73 | void(*fun)(const weight*, void*); 74 | public: 75 | 76 | sparse_parameters(size_t length, uint32_t stride_shift = 0) 77 | : _map(), 78 | _weight_mask((length << stride_shift) - 1), 79 | _stride_shift(stride_shift), 80 | _seeded(false), _delete(false), default_data(nullptr), 81 | fun(nullptr) 82 | { default_value = calloc_mergable_or_throw(stride());} 83 | 84 | sparse_parameters() 85 | : _map(), _weight_mask(0), _stride_shift(0), _seeded(false), _delete(false), default_data(nullptr), fun(nullptr) 86 | { default_value = calloc_mergable_or_throw(stride());} 87 | 88 | bool not_null() { return (_weight_mask > 0 && !_map.empty()); } 89 | 90 | sparse_parameters(const sparse_parameters &other) { shallow_copy(other); } 91 | sparse_parameters(sparse_parameters &&) = delete; 92 | 93 | weight* first() { throw 1; } //TODO: Throw better exceptions. Allreduce currently not supported in sparse. 
94 | 95 | //iterator with stride 96 | iterator begin() { weight_map::iterator i = _map.begin(); return iterator(i, stride()); } 97 | iterator end() { weight_map::iterator i = _map.end(); return iterator(i, stride()); } 98 | 99 | //const iterator 100 | const_iterator cbegin() { weight_map::iterator i = _map.begin(); return const_iterator(i, stride()); } 101 | const_iterator cend() { weight_map::iterator i = _map.begin(); return const_iterator(i, stride()); } 102 | 103 | inline weight& operator[](size_t i) 104 | { uint64_t index = i & _weight_mask; 105 | weight_map::iterator iter = _map.find(index); 106 | if (iter == _map.end()) 107 | { _map.insert(std::make_pair(index, calloc_mergable_or_throw(stride()))); 108 | iter = _map.find(index); 109 | if (fun != nullptr) 110 | fun(iter->second, default_data); 111 | } 112 | return *(iter->second); 113 | } 114 | 115 | inline const weight& operator[](size_t i) const 116 | { uint64_t index = i & _weight_mask; 117 | weight_map::const_iterator iter = _map.find(index); 118 | if (iter == _map.end()) 119 | return *default_value; 120 | return *(iter->second); 121 | } 122 | 123 | inline weight& strided_index(size_t index) { return operator[](index << _stride_shift); } 124 | 125 | void shallow_copy(const sparse_parameters& input) 126 | { 127 | // TODO: this is level-1 copy (weight* are stilled shared) 128 | if (!_seeded) 129 | { 130 | for (auto iter = _map.begin(); iter != _map.end(); ++iter) 131 | free(iter->second); 132 | } 133 | _map = input._map; 134 | _weight_mask = input._weight_mask; 135 | _stride_shift = input._stride_shift; 136 | free(default_value); 137 | default_value = calloc_mergable_or_throw(stride()); 138 | memcpy(default_value, input.default_value, stride()); 139 | default_data = input.default_data; 140 | _seeded = true; 141 | } 142 | 143 | template void set_default(R& info) 144 | { 145 | R& new_R = calloc_or_throw(); 146 | new_R = info; 147 | default_data = &new_R; 148 | fun = (void(*)(const weight*, void*))T::func; 149 
| fun(default_value, default_data); 150 | } 151 | 152 | template void set_default() { fun = (void(*)(const weight*, void*))T::func; } 153 | 154 | void set_zero(size_t offset) 155 | { 156 | for (weight_map::iterator iter = _map.begin(); iter != _map.end(); ++iter) 157 | (&(*(iter->second)))[offset] = 0; 158 | } 159 | 160 | uint64_t mask() const { return _weight_mask; } 161 | 162 | uint64_t seeded() const { return _seeded; } 163 | 164 | uint32_t stride() const { return 1 << _stride_shift; } 165 | 166 | uint32_t stride_shift() const { return _stride_shift; } 167 | 168 | void stride_shift(uint32_t stride_shift) { 169 | _stride_shift = stride_shift; 170 | free(default_value); 171 | default_value = calloc_mergable_or_throw(stride()); 172 | if (fun != nullptr) 173 | fun(default_value, default_data); 174 | } 175 | 176 | #ifndef _WIN32 177 | void share(size_t length) 178 | {throw 1; //TODO: add better exceptions 179 | } 180 | #endif 181 | 182 | ~sparse_parameters() 183 | {if (!_delete && !_seeded) // don't free weight vector if it is shared with another instance 184 | { 185 | for (auto iter = _map.begin(); iter != _map.end(); ++iter) 186 | free(iter->second); 187 | _map.clear(); 188 | _delete = true; 189 | } 190 | if (default_data != nullptr) 191 | free(default_data); 192 | free(default_value); 193 | } 194 | }; 195 | 196 | class parameters { 197 | public: 198 | bool sparse; 199 | dense_parameters dense_weights; 200 | sparse_parameters sparse_weights; 201 | 202 | inline weight& operator[](size_t i) 203 | { 204 | if (sparse) 205 | return sparse_weights[i]; 206 | else 207 | return dense_weights[i]; 208 | } 209 | 210 | inline uint32_t stride_shift() 211 | { 212 | if (sparse) 213 | return sparse_weights.stride_shift(); 214 | else 215 | return dense_weights.stride_shift(); 216 | } 217 | 218 | inline uint32_t stride() 219 | { 220 | if (sparse) 221 | return sparse_weights.stride(); 222 | else 223 | return dense_weights.stride(); 224 | } 225 | 226 | inline uint64_t mask() 227 | { 
228 | if (sparse) 229 | return sparse_weights.mask(); 230 | else 231 | return dense_weights.mask(); 232 | } 233 | 234 | inline uint64_t seeded() 235 | { 236 | if (sparse) 237 | return sparse_weights.seeded(); 238 | else 239 | return dense_weights.seeded(); 240 | } 241 | 242 | inline void shallow_copy(const parameters& input) 243 | { 244 | if (sparse) 245 | sparse_weights.shallow_copy(input.sparse_weights); 246 | else 247 | dense_weights.shallow_copy(input.dense_weights); 248 | } 249 | 250 | inline void set_zero(size_t offset) 251 | { 252 | if (sparse) 253 | sparse_weights.set_zero(offset); 254 | else 255 | dense_weights.set_zero(offset); 256 | } 257 | #ifndef _WIN32 258 | inline void share(size_t length) 259 | { 260 | if (sparse) 261 | sparse_weights.share(length); 262 | else 263 | dense_weights.share(length); 264 | } 265 | #endif 266 | 267 | inline void stride_shift(uint32_t stride_shift) 268 | { if (sparse) 269 | sparse_weights.stride_shift(stride_shift); 270 | else 271 | dense_weights.stride_shift(stride_shift); 272 | } 273 | 274 | inline weight& strided_index(size_t index) 275 | { 276 | if (sparse) 277 | return sparse_weights.strided_index(index); 278 | else 279 | return dense_weights.strided_index(index); 280 | } 281 | 282 | inline bool not_null() 283 | { 284 | if (sparse) 285 | return sparse_weights.not_null(); 286 | else 287 | return dense_weights.not_null(); 288 | } 289 | }; 290 | -------------------------------------------------------------------------------- /src/extra/array_parameters_dense.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | 3 | #include 4 | #include "memory.h" 5 | 6 | typedef float weight; 7 | 8 | template 9 | class dense_iterator 10 | { 11 | private: 12 | T * _current; 13 | T* _begin; 14 | uint32_t _stride; 15 | 16 | public: 17 | typedef std::forward_iterator_tag iterator_category; 18 | typedef T value_type; 19 | typedef std::ptrdiff_t difference_type; 20 | typedef T* pointer; 21 | 
typedef T& reference; 22 | 23 | dense_iterator(T* current, T* begin, uint32_t stride) 24 | : _current(current), _begin(begin), _stride(stride) 25 | { } 26 | 27 | T& operator*() { return *_current; } 28 | 29 | size_t index() { return _current - _begin; } 30 | 31 | dense_iterator& operator++() 32 | { 33 | _current += _stride; 34 | return *this; 35 | } 36 | 37 | bool operator==(const dense_iterator& rhs) const { return _current == rhs._current; } 38 | bool operator!=(const dense_iterator& rhs) const { return _current != rhs._current; } 39 | }; 40 | 41 | class dense_parameters 42 | { 43 | private: 44 | weight * _begin; 45 | uint64_t _weight_mask; // (stride*(1 << num_bits) -1) 46 | uint32_t _stride_shift; 47 | bool _seeded; // whether the instance is sharing model state with others 48 | 49 | public: 50 | typedef dense_iterator iterator; 51 | typedef dense_iterator const_iterator; 52 | dense_parameters(size_t length, uint32_t stride_shift = 0) 53 | : _begin(calloc_mergable_or_throw(length << stride_shift)), 54 | _weight_mask((length << stride_shift) - 1), 55 | _stride_shift(stride_shift), 56 | _seeded(false) 57 | { } 58 | 59 | dense_parameters() 60 | : _begin(nullptr), _weight_mask(0), _stride_shift(0), _seeded(false) 61 | {} 62 | 63 | bool not_null() { return (_weight_mask > 0 && _begin != nullptr); } 64 | 65 | dense_parameters(const dense_parameters &other) { shallow_copy(other); } 66 | dense_parameters(dense_parameters &&) = delete; 67 | 68 | weight* first() { return _begin; } //TODO: Temporary fix for allreduce. 
69 | 70 | //iterator with stride 71 | iterator begin() { return iterator(_begin, _begin, stride()); } 72 | iterator end() { return iterator(_begin + _weight_mask + 1, _begin, stride()); } 73 | 74 | //const iterator 75 | const_iterator cbegin() { return const_iterator(_begin, _begin, stride()); } 76 | const_iterator cend() { return const_iterator(_begin + _weight_mask + 1, _begin, stride()); } 77 | 78 | inline weight& operator[](size_t i) const { return _begin[i & _weight_mask]; } 79 | void shallow_copy(const dense_parameters& input) 80 | { 81 | if (!_seeded) 82 | free(_begin); 83 | _begin = input._begin; 84 | _weight_mask = input._weight_mask; 85 | _stride_shift = input._stride_shift; 86 | _seeded = true; 87 | } 88 | 89 | inline weight& strided_index(size_t index) { return operator[](index << _stride_shift); } 90 | 91 | template void set_default(R& info) 92 | { 93 | iterator iter = begin(); 94 | for (size_t i = 0; iter != end(); ++iter, i += stride()) 95 | T::func(*iter, info, iter.index()); 96 | } 97 | 98 | template void set_default() 99 | { 100 | iterator iter = begin(); 101 | for (size_t i = 0; iter != end(); ++iter, i += stride()) 102 | T::func(*iter, iter.index()); 103 | } 104 | 105 | void set_zero(size_t offset) 106 | { 107 | for (iterator iter = begin(); iter != end(); ++iter) 108 | (&(*iter))[offset] = 0; 109 | } 110 | 111 | uint64_t mask() const { return _weight_mask; } 112 | 113 | uint64_t seeded() const { return _seeded; } 114 | 115 | uint32_t stride() const { return 1 << _stride_shift; } 116 | 117 | uint32_t stride_shift() const { return _stride_shift; } 118 | 119 | void stride_shift(uint32_t stride_shift) { _stride_shift = stride_shift; } 120 | 121 | #ifndef _WIN32 122 | #ifndef DISABLE_SHARED_WEIGHTS 123 | void share(size_t length) 124 | { 125 | float* shared_weights = (float*)mmap(0, (length << _stride_shift) * sizeof(float), 126 | PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); 127 | size_t float_count = length << _stride_shift; 128 | 
weight* dest = shared_weights; 129 | memcpy(dest, _begin, float_count * sizeof(float)); 130 | free(_begin); 131 | _begin = dest; 132 | } 133 | #endif 134 | #endif 135 | 136 | ~dense_parameters() 137 | { 138 | if (_begin != nullptr && !_seeded) // don't free weight vector if it is shared with another instance 139 | { 140 | free(_begin); 141 | _begin = nullptr; 142 | } 143 | } 144 | }; 145 | -------------------------------------------------------------------------------- /src/extra/error_reporting.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | typedef void(*trace_message_t)(void *context, const std::string&); 3 | 4 | // TODO: change to virtual class 5 | 6 | // invoke trace_listener when << endl is encountered. 7 | class vw_ostream : public std::ostream 8 | { 9 | class vw_streambuf : public std::stringbuf 10 | { 11 | vw_ostream& parent; 12 | public: 13 | vw_streambuf(vw_ostream& str) : parent(str){}; 14 | 15 | virtual int sync(); 16 | }; 17 | vw_streambuf buf; 18 | 19 | public: 20 | vw_ostream(); 21 | 22 | void* trace_context; 23 | trace_message_t trace_listener; 24 | }; 25 | -------------------------------------------------------------------------------- /src/extra/example_predict.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | 8 | typedef unsigned char namespace_index; 9 | 10 | #include "v_array.h" 11 | #include "feature_group.h" 12 | 13 | struct example_predict 14 | { 15 | class iterator 16 | { 17 | features* _feature_space; 18 | namespace_index* _index; 19 | public: 20 | iterator(features* feature_space, namespace_index* index) 21 | : _feature_space(feature_space), _index(index) 22 | { } 23 | 24 | features& operator*() 25 | { 26 | return _feature_space[*_index]; 27 | } 28 | 29 | iterator& operator++() 30 | { 31 | _index++; 32 | return *this; 33 | } 34 | 35 | namespace_index index() { return *_index; } 36 | 37 | bool operator==(const iterator& rhs) { return _index == rhs._index; } 38 | bool operator!=(const iterator& rhs) { return _index != rhs._index; } 39 | }; 40 | 41 | v_array indices; 42 | features feature_space[256]; //Groups of feature values. 43 | uint64_t ft_offset;//An offset for all feature values. 44 | 45 | iterator begin() { return iterator(feature_space, indices.begin()); } 46 | iterator end() { return iterator(feature_space, indices.end()); } 47 | }; 48 | 49 | // make sure we have an exception safe version of example_predict 50 | class safe_example_predict : public example_predict 51 | { 52 | public: 53 | safe_example_predict(); 54 | ~safe_example_predict(); 55 | }; 56 | -------------------------------------------------------------------------------- /src/extra/hash.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | 8 | #include // defines size_t 9 | 10 | // Platform-specific functions and macros 11 | #if defined(_MSC_VER) // Microsoft Visual Studio 12 | # include 13 | 14 | # include 15 | # define ROTL32(x,y) _rotl(x,y) 16 | # define BIG_CONSTANT(x) (x) 17 | 18 | #else // Other compilers 19 | # include // defines uint32_t etc 20 | 21 | inline uint32_t rotl32(uint32_t x, int8_t r) 22 | { return (x << r) | (x >> (32 - r)); 23 | } 24 | 25 | # define ROTL32(x,y) rotl32(x,y) 26 | # define BIG_CONSTANT(x) (x##LLU) 27 | 28 | #endif // !defined(_MSC_VER) 29 | 30 | namespace MURMUR_HASH_3 31 | { 32 | 33 | //----------------------------------------------------------------------------- 34 | // Finalization mix - force all bits of a hash block to avalanche 35 | 36 | static inline uint32_t fmix(uint32_t h) 37 | { h ^= h >> 16; 38 | h *= 0x85ebca6b; 39 | h ^= h >> 13; 40 | h *= 0xc2b2ae35; 41 | h ^= h >> 16; 42 | 43 | return h; 44 | } 45 | } 46 | 47 | const uint32_t hash_base = 0; 48 | 49 | uint64_t uniform_hash(const void *key, size_t length, uint64_t seed); 50 | -------------------------------------------------------------------------------- /src/extra/no_label.h: -------------------------------------------------------------------------------- 1 | /* 2 | Copyright (c) by respective owners including Yahoo!, Microsoft, and 3 | individual contributors. All rights reserved. Released under a BSD 4 | license as described in the file LICENSE. 
5 | */ 6 | #pragma once 7 | #include "label_parser.h" 8 | 9 | struct example; 10 | struct vw; 11 | 12 | namespace no_label { 13 | typedef char no_label; 14 | 15 | void return_no_label_example(vw& all, void*, example& ec); 16 | 17 | extern label_parser no_label_parser; 18 | 19 | void print_no_label_update(vw& all, example &ec); 20 | void output_and_account_no_label_example(vw& all, example& ec); 21 | } 22 | -------------------------------------------------------------------------------- /src/extra/parser_helper.h: -------------------------------------------------------------------------------- 1 | #pragma once 2 | #include 3 | #include 4 | #include "vw_exception.h" 5 | #include "error_reporting.h" 6 | #include 7 | #include 8 | namespace po = boost::program_options; 9 | 10 | struct vw; 11 | 12 | class arguments { 13 | po::options_description new_od;//a set of options 14 | po::variables_map add_options_skip_duplicates(po::options_description& opts, bool do_notify); 15 | bool missing_critical; 16 | 17 | std::string long_only(const char* in) 18 | {//strip off a trailing short option 19 | char* maybe = strchr(const_cast(in), ','); 20 | if (maybe==nullptr) 21 | return std::string(in); 22 | else 23 | return std::string(in,maybe-in); 24 | } 25 | 26 | public: 27 | po::options_description all_opts; //All specified options. 28 | po::options_description opts; //Critical options and their dependencies. 29 | vw_ostream trace_message;//error reporting 30 | std::stringstream* file_options; // the set of options to store in the model file. 31 | po::variables_map vm; //A stored map from option to value. 32 | std::vector args;//All arguments 33 | vw* all;//backdoor that should go away over time. 34 | 35 | //initialization 36 | arguments(vw& all_in, std::string name_in=""):new_od(name_in), missing_critical(false), all(&all_in) {file_options = new std::stringstream;}; 37 | arguments():missing_critical(false){};//this should not be used but appears sometimes unavoidable. 
Do an in-place allocation with the upper initializer after it is used. 38 | ~arguments(){ delete file_options;}; 39 | 40 | //reinitialization 41 | arguments& new_options(std::string name_in="") 42 | { 43 | (&new_od)->~options_description();//in place delete 44 | new (&new_od) po::options_description(name_in); 45 | missing_critical=false; 46 | return *this; 47 | } 48 | 49 | //insert arguments 50 | arguments& operator()(const char* option, const char* description) 51 | { 52 | new_od.add_options()(option, description); 53 | return *this; 54 | } 55 | arguments& operator()(bool& exists, const char* option, const char* description) 56 | { return operator()(option, po::bool_switch(&exists), description); } 57 | template arguments& operator()(const char* option, T& location, const char* description) 58 | { return operator()(option, po::value(&location), description); } 59 | template arguments& operator()(const char* option, T& location, T def, const char* description) 60 | { return operator()(option, po::value(&location)->default_value(def), description); } 61 | arguments& operator()(const char* option, const po::value_semantic* type, const char* description) 62 | { 63 | new_od.add_options()(option, type, description); 64 | return *this; 65 | } 66 | //A keep option is kept in the model. 
67 | template arguments& keep(const char* option, T& store, const char* description) 68 | { return keep(option, po::value(&store), description); } 69 | template arguments& keep(const char* option, T& store, T def, const char* description) 70 | { 71 | return operator()(option, 72 | po::value(&store)->default_value(def) 73 | ->notifier([this, option, def] (T arg) 74 | { 75 | *this->file_options << " --" << long_only(option) << " " << arg; 76 | }), 77 | description); 78 | } 79 | template arguments& keep(const char* option, po::typed_value* type, const char* description) 80 | { 81 | return operator()(option, 82 | type->notifier([this, option] (T arg) 83 | { *this->file_options << " --" << long_only(option) << " " << arg; }), 84 | description); 85 | } 86 | template arguments& keep_vector(const char* option, po::typed_value>* type, const char* description) 87 | { 88 | return operator()(option, 89 | type->multitoken()->composing() 90 | ->notifier([this, option] (std::vector arg) 91 | { 92 | for (auto i : arg) 93 | *this->file_options << " --" << long_only(option) << " " << i; 94 | }), 95 | description); 96 | } 97 | arguments& keep(bool& exists, const char* option, const char* description) 98 | { 99 | return operator()(option, 100 | po::bool_switch(&exists) 101 | ->notifier([this, option] (bool v) 102 | { if (v) *this->file_options << " --" << long_only(option); }), 103 | description); 104 | } 105 | arguments& keep(const char* option, const char* description) 106 | { 107 | bool temp=false; 108 | return keep(temp, option, description); 109 | } 110 | 111 | //A missing critical argument raises the missing flag. Critical implies keep. 
112 | template arguments& critical(const char* option, T& store, const char* description) 113 | { return critical(option, po::value(&store), description); } 114 | template arguments& critical(const char* option, po::typed_value* type, const char* description) 115 | { 116 | keep(option, type, description); 117 | missing(); 118 | new_options(); 119 | missing_critical = !vm.count(option); 120 | return *this; 121 | } 122 | template arguments& critical_vector(const char* option, po::typed_value>* type, const char* description, bool keep = true) 123 | { 124 | if (keep) 125 | keep_vector(option, type, description); 126 | else 127 | operator()(option, type->multitoken()->composing(), description); 128 | missing(); 129 | new_options(); 130 | missing_critical = !vm.count(option); 131 | return *this; 132 | } 133 | template arguments& critical(const char* option, const char* description) 134 | { return critical(option, po::value(), description); } 135 | arguments& critical(const char* option, const char* description) 136 | { 137 | keep(option, description); 138 | missing(); 139 | new_options(); 140 | missing_critical = !vm[option].as(); 141 | return *this; 142 | } 143 | 144 | bool missing() //Return true if key options are missing. 
145 | { 146 | all_opts.add(new_od); 147 | if (!missing_critical) 148 | { 149 | opts.add(new_od); //compile options 150 | auto new_vm = add_options_skip_duplicates(new_od, true);//do notify 151 | for (auto& it : new_vm) 152 | vm.insert(it); 153 | } 154 | return missing_critical; 155 | } 156 | }; 157 | -------------------------------------------------------------------------------- /src/helpers.cpp: -------------------------------------------------------------------------------- 1 | #include "vw.h" 2 | 3 | #include 4 | #include "helpers.h" 5 | 6 | #include 7 | #include 8 | #include 9 | #include 10 | #include 11 | #include 12 | #include 13 | 14 | 15 | #include 16 | 17 | #include 18 | #include 19 | 20 | extern "C" { 21 | #include "md5.h" 22 | } 23 | 24 | #ifdef _WIN32 25 | #define PATH_SEPARATOR '\\' 26 | #else 27 | #define PATH_SEPARATOR '/' 28 | #endif 29 | 30 | 31 | // Based on code from R digest package http://dirk.eddelbuettel.com/code/digest.html 32 | // Copyright (C) 2003 - 2016 Dirk Eddelbuettel 33 | std::string md5sum(char * char_x, uint32_t nChar) { 34 | char output[33+1]; 35 | md5_context ctx; 36 | unsigned char md5sum[16]; 37 | int j; 38 | md5_starts( &ctx ); 39 | md5_update( &ctx, (uint8 *) char_x, nChar); 40 | md5_finish( &ctx, md5sum ); 41 | memcpy(output, md5sum, 16); 42 | 43 | for (j = 0; j < 16; j++) 44 | sprintf(output + j * 2, "%02x", md5sum[j]); 45 | 46 | std::string output_str(output); 47 | 48 | return(output_str); 49 | } 50 | 51 | Rcpp::String check_data(Rcpp::List & vwmodel, std::string & valid_data_str, SEXP data, bool quiet, std::string mode, 52 | Rcpp::Nullable namespaces, Rcpp::Nullable keep_space, 53 | Rcpp::Nullable fixed, 54 | Rcpp::Nullable targets, Rcpp::Nullable probabilities, 55 | Rcpp::Nullable weight, Rcpp::Nullable base, 56 | Rcpp::Nullable tag, Rcpp::Nullable multiline) { 57 | 58 | // Check if: 59 | // no previous parser options are in the model 60 | // OR 61 | // any of passed parser options is NOT NULL (possible when using 
"predict.vw" function) 62 | if(Rf_isNull(vwmodel["parser_opts"]) || (!Rf_isNull(namespaces) || !Rf_isNull(keep_space) || !Rf_isNull(fixed) || !Rf_isNull(targets) || 63 | !Rf_isNull(probabilities) || !Rf_isNull(weight) || !Rf_isNull(base) || 64 | !Rf_isNull(tag) || !Rf_isNull(multiline))) { 65 | 66 | // In this case we want to use parser options that were passed with "vwtest" so we save parser options 67 | vwmodel["parser_opts"] = Rcpp::List::create(Rcpp::Named("namespaces") = namespaces , Rcpp::Named("keep_space") = keep_space, 68 | Rcpp::Named("fixed") = fixed, 69 | Rcpp::Named("targets") = targets, Rcpp::Named("probabilities") = probabilities, 70 | Rcpp::Named("weight") = weight, Rcpp::Named("base") = base, 71 | Rcpp::Named("tag") = tag, Rcpp::Named("multiline") = multiline); 72 | 73 | } else { 74 | // In this case we use previously saved parser options 75 | if(!quiet){ 76 | Rcpp::Rcout << "Using parser options from the previous session" << std::endl; 77 | } 78 | } 79 | 80 | Rcpp::String data_md5sum(""); 81 | uint32_t nChar; 82 | char * char_x; 83 | if(TYPEOF(data) == STRSXP) { 84 | // Use path to file as model input 85 | valid_data_str = Rcpp::as(data); 86 | 87 | // Check path for whitespace 88 | if(valid_data_str.find_first_of("\t\n ") != valid_data_str.npos) { 89 | Rcpp::stop("Whitespace characters are not allowed in `data` path"); 90 | } 91 | 92 | std::ifstream data_instream(valid_data_str); 93 | std::string data_contents((std::istreambuf_iterator(data_instream)), 94 | std::istreambuf_iterator()); 95 | 96 | char_x = &data_contents[0u]; 97 | nChar = data_contents.length(); 98 | data_md5sum = md5sum(char_x, nChar); 99 | } else if(TYPEOF(data) == VECSXP) { 100 | // Parse data frame and use VW file as model input 101 | 102 | // Update valid data string 103 | valid_data_str = Rcpp::as(vwmodel["dir"]) + PATH_SEPARATOR + mode + ".vw"; 104 | // Compute md5sum of data.frame 105 | Rcpp::RawVector x = serializeToRaw(data); 106 | char_x = (char*) RAW(x); 107 | nChar = 
XLENGTH(x); 108 | 109 | data_md5sum = md5sum(char_x, nChar); 110 | 111 | // Compare new md5sum with old md5sum 112 | Rcpp::List vwmodel_md5sums = vwmodel["data_md5sum"]; 113 | Rcpp::String model_md5sum = vwmodel_md5sums[mode]; 114 | 115 | 116 | if (model_md5sum != data_md5sum) { 117 | if(!quiet){ 118 | Rcpp::Rcout << "Converting data.frame to VW format" << std::endl; 119 | } 120 | Rcpp::Environment env("package:rvw"); 121 | Rcpp::Function r_df2vw = env["df2vw"]; 122 | // Convert data.frame to VW 123 | 124 | Rcpp::List saved_parser_opts = vwmodel["parser_opts"]; 125 | 126 | // Rcpp::Nullable valid_namespaces = saved_parser_opts["namespaces"]; 127 | // Rcpp::Nullable valid_keep_space = saved_parser_opts["keep_space"]; 128 | // Rcpp::Nullable valid_targets = saved_parser_opts["targets"]; 129 | // Rcpp::Nullable valid_probabilities = saved_parser_opts["probabilities"]; 130 | // Rcpp::Nullable valid_weight = saved_parser_opts["weight"]; 131 | // Rcpp::Nullable valid_base = saved_parser_opts["base"]; 132 | // Rcpp::Nullable valid_tag = saved_parser_opts["tag"]; 133 | // Rcpp::Nullable valid_multiline = saved_parser_opts["multiline"]; 134 | 135 | r_df2vw(data, valid_data_str, 136 | saved_parser_opts["namespaces"], saved_parser_opts["keep_space"], saved_parser_opts["fixed"], 137 | saved_parser_opts["targets"], saved_parser_opts["probabilities"], 138 | saved_parser_opts["weight"], saved_parser_opts["base"], saved_parser_opts["tag"], saved_parser_opts["multiline"], 139 | false 140 | ); 141 | } 142 | 143 | } else { 144 | Rcpp::stop("Only String and data.frame types are supported"); 145 | } 146 | return data_md5sum; 147 | } 148 | 149 | // Get number of examples used in model 150 | int get_num_example(vw& all) { 151 | return all.sd->example_number + all.sd->weighted_holdout_examples; 152 | } 153 | 154 | bool file_exists(std::string file_name) 155 | { 156 | std::ifstream infile (file_name.c_str()); 157 | return infile.good(); 158 | } 159 | 160 | // setup function from VW main.cc 
file 161 | // modified to work in library mode using Rcpp 162 | vw* setup_model(std::string args_str) { 163 | 164 | int argc; 165 | char** argv = VW::get_argv_from_string(args_str, argc); 166 | 167 | vw* all = nullptr; 168 | try { all = VW::initialize(argc, argv); 169 | } 170 | catch(const VW::vw_exception& ex){ 171 | Rcpp::Rcout << ex.what() << std::endl; 172 | throw; 173 | } 174 | catch(...) 175 | { 176 | Rcpp::Rcout << "unknown exception" << std::endl; 177 | throw; 178 | } 179 | 180 | if (!all->quiet && !all->bfgs && !all->searchstr && !all->opts_n_args.vm.count("audit_regressor")) 181 | { 182 | Rcpp::Rcout << std::left 183 | << std::setw(shared_data::col_avg_loss) << std::left << "average" 184 | << " " 185 | << std::setw(shared_data::col_since_last) << std::left << "since" 186 | << " " 187 | << std::right 188 | << std::setw(shared_data::col_example_counter) << "example" 189 | << " " 190 | << std::setw(shared_data::col_example_weight) << "example" 191 | << " " 192 | << std::setw(shared_data::col_current_label) << "current" 193 | << " " 194 | << std::setw(shared_data::col_current_predict) << "current" 195 | << " " 196 | << std::setw(shared_data::col_current_features) << "current" 197 | << std::endl; 198 | Rcpp::Rcout << std::left 199 | << std::setw(shared_data::col_avg_loss) << std::left << "loss" 200 | << " " 201 | << std::setw(shared_data::col_since_last) << std::left << "last" 202 | << " " 203 | << std::right 204 | << std::setw(shared_data::col_example_counter) << "counter" 205 | << " " 206 | << std::setw(shared_data::col_example_weight) << "weight" 207 | << " " 208 | << std::setw(shared_data::col_current_label) << "label" 209 | << " " 210 | << std::setw(shared_data::col_current_predict) << "predict" 211 | << " " 212 | << std::setw(shared_data::col_current_features) << "features" 213 | << std::endl; 214 | } 215 | 216 | return all; 217 | } 218 | 219 | // Collect final performance evaluation results 220 | Rcpp::List get_eval(vw& all) 221 | { 222 | int 
num_examples = all.sd->example_number; 223 | double weighted_example_sum = all.sd->weighted_examples(); 224 | double weighted_label_sum = all.sd->weighted_labels; 225 | double avg_loss = NA_REAL; 226 | double avg_multiclass_log_loss = NA_REAL; 227 | float best_const = NA_REAL; 228 | float best_const_loss = NA_REAL; 229 | int total_feature = all.sd->total_features; 230 | 231 | if(all.holdout_set_off) { 232 | if (all.sd->weighted_labeled_examples > 0) { 233 | avg_loss = all.sd->sum_loss / all.sd->weighted_labeled_examples; 234 | } else { 235 | avg_loss = NA_REAL; 236 | } 237 | } else if((all.sd->holdout_best_loss == FLT_MAX) || (all.sd->holdout_best_loss == FLT_MAX * 0.5)) { 238 | avg_loss = NA_REAL; 239 | } else { 240 | avg_loss = all.sd->holdout_best_loss; 241 | } 242 | if (all.sd->report_multiclass_log_loss) 243 | { 244 | if (all.holdout_set_off) { 245 | avg_multiclass_log_loss = all.sd->multiclass_log_loss / all.sd->weighted_labeled_examples; 246 | } else { 247 | avg_multiclass_log_loss = all.sd->holdout_multiclass_log_loss / all.sd->weighted_labeled_examples; 248 | } 249 | } 250 | // Get best_const and best_const_loss 251 | copy_get_best_constant(all, best_const, best_const_loss); 252 | 253 | Rcpp::List eval_list = Rcpp::List::create( 254 | Rcpp::Named("num_examples") = num_examples, 255 | Rcpp::Named("weighted_example_sum") = weighted_example_sum, 256 | Rcpp::Named("weighted_label_sum") = weighted_label_sum, 257 | Rcpp::Named("avg_loss") = avg_loss, 258 | Rcpp::Named("avg_multiclass_log_loss") = avg_multiclass_log_loss, 259 | Rcpp::Named("best_const") = best_const, 260 | Rcpp::Named("best_const_loss") = best_const_loss, 261 | Rcpp::Named("total_feature") = total_feature 262 | ); 263 | 264 | return(eval_list); 265 | } 266 | 267 | // Copy of get_best_constant function from best_constant.cc file 268 | bool copy_get_best_constant(vw& all, float& best_constant, float& best_constant_loss) 269 | { 270 | if (all.sd->first_observed_label == FLT_MAX || // no non-test 
labels observed or function was never called 271 | (all.loss == nullptr) || (all.sd == nullptr)) return false; 272 | 273 | float label1 = all.sd->first_observed_label; // observed labels might be inside [sd->Min_label, sd->Max_label], so can't use Min/Max 274 | float label2 = (all.sd->second_observed_label == FLT_MAX)?0: all.sd->second_observed_label; // if only one label observed, second might be 0 275 | if (label1 > label2) {float tmp = label1; label1 = label2; label2 = tmp;} // as don't use min/max - make sure label1 < label2 276 | 277 | float label1_cnt; 278 | float label2_cnt; 279 | 280 | if (label1 != label2) 281 | { 282 | label1_cnt = (float) (all.sd->weighted_labels - label2*all.sd->weighted_labeled_examples)/(label1 - label2); 283 | label2_cnt = (float)all.sd->weighted_labeled_examples - label1_cnt; 284 | } 285 | else 286 | return false; 287 | 288 | if ( (label1_cnt + label2_cnt) <= 0.) return false; 289 | 290 | 291 | po::variables_map& vm = all.opts_n_args.vm; 292 | 293 | std::string funcName; 294 | if(vm.count("loss_function")) 295 | funcName = vm["loss_function"].as(); 296 | else 297 | funcName = "squared"; 298 | 299 | if(funcName.compare("squared") == 0 || funcName.compare("Huber") == 0 || funcName.compare("classic") == 0) 300 | best_constant = (float) all.sd->weighted_labels / (float) (all.sd->weighted_labeled_examples); 301 | else if (all.sd->is_more_than_two_labels_observed) 302 | { 303 | //loss functions below don't have generic formuas for constant yet. 304 | return false; 305 | 306 | } 307 | else if(funcName.compare("hinge") == 0) 308 | { 309 | 310 | best_constant = label2_cnt <= label1_cnt ? 
-1.f: 1.f; 311 | 312 | } 313 | else if(funcName.compare("logistic") == 0) 314 | { 315 | 316 | label1 = -1.; //override {-50, 50} to get proper loss 317 | label2 = 1.; 318 | 319 | if (label1_cnt <= 0) best_constant = 1.; 320 | else if (label2_cnt <= 0) best_constant = -1.; 321 | else 322 | best_constant = std::log(label2_cnt/label1_cnt); 323 | 324 | } 325 | else if(funcName.compare("quantile") == 0 || funcName.compare("pinball") == 0 || funcName.compare("absolute") == 0) 326 | { 327 | 328 | float tau = 0.5; 329 | if(vm.count("quantile_tau")) 330 | tau = vm["quantile_tau"].as(); 331 | 332 | float q = tau*(label1_cnt + label2_cnt); 333 | if (q < label2_cnt) best_constant = label2; 334 | else best_constant = label1; 335 | } 336 | else 337 | return false; 338 | 339 | if (!all.sd->is_more_than_two_labels_observed) 340 | { 341 | best_constant_loss = (label1_cnt>0)?all.loss->getLoss(all.sd, best_constant, label1) * label1_cnt:0.0f; 342 | best_constant_loss += (label2_cnt>0)?all.loss->getLoss(all.sd, best_constant, label2) * label2_cnt:0.0f; 343 | best_constant_loss /= label1_cnt + label2_cnt; 344 | } 345 | else best_constant_loss = FLT_MIN; 346 | 347 | return true; 348 | } 349 | 350 | std::vector split_str(const std::string &s, char del) { 351 | std::stringstream s_stream(s); 352 | std::string item; 353 | std::vector elems; 354 | while ( getline(s_stream, item, del) ) { 355 | elems.push_back(item); 356 | } 357 | return elems; 358 | } 359 | -------------------------------------------------------------------------------- /src/helpers.h: -------------------------------------------------------------------------------- 1 | #include "vw.h" 2 | 3 | #include 4 | 5 | 6 | 7 | 8 | // Helper functions 9 | 10 | // Check if data from vwmodel should be used or from function arguments 11 | Rcpp::String check_data(Rcpp::List & vwmodel, std::string & valid_data_str, SEXP data, bool quiet, std::string mode="train", 12 | Rcpp::Nullable namespaces=R_NilValue, Rcpp::Nullable 
keep_space=R_NilValue, 13 | Rcpp::Nullable fixed=R_NilValue, 14 | Rcpp::Nullable targets=R_NilValue, Rcpp::Nullable probabilities=R_NilValue, 15 | Rcpp::Nullable weight=R_NilValue, Rcpp::Nullable base=R_NilValue, 16 | Rcpp::Nullable tag=R_NilValue, Rcpp::Nullable multiline=R_NilValue); 17 | 18 | // Get number of examples used in model 19 | int get_num_example(vw& all); 20 | 21 | // Custom driver to test example creation using libvw 22 | void custom_driver(vw& model, std::string & file_path); 23 | 24 | bool file_exists(std::string file_name); 25 | 26 | // setup function from VW main.cc file 27 | // modified to work in library mode using Rcpp 28 | vw* setup_model(std::string args_str); 29 | 30 | // Collect final performance evaluation results 31 | Rcpp::List get_eval(vw& all); 32 | 33 | // Copy of get_best_constant function from best_constant.cc file 34 | bool copy_get_best_constant(vw& all, float& best_constant, float& best_constant_loss); 35 | 36 | std::vector split_str(const std::string &s, char del); 37 | -------------------------------------------------------------------------------- /src/md5.c: -------------------------------------------------------------------------------- 1 | /* 2 | * RFC 1321 compliant MD5 implementation, 3 | * by Christophe Devine ; 4 | * this program is licensed under the GPL. 
5 | */ 6 | 7 | #include 8 | 9 | #include "md5.h" 10 | 11 | #define GET_UINT32(n,b,i) \ 12 | { \ 13 | (n) = ( (uint32) (b)[(i) ] ) \ 14 | | ( (uint32) (b)[(i) + 1] << 8 ) \ 15 | | ( (uint32) (b)[(i) + 2] << 16 ) \ 16 | | ( (uint32) (b)[(i) + 3] << 24 ); \ 17 | } 18 | 19 | #define PUT_UINT32(n,b,i) \ 20 | { \ 21 | (b)[(i) ] = (uint8) ( (n) ); \ 22 | (b)[(i) + 1] = (uint8) ( (n) >> 8 ); \ 23 | (b)[(i) + 2] = (uint8) ( (n) >> 16 ); \ 24 | (b)[(i) + 3] = (uint8) ( (n) >> 24 ); \ 25 | } 26 | 27 | void md5_starts( md5_context *ctx ) 28 | { 29 | ctx->total[0] = 0; 30 | ctx->total[1] = 0; 31 | 32 | ctx->state[0] = 0x67452301; 33 | ctx->state[1] = 0xEFCDAB89; 34 | ctx->state[2] = 0x98BADCFE; 35 | ctx->state[3] = 0x10325476; 36 | } 37 | 38 | void md5_process( md5_context *ctx, uint8 data[64] ) 39 | { 40 | uint32 X[16], A, B, C, D; 41 | 42 | GET_UINT32( X[0], data, 0 ); 43 | GET_UINT32( X[1], data, 4 ); 44 | GET_UINT32( X[2], data, 8 ); 45 | GET_UINT32( X[3], data, 12 ); 46 | GET_UINT32( X[4], data, 16 ); 47 | GET_UINT32( X[5], data, 20 ); 48 | GET_UINT32( X[6], data, 24 ); 49 | GET_UINT32( X[7], data, 28 ); 50 | GET_UINT32( X[8], data, 32 ); 51 | GET_UINT32( X[9], data, 36 ); 52 | GET_UINT32( X[10], data, 40 ); 53 | GET_UINT32( X[11], data, 44 ); 54 | GET_UINT32( X[12], data, 48 ); 55 | GET_UINT32( X[13], data, 52 ); 56 | GET_UINT32( X[14], data, 56 ); 57 | GET_UINT32( X[15], data, 60 ); 58 | 59 | #define S(x,n) ((x << n) | ((x & 0xFFFFFFFF) >> (32 - n))) 60 | 61 | #define P(a,b,c,d,k,s,t) \ 62 | { \ 63 | a += F(b,c,d) + X[k] + t; a = S(a,s) + b; \ 64 | } 65 | 66 | A = ctx->state[0]; 67 | B = ctx->state[1]; 68 | C = ctx->state[2]; 69 | D = ctx->state[3]; 70 | 71 | #define F(x,y,z) (z ^ (x & (y ^ z))) 72 | 73 | P( A, B, C, D, 0, 7, 0xD76AA478 ); 74 | P( D, A, B, C, 1, 12, 0xE8C7B756 ); 75 | P( C, D, A, B, 2, 17, 0x242070DB ); 76 | P( B, C, D, A, 3, 22, 0xC1BDCEEE ); 77 | P( A, B, C, D, 4, 7, 0xF57C0FAF ); 78 | P( D, A, B, C, 5, 12, 0x4787C62A ); 79 | P( C, D, A, B, 6, 17, 
0xA8304613 ); 80 | P( B, C, D, A, 7, 22, 0xFD469501 ); 81 | P( A, B, C, D, 8, 7, 0x698098D8 ); 82 | P( D, A, B, C, 9, 12, 0x8B44F7AF ); 83 | P( C, D, A, B, 10, 17, 0xFFFF5BB1 ); 84 | P( B, C, D, A, 11, 22, 0x895CD7BE ); 85 | P( A, B, C, D, 12, 7, 0x6B901122 ); 86 | P( D, A, B, C, 13, 12, 0xFD987193 ); 87 | P( C, D, A, B, 14, 17, 0xA679438E ); 88 | P( B, C, D, A, 15, 22, 0x49B40821 ); 89 | 90 | #undef F 91 | 92 | #define F(x,y,z) (y ^ (z & (x ^ y))) 93 | 94 | P( A, B, C, D, 1, 5, 0xF61E2562 ); 95 | P( D, A, B, C, 6, 9, 0xC040B340 ); 96 | P( C, D, A, B, 11, 14, 0x265E5A51 ); 97 | P( B, C, D, A, 0, 20, 0xE9B6C7AA ); 98 | P( A, B, C, D, 5, 5, 0xD62F105D ); 99 | P( D, A, B, C, 10, 9, 0x02441453 ); 100 | P( C, D, A, B, 15, 14, 0xD8A1E681 ); 101 | P( B, C, D, A, 4, 20, 0xE7D3FBC8 ); 102 | P( A, B, C, D, 9, 5, 0x21E1CDE6 ); 103 | P( D, A, B, C, 14, 9, 0xC33707D6 ); 104 | P( C, D, A, B, 3, 14, 0xF4D50D87 ); 105 | P( B, C, D, A, 8, 20, 0x455A14ED ); 106 | P( A, B, C, D, 13, 5, 0xA9E3E905 ); 107 | P( D, A, B, C, 2, 9, 0xFCEFA3F8 ); 108 | P( C, D, A, B, 7, 14, 0x676F02D9 ); 109 | P( B, C, D, A, 12, 20, 0x8D2A4C8A ); 110 | 111 | #undef F 112 | 113 | #define F(x,y,z) (x ^ y ^ z) 114 | 115 | P( A, B, C, D, 5, 4, 0xFFFA3942 ); 116 | P( D, A, B, C, 8, 11, 0x8771F681 ); 117 | P( C, D, A, B, 11, 16, 0x6D9D6122 ); 118 | P( B, C, D, A, 14, 23, 0xFDE5380C ); 119 | P( A, B, C, D, 1, 4, 0xA4BEEA44 ); 120 | P( D, A, B, C, 4, 11, 0x4BDECFA9 ); 121 | P( C, D, A, B, 7, 16, 0xF6BB4B60 ); 122 | P( B, C, D, A, 10, 23, 0xBEBFBC70 ); 123 | P( A, B, C, D, 13, 4, 0x289B7EC6 ); 124 | P( D, A, B, C, 0, 11, 0xEAA127FA ); 125 | P( C, D, A, B, 3, 16, 0xD4EF3085 ); 126 | P( B, C, D, A, 6, 23, 0x04881D05 ); 127 | P( A, B, C, D, 9, 4, 0xD9D4D039 ); 128 | P( D, A, B, C, 12, 11, 0xE6DB99E5 ); 129 | P( C, D, A, B, 15, 16, 0x1FA27CF8 ); 130 | P( B, C, D, A, 2, 23, 0xC4AC5665 ); 131 | 132 | #undef F 133 | 134 | #define F(x,y,z) (y ^ (x | ~z)) 135 | 136 | P( A, B, C, D, 0, 6, 0xF4292244 ); 137 | P( D, A, B, C, 7, 
10, 0x432AFF97 ); 138 | P( C, D, A, B, 14, 15, 0xAB9423A7 ); 139 | P( B, C, D, A, 5, 21, 0xFC93A039 ); 140 | P( A, B, C, D, 12, 6, 0x655B59C3 ); 141 | P( D, A, B, C, 3, 10, 0x8F0CCC92 ); 142 | P( C, D, A, B, 10, 15, 0xFFEFF47D ); 143 | P( B, C, D, A, 1, 21, 0x85845DD1 ); 144 | P( A, B, C, D, 8, 6, 0x6FA87E4F ); 145 | P( D, A, B, C, 15, 10, 0xFE2CE6E0 ); 146 | P( C, D, A, B, 6, 15, 0xA3014314 ); 147 | P( B, C, D, A, 13, 21, 0x4E0811A1 ); 148 | P( A, B, C, D, 4, 6, 0xF7537E82 ); 149 | P( D, A, B, C, 11, 10, 0xBD3AF235 ); 150 | P( C, D, A, B, 2, 15, 0x2AD7D2BB ); 151 | P( B, C, D, A, 9, 21, 0xEB86D391 ); 152 | 153 | #undef F 154 | 155 | ctx->state[0] += A; 156 | ctx->state[1] += B; 157 | ctx->state[2] += C; 158 | ctx->state[3] += D; 159 | } 160 | 161 | void md5_update( md5_context *ctx, uint8 *input, uint32 length ) 162 | { 163 | uint32 left, fill; 164 | 165 | if( ! length ) return; 166 | 167 | left = ctx->total[0] & 0x3F; 168 | fill = 64 - left; 169 | 170 | ctx->total[0] += length; 171 | ctx->total[0] &= 0xFFFFFFFF; 172 | 173 | if( ctx->total[0] < length ) 174 | ctx->total[1]++; /* #nocov */ 175 | 176 | if( left && length >= fill ) 177 | { 178 | memcpy( (void *) (ctx->buffer + left), 179 | (void *) input, fill ); 180 | md5_process( ctx, ctx->buffer ); 181 | length -= fill; 182 | input += fill; 183 | left = 0; 184 | } 185 | 186 | while( length >= 64 ) 187 | { 188 | md5_process( ctx, input ); 189 | length -= 64; 190 | input += 64; 191 | } 192 | 193 | if( length ) 194 | { 195 | memcpy( (void *) (ctx->buffer + left), 196 | (void *) input, length ); 197 | } 198 | } 199 | 200 | static uint8 md5_padding[64] = 201 | { 202 | 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 203 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 204 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 205 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 206 | }; 207 | 208 | void md5_finish( md5_context *ctx, uint8 digest[16] ) 209 | { 210 | uint32 last, padn; 211 | uint32 high, low; 212 | uint8 
msglen[8]; 213 | 214 | high = ( ctx->total[0] >> 29 ) 215 | | ( ctx->total[1] << 3 ); 216 | low = ( ctx->total[0] << 3 ); 217 | 218 | PUT_UINT32( low, msglen, 0 ); 219 | PUT_UINT32( high, msglen, 4 ); 220 | 221 | last = ctx->total[0] & 0x3F; 222 | padn = ( last < 56 ) ? ( 56 - last ) : ( 120 - last ); 223 | 224 | md5_update( ctx, md5_padding, padn ); 225 | md5_update( ctx, msglen, 8 ); 226 | 227 | PUT_UINT32( ctx->state[0], digest, 0 ); 228 | PUT_UINT32( ctx->state[1], digest, 4 ); 229 | PUT_UINT32( ctx->state[2], digest, 8 ); 230 | PUT_UINT32( ctx->state[3], digest, 12 ); 231 | } 232 | 233 | #ifdef TEST 234 | 235 | #include 236 | #include 237 | 238 | /* 239 | * those are the standard RFC 1321 test vectors 240 | */ 241 | 242 | static char *msg[] = 243 | { 244 | "", 245 | "a", 246 | "abc", 247 | "message digest", 248 | "abcdefghijklmnopqrstuvwxyz", 249 | "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", 250 | "12345678901234567890123456789012345678901234567890123456789012" \ 251 | "345678901234567890" 252 | }; 253 | 254 | static char *val[] = 255 | { 256 | "d41d8cd98f00b204e9800998ecf8427e", 257 | "0cc175b9c0f1b6a831c399e269772661", 258 | "900150983cd24fb0d6963f7d28e17f72", 259 | "f96b697d7cb7938d525a2f31aaf161d0", 260 | "c3fcd3d76192e4007dfb496cca67e13b", 261 | "d174ab98d277d9f5a5611c2c9f419d9f", 262 | "57edf4a22be3c955ac49da2e2107b67a" 263 | }; 264 | 265 | int main( int argc, char *argv[] ) 266 | { 267 | FILE *f; 268 | int i, j; 269 | char output[33]; 270 | md5_context ctx; 271 | unsigned char buf[1000]; 272 | unsigned char md5sum[16]; 273 | 274 | if( argc < 2 ) 275 | { 276 | printf( "\n MD5 Validation Tests:\n\n" ); 277 | 278 | for( i = 0; i < 7; i++ ) 279 | { 280 | printf( " Test %d ", i + 1 ); 281 | 282 | md5_starts( &ctx ); 283 | md5_update( &ctx, (uint8 *) msg[i], strlen( msg[i] ) ); 284 | md5_finish( &ctx, md5sum ); 285 | 286 | for( j = 0; j < 16; j++ ) 287 | { 288 | sprintf( output + j * 2, "%02x", md5sum[j] ); 289 | } 290 | 291 | if( 
memcmp( output, val[i], 32 ) ) 292 | { 293 | printf( "failed!\n" ); 294 | return( 1 ); 295 | } 296 | 297 | printf( "passed.\n" ); 298 | } 299 | 300 | printf( "\n" ); 301 | } 302 | else 303 | { 304 | if( ! ( f = fopen( argv[1], "rb" ) ) ) 305 | { 306 | perror( "fopen" ); 307 | return( 1 ); 308 | } 309 | 310 | md5_starts( &ctx ); 311 | 312 | while( ( i = fread( buf, 1, sizeof( buf ), f ) ) > 0 ) 313 | { 314 | md5_update( &ctx, buf, i ); 315 | } 316 | 317 | md5_finish( &ctx, md5sum ); 318 | 319 | for( j = 0; j < 16; j++ ) 320 | { 321 | printf( "%02x", md5sum[j] ); 322 | } 323 | 324 | printf( " %s\n", argv[1] ); 325 | } 326 | 327 | return( 0 ); 328 | } 329 | 330 | #endif 331 | -------------------------------------------------------------------------------- /src/md5.h: -------------------------------------------------------------------------------- 1 | #ifndef _MD5_H 2 | #define _MD5_H 3 | 4 | #ifndef uint8 5 | #define uint8 unsigned char 6 | #endif 7 | 8 | #ifndef uint32 9 | #define uint32 unsigned long int 10 | #endif 11 | 12 | typedef struct 13 | { 14 | uint32 total[2]; 15 | uint32 state[4]; 16 | uint8 buffer[64]; 17 | } 18 | md5_context; 19 | 20 | void md5_starts( md5_context *ctx ); 21 | void md5_update( md5_context *ctx, uint8 *input, uint32 length ); 22 | void md5_finish( md5_context *ctx, uint8 digest[16] ); 23 | 24 | #endif /* md5.h */ 25 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(rvw) 3 | 4 | test_check("rvw") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-cmdline.R: -------------------------------------------------------------------------------- 1 | context("Check predictions against command line version of VW") 2 | library(rvw) 3 | 4 | # Switch to temporary directory 5 | curr_dir <- getwd() 6 | setwd(tempdir()) 7 | 8 | 
ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw")
ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw")

lda_data <- system.file("extdata", "lda_data.vw", package = "rvw")

multiclass_train_data <- system.file("extdata", "multiclass_train.vw", package = "rvw")
multiclass_test_data <- system.file("extdata", "multiclass_valid.vw", package = "rvw")

test_that("empty vwsetup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw"
    )
    # T/F replaced by TRUE/FALSE throughout this file (T and F are reassignable)
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session
    system(
        paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum, tolerance=1e-7)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("nn vwsetup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "nn",
        num_hidden = 4
    )
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session
    system(
        paste0("vw --nn 4 -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw --nn 4 -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})


test_that("vwsetup with custom arguments and cache works as CL version", {
    # Skip for now, because not yet finished fix for this case
    # skip("Problem is being fixed")
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        general_params = list(random_seed = 42, loss_function="logistic", link="logistic"),
        feature_params = list(bit_precision=20, ngram="A2", noconstant=TRUE),
        optimization_params = list(adaptive=FALSE, l1=1E-8)
    )
    test_vwmodel <- add_option(test_vwmodel, option = "boosting", num_learners=4)
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE, passes = 10)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw","binary_train.vw.cache", "binary_valid.vw.cache")

    # Command Line session
    system(
        paste0("vw --random_seed 42 --loss_function logistic --link logistic ",
               "--bit_precision 20 --ngram A2 --noconstant --l1 1e-08 --boosting 4 --passes 10 -c",
               " -d ", ext_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw","binary_train.vw.cache", "binary_valid.vw.cache")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("Updating model with new data works as CL version", {
    test_dir <- getwd()
    # Package session
    pk_mdl_file <- paste0(test_dir, "/", "pk_mdl.vw")
    test_vwmodel <- vwsetup(
        dir = test_dir,
        model = "pk_mdl.vw"
    )
    vwtrain(test_vwmodel, data = ext_train_data, update_model = TRUE, quiet = TRUE)
    vw_pk_initial_mdl_checksum <- unname(tools::md5sum(pk_mdl_file))
    vwtrain(test_vwmodel, data = ext_test_data, update_model = TRUE, quiet = TRUE)
    vw_pk_updated_mdl_checksum <- unname(tools::md5sum(pk_mdl_file))
    # NOTE(review): S3 method called directly; prefer predict(test_vwmodel, ...)
    vw_pk_output <- predict.vw(test_vwmodel, data = ext_test_data, quiet = TRUE)
    file.remove(pk_mdl_file)

    # Command Line session
    cl_mdl_file <- paste0(test_dir, "/", "cl_mdl.vw")
    system(
        paste0("vw",
               " -d ", ext_train_data, " -f ", cl_mdl_file, " --save_resume --quiet"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_initial_mdl_checksum <- unname(tools::md5sum(cl_mdl_file))
    system(
        paste0("vw",
               " -d ", ext_test_data, " -i ", cl_mdl_file, " -f ", cl_mdl_file, " --save_resume --quiet"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_updated_mdl_checksum <- unname(tools::md5sum(cl_mdl_file))
    vw_cl_output <- as.numeric(
        system(
            paste0("vw",
                   " -t -d ", ext_test_data, " -i ", cl_mdl_file, " -p /dev/stdout --quiet"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    file.remove(cl_mdl_file)

    # Results comparison
    expect_equal(vw_pk_initial_mdl_checksum, vw_cl_initial_mdl_checksum)
    expect_equal(vw_pk_updated_mdl_checksum, vw_cl_updated_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("vwsetup with multiclass classification setup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "ect",
        num_classes = 3
    )
    vwtrain(test_vwmodel, data = multiclass_train_data, quiet = TRUE, passes = 4)
    vw_pk_output <- vwtest(test_vwmodel, data = multiclass_test_data, quiet = TRUE)
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw","multiclass_train.vw.cache", "multiclass_valid.vw.cache")

    # Command Line session
    system(
        paste0("vw --passes 4 --cache --ect 3",
               " -d ", multiclass_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    system(
        paste0("vw",
               " -t -d ", multiclass_test_data, " -i ./cl_mdl.vw -p ./cl_probs.out"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- read.table(file = "./cl_probs.out", sep = " ", header = FALSE)
    vw_cl_output <- vw_cl_output$V1
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw", "multiclass_train.vw.cache", "multiclass_valid.vw.cache", "cl_probs.out")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("vwsetup with lda setup works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "lda",
        num_topics = 7,
        lda_D = 100,
        minibatch = 16,
        math_mode = "accuracy"
    )
    vwtrain(test_vwmodel, data = lda_data, quiet = TRUE, passes = 2,
            readable_model = "hashed", readable_model_path = "pk_readable_mdl.vw")
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    vw_pk_readable_mdl_checksum <- unname(tools::md5sum("pk_readable_mdl.vw"))
    file.remove("pk_mdl.vw", "pk_readable_mdl.vw", "lda_data.vw.cache")

    # Command Line session
    system(
        paste0("vw --lda 7 --lda_D 100 --math-mode accuracy --minibatch 16 --passes 2",
               " --cache_file ./lda_data.vw.cache",
               " --readable_model ./cl_readable_mdl.vw",
               " -d ", lda_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    vw_cl_readable_mdl_checksum <- unname(tools::md5sum("cl_readable_mdl.vw"))
    file.remove("cl_mdl.vw", "cl_readable_mdl.vw", "lda_data.vw.cache")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum)
    expect_equal(vw_pk_readable_mdl_checksum, vw_cl_readable_mdl_checksum)
})

test_that("vwsetup with multicolumn output", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "oaa",
        num_classes = 3
    )
    vwtrain(test_vwmodel, data = multiclass_train_data, quiet = TRUE, passes = 2)
    vw_pk_output <- vwtest(test_vwmodel, data = multiclass_test_data, quiet = TRUE, full_probs = TRUE)
    write.table(vw_pk_output, file = "pk_probs.out", sep = " ", quote = FALSE, row.names = FALSE, col.names = FALSE)
    vw_pk_probs_checksum <- unname(tools::md5sum("pk_probs.out"))

    file.remove("pk_mdl.vw", "pk_probs.out", "multiclass_train.vw.cache")

    # Command Line session
    system(
        paste0("vw --oaa 3 --passes 2 --cache",
               " -d ", multiclass_train_data, " -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    system(
        paste0("vw",
               " -t -d ", multiclass_test_data, " -i ./cl_mdl.vw -p ./cl_probs.out"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_probs_checksum <- unname(tools::md5sum("cl_probs.out"))

    file.remove("cl_mdl.vw", "cl_probs.out", "multiclass_train.vw.cache")

    # Results comparison
    expect_equal(vw_pk_probs_checksum, vw_cl_probs_checksum)
})

test_that("vwsetup with df2vw conversion works as CL version", {
    # Package session
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "oaa",
        num_classes = 3
    )

    data_full <- iris
    levels(data_full$Species) <- c(1, 2, 3)
    # NOTE(review): sample() without set.seed() makes this split non-deterministic — confirm intended
    ind_train <- sample(1:nrow(data_full), 0.8*nrow(data_full))

    vwtrain(test_vwmodel, data = data_full[ind_train,], quiet = TRUE,
            targets = "Species")
    vw_pk_output <- vwtest(test_vwmodel, data = data_full[-ind_train,], quiet = TRUE,
                           targets = "Species")
    vw_pk_mdl_checksum <- unname(tools::md5sum("pk_mdl.vw"))
    file.remove("pk_mdl.vw")

    # Command Line session

    df2vw(data = data_full[ind_train,], file_path = "cl_train.vw",
          targets = "Species")
    df2vw(data = data_full[-ind_train,], file_path = "cl_test.vw",
          targets = "Species")

    system(
        paste0("vw --oaa 3 -d ./cl_train.vw -f ./cl_mdl.vw"),
        intern = FALSE,
        ignore.stderr = TRUE
    )
    vw_cl_output <- as.numeric(
        system(
            paste0("vw -t -d ./cl_test.vw -i ./cl_mdl.vw -p /dev/stdout"),
            intern = TRUE,
            ignore.stderr = TRUE
        )
    )
    vw_cl_mdl_checksum <- unname(tools::md5sum("cl_mdl.vw"))
    file.remove("cl_mdl.vw", "cl_train.vw", "cl_test.vw")

    # Results comparison
    expect_equal(vw_pk_mdl_checksum, vw_cl_mdl_checksum, tolerance=1e-7)
    expect_equal(vw_pk_output, vw_cl_output, tolerance=1e-7)
})

test_that("print.vw outputs correct results to console", {
    test_vwmodel <- vwsetup(
        dir = "./",
        model = "pk_mdl.vw",
        option = "boosting",
        num_learners = 10
    )
    vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
    vw_pk_output <- vwtest(test_vwmodel, data = ext_test_data, quiet = TRUE)

    capture.output(test_vwmodel, file = "pk_print.out")

    vw_pk_print_checksum <- unname(tools::md5sum("pk_print.out"))
    ref_print_file <- system.file("extdata", "ref_print.out", package = "rvw")
    vw_ref_print_checksum <- unname(tools::md5sum(ref_print_file))

    file.remove("pk_mdl.vw", "pk_print.out")

    expect_equal(vw_pk_print_checksum, vw_ref_print_checksum)
})

# Return back
setwd(curr_dir)
--------------------------------------------------------------------------------
/tests/testthat/test-err.R:
--------------------------------------------------------------------------------
context("vw error messages")
library(rvw)

# A plain list mimicking the shape of a vw model, but without class "vw",
# used to trigger the class-validation errors below.
fake_vwmodel <- list(params = list(algorithm = "sgd",
                                   general_params = list(),
                                   feature_params = list(),
                                   optimization_params = list(),
                                   options = list()

),
dir = "../my_tmp",
model = "mdl.vw",
params_str = paste0(""),
is_cl = FALSE,
data_md5sum = list(train = "",
                   test = ""),
train_file = "",
eval = list(
    train=list(),
    test=list()
),
parser_opts=NA
)


test_that(".check_parameters raises correct errors", {
    # Wrong argument names

    # General params
    expect_error(
        vwsetup(general_params = list(wrong_param_1=10, ting_size=10)),
        "Wrong argument names: wrong_param_1, ting_size",
        fixed = TRUE
    )
    # Feature params
    expect_error(
        vwsetup(feature_params = list(bit_precision=25, featurelimit=100)),
        "Wrong argument names: featurelimit",
        fixed = TRUE
    )
    # Optimization params
    expect_error(
        vwsetup(optimization_params = list(initial_p=0.1, l1=1E-7)),
        "Wrong argument names: initial_p",
        fixed = TRUE
    )
    # Option
    expect_error(
        vwsetup(option = "nn", num_hidden = 10, inpas = TRUE),
        "Wrong argument names: inpas",
        fixed = TRUE
    )


    # Wrong argument values

    # General params
    expect_error(
        vwsetup(general_params = list(random_seed="10", ring_size=10)),
        "Wrong argument values: random_seed",
        fixed = TRUE
    )
    # Feature params
    # This test should be changed in future, because we want to accept both real and integer numbers
    expect_error(
        vwsetup(feature_params = list(bit_precision=25L, noconstant="foo")),
        "Wrong argument values: bit_precision, noconstant",
        fixed = TRUE
    )
    # Optimization params
    expect_error(
        vwsetup(optimization_params = list(no_bias_regularization="on", feature_mask=1E-7)),
        "Wrong argument values: feature_mask",
        fixed = TRUE
    )
    # Option
    expect_error(
        vwsetup(option = "nn", num_hidden = "10", inpass = "TRUE"),
        "Wrong argument values: num_hidden, inpass",
        fixed = TRUE
    )


    # Missing first argument value in option parameters
    expect_error(
        vwsetup(option = "nn"),
        "Missing value for argument: num_hidden",
        fixed = TRUE
    )
})

test_that("vwsetup raises correct errors", {

    # Whitespace characters in dir path
    expect_error(
        vwsetup(dir = "./some folder/"),
        "Whitespace characters are not allowed in `dir` path",
        fixed = TRUE
    )

    # Whitespace characters in model path
    expect_error(
        vwsetup(model = "./some folder/mdl.vw"),
        "Whitespace characters are not allowed in `model` path",
        fixed = TRUE
    )

    # Forbidden flags in cmd line parameters
    expect_error(
        vwsetup(params_str = "--passes 10"),
        "Following cmd line parameters are defined in other functions:",
        fixed = TRUE
    )
})

test_that("add_option raises correct errors", {

    # vwmodel should be of class vw
    expect_error(
        add_option(fake_vwmodel, option = "nn", num_hidden = 10),
        "vwmodel should be of class vw",
        fixed = TRUE
    )

    # add_option can't be used with direct cmd line parameters
    test_vwmodel <- vwsetup(params_str = "--bit_precision 25")

    expect_error(
        add_option(test_vwmodel, option = "nn", num_hidden = 10),
        "add_option can't be used when cmd line parameters are used",
        fixed = TRUE
    )

    # Overwrite option
    test_vwmodel <- vwsetup(option = "nn", num_hidden = 5)

    expect_error(
        add_option(test_vwmodel, option = "nn", num_hidden = 10),
        "Trying to overwrite option",
        fixed = TRUE
    )
})

test_that("vwparams raises correct errors", {

    # vwmodel should be of class vw
    expect_error(
        vwparams(fake_vwmodel, name = "bit_precision"),
        "vwmodel should be of class vw",
        fixed = TRUE
    )

    # add_option can't be used with direct cmd line parameters
    test_vwmodel <- vwsetup(params_str = "--bit_precision 25")

    expect_error(
        vwparams(test_vwmodel, name = "bit_precision"),
        "vwparams can't be used when cmd line parameters are used",
        fixed = TRUE
    )
})
--------------------------------------------------------------------------------
/tests/testthat/test-parser.R:
--------------------------------------------------------------------------------
context("Check df2vw parser")
library(rvw)

# Switch to temporary directory
curr_dir <- getwd()
setwd(tempdir())

test_that("df2vw correctly parses data", {
    df2vw_path <- "df2vw.vw"
    ref_path <- "ref.vw"

    test_df <- data.frame(
        num_v1 = c(0.00005, 0.333333334, 10, 100000.314),
        fact_v2 = factor(c("a", "a", "b", "c")),
        text_v3 = rep("Et harum| (quid)em: rerum facilis!", 4),
        text_v4 = rep(" Et harum| (quid)em: rerum facilis!", 4),
        regular_label = c(1, 1.2, 4, 5.4),
        base = c(1, 1, 1, 1),
        multiline_label = c(0, 1, 1, 0),
        multilabel_1 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_2 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_3 = c(0.25, 0.25, 0.25, 0.25),
        multilabel_4 = c(0.25, 0.25, 0.25, 0.25),
        na_multilabel_1 = c(NA,
0.25, NA, 0.25), 25 | na_multilabel_2 = c(NA, NA, 0.25, 0.25), 26 | na_multilabel_3 = c(NA, NA, NA, NA), 27 | tag = c("ex1", "ex2", "ex3", "ex4"), 28 | importance = c("10", "0.5", "0.5", "4") 29 | ) 30 | 31 | ref_df = data.frame( 32 | features = c(" |NS1 num_v1:5e-05 fact_v2^a |NS2 fact_v2^a Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 33 | " |NS1 num_v1:0.333333334 fact_v2^a |NS2 fact_v2^a Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 34 | " |NS1 num_v1:10 fact_v2^b |NS2 fact_v2^b Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!", 35 | " |NS1 num_v1:100000.314 fact_v2^c |NS2 fact_v2^c Et harum_ _quid_em_ rerum facilis! |NS3 Et harum| (quid)em: rerum facilis!"), 36 | regular_labels = c("1 10 1 'ex1", "1.2 0.5 1 'ex2", "4 0.5 1 'ex3", "5.4 4 1 'ex4"), 37 | csoaa_labels = c("1:0.25 2:0.25 3:0.25 4:0.25 'ex1", "1:0.25 2:0.25 3:0.25 4:0.25 'ex2", 38 | "1:0.25 2:0.25 3:0.25 4:0.25 'ex3", "1:0.25 2:0.25 3:0.25 4:0.25 'ex4"), 39 | cb_labels = c("1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex1", 40 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex2", 41 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex3", 42 | "1:0.25:0.25 2:0.25:0.25 3:0.25:0.25 4:0.25:0.25 'ex4"), 43 | na_labels = c(" 'ex1", 44 | "1:0.25:0.25 'ex2", 45 | "2:0.25:0.25 'ex3", 46 | "1:0.25:0.25 2:0.25:0.25 'ex4"), 47 | multiline_labels = c("1:0", "2:1", "1:1", "2:0") 48 | ) 49 | 50 | # Regular labels 51 | cat("Regular labels\n") 52 | ref_file <- file(ref_path,"w") 53 | apply(ref_df, MARGIN = 1, function(x) { 54 | writeLines(text = paste0(x[["regular_labels"]], x[["features"]]), con = ref_file) 55 | }) 56 | close(ref_file) 57 | regular_ref_checksum <- unname(tools::md5sum(ref_path)) 58 | 59 | df2vw(data = test_df, file_path = df2vw_path, 60 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 61 | NS2 = c("fact_v2", "text_v3"), 62 | NS3 = c("text_v4")), 63 | keep_space = "text_v3", base = "base", 64 | fixed = 
"text_v4", 65 | targets = "regular_label", tag = "tag", weight = "importance") 66 | regular_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 67 | 68 | # CSOAA labels 69 | cat("CSOAA labels\n") 70 | ref_file <- file(ref_path,"w") 71 | apply(ref_df, MARGIN = 1, function(x) { 72 | writeLines(text = paste0(x[["csoaa_labels"]], x[["features"]]), con = ref_file) 73 | }) 74 | close(ref_file) 75 | csoaa_ref_checksum <- unname(tools::md5sum(ref_path)) 76 | 77 | df2vw(data = test_df, file_path = df2vw_path, 78 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 79 | NS2 = c("fact_v2", "text_v3"), 80 | NS3 = c("text_v4")), 81 | keep_space = "text_v3", 82 | fixed = "text_v4", 83 | targets = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 84 | tag = "tag", weight = "importance") 85 | csoaa_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 86 | 87 | # Context Bandit labels 88 | cat("Context Bandit labels\n") 89 | ref_file <- file(ref_path,"w") 90 | apply(ref_df, MARGIN = 1, function(x) { 91 | writeLines(text = paste0(x[["cb_labels"]], x[["features"]]), con = ref_file) 92 | }) 93 | close(ref_file) 94 | cb_ref_checksum <- unname(tools::md5sum(ref_path)) 95 | 96 | df2vw(data = test_df, file_path = df2vw_path, 97 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 98 | NS2 = c("fact_v2", "text_v3"), 99 | NS3 = c("text_v4")), 100 | keep_space = "text_v3", 101 | fixed = "text_v4", 102 | targets = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 103 | probabilities = c("multilabel_1", "multilabel_2", "multilabel_3", "multilabel_4"), 104 | tag = "tag", weight = "importance") 105 | cb_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 106 | 107 | # CSOAA when not all labels are allowed 108 | cat("CSOAA when not all labels are allowed\n") 109 | ref_file <- file(ref_path,"w") 110 | apply(ref_df, MARGIN = 1, function(x) { 111 | writeLines(text = paste0(x[["na_labels"]], x[["features"]]), con = ref_file) 112 | }) 113 | close(ref_file) 114 | 
na_ref_checksum <- unname(tools::md5sum(ref_path)) 115 | 116 | df2vw(data = test_df, file_path = df2vw_path, 117 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 118 | NS2 = c("fact_v2", "text_v3"), 119 | NS3 = c("text_v4")), 120 | keep_space = "text_v3", 121 | fixed = "text_v4", 122 | targets = c("na_multilabel_1", "na_multilabel_2", "na_multilabel_3"), 123 | probabilities = c("multilabel_1", "multilabel_2", "multilabel_3"), 124 | tag = "tag", weight = "importance") 125 | na_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 126 | 127 | # Multiline CSOAA 128 | cat("Multiline CSOAA\n") 129 | ref_file <- file(ref_path,"w") 130 | ref_df$lines <- apply(ref_df, MARGIN = 1, function(x) { 131 | paste0(x[["multiline_labels"]], x[["features"]]) 132 | }) 133 | writeLines(text = paste0(ref_df$lines, c("", "\n"), collapse = "\n"), con = ref_file) 134 | close(ref_file) 135 | mult_ref_checksum <- unname(tools::md5sum(ref_path)) 136 | 137 | df2vw(data = test_df, file_path = df2vw_path, 138 | namespaces = list(NS1 = c("num_v1", "fact_v2"), 139 | NS2 = c("fact_v2", "text_v3"), 140 | NS3 = c("text_v4")), 141 | keep_space = "text_v3", 142 | fixed = "text_v4", 143 | targets = "multiline_label", 144 | multiline = 2) 145 | mult_df2vw_checksum <- unname(tools::md5sum(df2vw_path)) 146 | 147 | 148 | file.remove(ref_path, df2vw_path) 149 | 150 | # Results comparison 151 | expect_equal(regular_df2vw_checksum, regular_ref_checksum) 152 | expect_equal(csoaa_df2vw_checksum, csoaa_ref_checksum) 153 | expect_equal(cb_df2vw_checksum, cb_ref_checksum) 154 | expect_equal(na_df2vw_checksum, na_ref_checksum) 155 | expect_equal(mult_df2vw_checksum, mult_ref_checksum) 156 | 157 | }) 158 | 159 | # Return back 160 | setwd(curr_dir) 161 | -------------------------------------------------------------------------------- /tests/testthat/test-utils.R: -------------------------------------------------------------------------------- 1 | context("Check auxiliary functionality") 2 | library(rvw) 3 | 4 | # 
# --- tests/testthat/test-utils.R ---

context("Check auxiliary functionality")
library(rvw)

# Run inside a temporary directory so model files never pollute the
# package source tree.
curr_dir <- getwd()
setwd(tempdir())

# Example data shipped with the package.
ext_train_data <- system.file("extdata", "binary_train.vw", package = "rvw")
ext_test_data <- system.file("extdata", "binary_valid.vw", package = "rvw")

test_that("vwtrain and vwtest output correct readable model", {
  # Package session: train/test through the rvw wrappers and checksum the
  # readable model files they produce.
  test_vwmodel <- vwsetup(dir = "./", model = "pk_mdl.vw")
  # vwtrain
  vwtrain(test_vwmodel, data = ext_train_data, readable_model = "hashed", quiet = TRUE)
  vw_pk_train_hashed_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  test_vwmodel <- vwsetup(dir = "./", model = "pk_mdl.vw")
  vwtrain(test_vwmodel, data = ext_train_data, readable_model = "inverted", quiet = TRUE)
  vw_pk_train_inverted_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  # vwtest
  vwtest(test_vwmodel, data = ext_test_data, readable_model = "hashed", quiet = TRUE)
  vw_pk_test_hashed_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))
  vwtest(test_vwmodel, data = ext_test_data, readable_model = "inverted", quiet = TRUE)
  vw_pk_test_inverted_mdl_checksum <- unname(tools::md5sum("readable_pk_mdl.vw"))

  file.remove("pk_mdl.vw", "readable_pk_mdl.vw")

  # Command line session: the same runs through the vw binary directly.
  # train
  system(
    paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw --readable_model ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_train_hashed_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  system(
    paste0("vw -d ", ext_train_data, " -f ./cl_mdl.vw --invert_hash ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_train_inverted_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  # test
  system(
    paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw --readable_model ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_test_hashed_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  system(
    paste0("vw -t -d ", ext_test_data, " -i ./cl_mdl.vw --invert_hash ./readable_cl_mdl.vw"),
    intern = FALSE,
    ignore.stderr = TRUE
  )
  vw_cl_test_inverted_mdl_checksum <- unname(tools::md5sum("readable_cl_mdl.vw"))
  file.remove("cl_mdl.vw", "readable_cl_mdl.vw")

  # Results comparison: package output must match command line output.
  expect_equal(vw_pk_train_hashed_mdl_checksum, vw_cl_train_hashed_mdl_checksum)
  expect_equal(vw_pk_test_hashed_mdl_checksum, vw_cl_test_hashed_mdl_checksum)
  expect_equal(vw_pk_train_inverted_mdl_checksum, vw_cl_train_inverted_mdl_checksum)
  expect_equal(vw_pk_test_inverted_mdl_checksum, vw_cl_test_inverted_mdl_checksum)
})

test_that("vwaudit outputs correct audit data.frame", {
  # Expected audit output for a model trained on the bundled binary data.
  ref_df <- data.frame(Names = c("A^carat", "A^depth", "A^table", "A^price", "A^x",
                                 "A^z", "A^cut_Very_Good", "A^color_G", "A^clarity_SI1",
                                 "Constant", "A^cut_Premium", "A^color_I", "A^clarity_SI2",
                                 "A^cut_Good", "A^color_E", "A^clarity_VS2", "A^cut_Ideal",
                                 "A^color_D", "A^color_H", "A^color_F", "A^clarity_IF",
                                 "A^clarity_VS1", "A^cut_Fair", "A^color_J", "A^clarity_VVS2",
                                 "A^clarity_VVS1", "A^clarity_I1"),
                       Hashes = c(161523, 255131, 191106, 174484, 157305, 71870,
                                  197774, 147043, 202990, 116060, 179903, 131053,
                                  102042, 1176, 164360, 113391, 116290, 58595, 87066,
                                  240073, 1556, 114685, 151473, 32836, 101424, 80982, 141904),
                       V1 = c(-0.345847010612488, 0.00200122990645468,
                              0.00143359997309744, -5.60191983822733e-05,
                              -0.0254513993859291, -0.0392897985875607, 0.159768000245094,
                              0.130854994058609, 0.0361801981925964, 0.119374997913837,
                              0.0958541035652161, -0.0784583017230034, -0.191651001572609,
                              0.144849002361298, 0.349283009767532, 0.0694333985447884,
                              0.00745435990393162, -0.0727915987372398, -0.0811441987752914,
                              0.273036986589432, 0.126379996538162, 0.171755000948906,
                              -0.108182996511459, -0.328087002038956, 0.246926993131638,
                              0.451092004776001, -0.148938998579979))

  test_vwmodel <- vwsetup()
  vwtrain(test_vwmodel, data = ext_train_data, quiet = TRUE)
  aud_df <- vwaudit(test_vwmodel, quiet = TRUE)

  expect_equal(aud_df, ref_df)
})

test_that("vwparams correctly returns and sets parameter values", {

  test_vwmodel <- vwsetup(general_params = list(link = "identity", holdout_off = FALSE),
                          feature_params = list(bit_precision = 10),
                          option = "nn", num_hidden = 5)

  # Character value
  vwparams(test_vwmodel, name = "link") <- "logistic"
  expect_equal(vwparams(test_vwmodel, name = "link"), "logistic")

  # Numerical value
  vwparams(test_vwmodel, name = "bit_precision") <- 25
  expect_equal(vwparams(test_vwmodel, name = "bit_precision"), 25)

  # Logical value
  vwparams(test_vwmodel, name = "holdout_off") <- TRUE
  expect_equal(vwparams(test_vwmodel, name = "holdout_off"), TRUE)

  # Option value
  vwparams(test_vwmodel, name = "num_hidden") <- 10
  expect_equal(vwparams(test_vwmodel, name = "num_hidden"), 10)
})

# Return to the original working directory
setwd(curr_dir)
# --- tests/testthat/test-vwsetup.R ---

context("vwsetup")
library(rvw)

# Reference model object: the exact structure vwsetup() is expected to
# return for an empty setup. The mode-specific tests below clone and
# tweak this baseline.
test_model <- list(params = list(algorithm = "sgd",
                                 general_params = list(random_seed = 0,
                                                       ring_size = NA_real_,
                                                       holdout_off = FALSE,
                                                       holdout_period = 10,
                                                       holdout_after = 0,
                                                       early_terminate = 3,
                                                       loss_function = NA_character_,
                                                       link = NA_character_,
                                                       quantile_tau = 0.5),
                                 feature_params = list(bit_precision = 18,
                                                       quadratic = NA_character_,
                                                       cubic = NA_character_,
                                                       interactions = NA_character_,
                                                       permutations = FALSE,
                                                       leave_duplicate_interactions = FALSE,
                                                       noconstant = FALSE,
                                                       feature_limit = NA_character_,
                                                       ngram = NA_character_,
                                                       skips = NA_character_,
                                                       hash = NA_character_,
                                                       affix = NA_character_,
                                                       spelling = NA_character_,
                                                       interact = NA_character_),
                                 optimization_params = list(adaptive = TRUE,
                                                            normalized = TRUE,
                                                            invariant = TRUE,
                                                            adax = FALSE,
                                                            sparse_l2 = 0,
                                                            l1_state = 0,
                                                            l2_state = 1,
                                                            learning_rate = 0.5,
                                                            initial_pass_length = NA_real_,
                                                            l1 = 0,
                                                            l2 = 0,
                                                            no_bias_regularization = NA_character_,
                                                            feature_mask = NA_character_,
                                                            decay_learning_rate = 1,
                                                            initial_t = 0,
                                                            power_t = 0.5,
                                                            initial_weight = 0,
                                                            random_weights = "off",
                                                            normal_weights = "off",
                                                            truncated_normal_weights = "off",
                                                            sparse_weights = FALSE,
                                                            input_feature_regularizer = NA_character_),
                                 options = list()
                                 ),
                   dir = "../my_tmp",
                   model = "mdl.vw",
                   params_str = "",
                   is_cl = FALSE,
                   data_md5sum = list(train = "",
                                      test = ""),
                   train_file = "",
                   eval = list(
                     train = list(
                       num_examples = NA_real_,
                       weighted_example_sum = NA_real_,
                       weighted_label_sum = NA_real_,
                       avg_loss = NA_real_,
                       avg_multiclass_log_loss = NA_real_,
                       best_const = NA_real_,
                       best_const_loss = NA_real_,
                       total_feature = NA_real_
                     ),
                     test = list(
                       num_examples = NA_real_,
                       weighted_example_sum = NA_real_,
                       weighted_label_sum = NA_real_,
                       avg_loss = NA_real_,
                       avg_multiclass_log_loss = NA_real_,
                       best_const = NA_real_,
                       best_const_loss = NA_real_,
                       total_feature = NA_real_
                     )
                   ),
                   parser_opts = NA
)
class(test_model) <- "vw"

test_that("vwsetup correctly setup model with different learning modes", {
  # Empty setup
  expect_equal(vwsetup(dir = "../my_tmp/", model = "mdl.vw"),
               test_model)

  # CMD line setup: raw parameter string bypasses structured params.
  cmd_test_model <- test_model
  cmd_test_model$params_str <- "-b 25 --link glm"
  cmd_test_model$is_cl <- TRUE
  cmd_test_model$params$general_params <- list()
  cmd_test_model$params$feature_params <- list()
  cmd_test_model$params$optimization_params <- list()
  cmd_test_model$params$algorithm <- NA
  expect_equal(vwsetup(dir = "../my_tmp/", model = "mdl.vw", params_str = "-b 25 --link glm"),
               cmd_test_model)

  # Reference test model for nn mode
  nn_test_model <- test_model
  nn_test_model$params$options <- list(nn = list(num_hidden = 3,
                                                 inpass = FALSE,
                                                 multitask = FALSE,
                                                 dropout = FALSE,
                                                 meanfield = FALSE))
  nn_test_model$params_str <- "--nn 3"
  expect_equal(
    vwsetup(dir = "../my_tmp/", model = "mdl.vw", option = "nn", num_hidden = 3),
    nn_test_model
  )

  # Reference test model for lda mode
  lda_test_model <- test_model
  lda_test_model$params$options <- list(lda = list(num_topics = 5,
                                                   lda_alpha = 0.100000001,
                                                   lda_rho = 0.100000001,
                                                   lda_D = 10000,
                                                   lda_epsilon = 0.00100000005,
                                                   math_mode = NA_character_,
                                                   minibatch = 1,
                                                   metrics = 0))
  lda_test_model$params_str <- "--lda 5"

  expect_equal(
    vwsetup(dir = "../my_tmp/", model = "mdl.vw", option = "lda", num_topics = 5),
    lda_test_model
  )

  # Reference test model with custom parameters
  custom_test_model <- test_model
  custom_test_model$params$optimization_params$adaptive <- FALSE
  custom_test_model$params$options <- list(binary = list(binary = TRUE))
  custom_test_model$params_str <- "--binary"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    optimization_params = list(adaptive = FALSE),
    option = "binary"
  )
  expect_equal(test_vwmodel, custom_test_model)

  # Reference test model with Experience Replay
  replay_test_model <- test_model
  replay_test_model$params$options <- list(replay = list(level = "m",
                                                         buffer = 200,
                                                         count = 1))
  replay_test_model$params_str <- "--replay_m 200 --replay_m_count 1"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    option = "replay",
    level = "m",
    buffer = 200
  )
  expect_equal(test_vwmodel, replay_test_model)

  # Reference test model with Contextual Bandit Exploration with Action
  # Dependent Features
  cb_explore_test_model <- test_model
  cb_explore_test_model$params$options <- list(cb_explore = list(num_actions = 0,
                                                                 explore_type = "bag",
                                                                 explore_arg = 10,
                                                                 psi = 1,
                                                                 nounif = FALSE,
                                                                 mellowness = 0.1,
                                                                 greedify = FALSE,
                                                                 lambda = -1,
                                                                 cb_min_cost = 0,
                                                                 cb_max_cost = 1,
                                                                 first_only = FALSE))
  cb_explore_test_model$params_str <- "--cb_explore_adf --bag 10"
  # Package vwmodel setup
  test_vwmodel <- vwsetup(
    dir = "../my_tmp/",
    model = "mdl.vw",
    option = "cb_explore",
    num_actions = 0,
    explore_type = "bag",
    explore_arg = 10
  )
  expect_equal(test_vwmodel, cb_explore_test_model)
})
# --- tools/r_configure.R ---
# Locate the installed Vowpal Wabbit headers and generate src/Makevars
# from src/Makevars.in, prepending the discovered include path.

# For VW v8.6.1 (full header list, kept for reference):
# headers <- c('action_score.h', 'allreduce.h', 'array_parameters_dense.h',
#              'array_parameters.h', 'cb_explore.h', 'cb.h', 'comp_io.h',
#              'config.h', 'constant.h', 'cost_sensitive.h', 'crossplat_compat.h',
#              'error_reporting.h', 'example_predict.h', 'example.h', 'ezexample.h',
#              'feature_group.h', 'floatbits.h', 'global_data.h', 'hash.h',
#              'io_buf.h', 'label_parser.h', 'learner.h', 'loss_functions.h',
#              'memory.h', 'multiclass.h', 'multilabel.h', 'no_label.h',
#              'parse_example.h', 'parse_primitives.h', 'parser_helper.h',
#              'parser.h', 'simple_label.h', 'v_array.h', 'v_hashmap.h',
#              'vw_exception.h', 'vw_validate.h', 'vw.h', 'vwdll.h')

# For VW v8.6.1 without missing headers: every one of these must exist in
# a candidate directory for it to be accepted.
headers <- c('action_score.h', 'allreduce.h', 'cb_explore.h', 'cb.h', 'comp_io.h',
             'config.h', 'constant.h', 'cost_sensitive.h', 'crossplat_compat.h',
             'example.h', 'ezexample.h', 'feature_group.h', 'floatbits.h',
             'global_data.h', 'io_buf.h', 'label_parser.h', 'learner.h',
             'loss_functions.h', 'memory.h', 'multiclass.h', 'multilabel.h',
             'parse_example.h', 'parse_primitives.h', 'parser.h', 'simple_label.h',
             'v_array.h', 'v_hashmap.h', 'vw_exception.h', 'vw_validate.h', 'vw.h', 'vwdll.h')

# Candidate install locations: every prefix/include/suffix combination.
path_prefix_list <- c("/usr/local", "/usr", "/opt")
path_suffix_list <- c("vw", "vowpalwabbit", "")

search_path_list <- unlist(lapply(path_suffix_list, FUN = function(x) file.path(path_prefix_list, "include", x)))

# Scan the candidates; valid_path stays NULL unless one contains every header.
valid_path <- NULL
for (search_path in search_path_list) {
  headers_in_path <- headers %in% list.files(search_path)

  if (all(headers_in_path)) {
    # All headers are found
    valid_path <- search_path
    break
  } else if (any(headers_in_path)) {
    # Some (but not all) headers are found: report what is missing.
    # NOTE(review): this stops the search at the first partial match
    # instead of trying the remaining candidates -- confirm intended.
    cat(paste0("Missing headers in ", search_path, "\n"))
    cat(paste0(headers[!headers_in_path], collapse = ", "), "\n")
    break
  }
  # No headers found here: keep looking.
}

if (is.null(valid_path)) {
  stop("Can't find the proper 'include/vowpalwabbit' directory containing Vowpal Wabbit header files.", call. = FALSE)
}

# Valid path found
cat(paste0("Valid path: ", valid_path, "\n"))

if (!file.exists(file.path("src", "Makevars.in"))) {
  stop("No 'Makevars.in' file", call. = FALSE)
}

# Emit src/Makevars: the include line first, then Makevars.in verbatim.
makevars_in_lines <- readLines(file.path("src", "Makevars.in"))
include_line <- paste0("PKG_CPPFLAGS = -Iextra/ -I", valid_path)

makevars_out <- file(file.path("src", "Makevars"), "w")
writeLines(c(include_line, makevars_in_lines), con = makevars_out)
close(makevars_out)