├── .Rbuildignore ├── .github └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── DrakeModelling.Rproj ├── LICENSE ├── NAMESPACE ├── R ├── assert-model-input.R ├── assert-model-output.R ├── create-tfidf.R ├── create-vocabulary.R ├── download-and-read-data.R ├── map-to-dtm.R ├── model-execution-plan.R ├── model-training-plan.R ├── new-data-to-be-scored.R ├── read-review-file.R ├── sentiment.R ├── stem-tokeniser.R ├── submit-predictions.R ├── text-preprocessor.R ├── validate-model.R ├── validate-predictions.R └── zzz.R ├── README.md ├── inst ├── eda │ └── eda.Rmd ├── execute-model.R ├── img │ ├── drake-model-execution-plan.png │ ├── drake-model-training-plan.png │ └── failed-validation.png └── train-model.R ├── man ├── create_tfidf.Rd ├── create_vocabulary.Rd ├── download_and_read_data.Rd ├── map_to_dtm.Rd ├── model_execution_plan.Rd ├── model_training_plan.Rd ├── new_data_to_be_scored.Rd ├── read_review_file.Rd ├── sentiment.Rd ├── stem_tokeniser.Rd ├── submit_predictions.Rd ├── text_preprocessor.Rd ├── validate_model.Rd └── validate_predictions.Rd └── tests ├── testthat.R └── testthat ├── test-map-to-dtm.R ├── test-stem-tokeniser.R └── test-text-preprocessor.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^doc$ 4 | ^Meta$ 5 | ^LICENSE\.md$ 6 | ^\.github/workflows/R-CMD-check\.yaml$ 7 | ^.drake$ 8 | ^inst/artefacts$ 9 | ^inst/img$ 10 | ^artefacts$ 11 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: R-CMD-check 4 | 5 | jobs: 6 | R-CMD-check: 7 | runs-on: macOS-latest 8 | steps: 9 | - uses: actions/checkout@v1 10 | - uses: r-lib/actions/setup-r@master 11 | - name: Install dependencies 12 | run: Rscript -e "install.packages(c('remotes', 'rcmdcheck'))" -e "remotes::install_deps(dependencies = TRUE)" 13 | - name: Check 14 | run: Rscript -e "rcmdcheck::rcmdcheck(args = '--no-manual', error_on = 'error')" 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .drake 6 | inst/doc 7 | inst/extdata 8 | inst/artefacts 9 | inst/eda/*.html 10 | data 11 | vignettes/*.html 12 | vignettes/*.pdf 13 | doc 14 | Meta 15 | artefacts 16 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: DrakeModelling 2 | Type: Package 3 | Title: Using drake to Train and Execute a Model in a Custom Package 4 | Version: 0.1.0 5 | Author: David Neuzerling (@mdneuzerling) 6 | Maintainer: David Neuzerling 7 | Description: Drake is excellent for doing complicated things like model training or execution. I want to combine that with the benefits of custom R packages. 
8 | Depends: 9 | R (>= 3.4.0) 10 | Imports: 11 | dplyr, 12 | readr, 13 | text2vec, 14 | tm, 15 | tidytext, 16 | SnowballC, 17 | randomForest, 18 | assertthat, 19 | drake, 20 | here, 21 | rlang 22 | License: MIT + file LICENSE 23 | Encoding: UTF-8 24 | LazyData: true 25 | RoxygenNote: 7.1.0 26 | Suggests: 27 | devtools, 28 | ggplot2, 29 | naniar, 30 | purrr, 31 | usethis, 32 | withr, 33 | wordcloud, 34 | knitr, 35 | rmarkdown, 36 | tibble, 37 | testthat (>= 2.1.0) 38 | VignetteBuilder: knitr 39 | URL: https://github.com/mdneuzerling/DrakeModelling 40 | BugReports: https://github.com/mdneuzerling/DrakeModelling/issues 41 | -------------------------------------------------------------------------------- /DrakeModelling.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Murray David Neuzerling 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(create_tfidf) 4 | export(create_vocabulary) 5 | export(download_and_read_data) 6 | export(map_to_dtm) 7 | export(model_execution_plan) 8 | export(model_training_plan) 9 | export(new_data_to_be_scored) 10 | export(read_review_file) 11 | export(sentiment) 12 | export(stem_tokeniser) 13 | export(submit_predictions) 14 | export(text_preprocessor) 15 | export(validate_model) 16 | export(validate_predictions) 17 | importFrom(assertthat,assert_that) 18 | importFrom(dplyr,"%>%") 19 | importFrom(drake,trigger) 20 | importFrom(randomForest,randomForest) 21 | importFrom(rlang,.data) 22 | -------------------------------------------------------------------------------- /R/assert-model-input.R: -------------------------------------------------------------------------------- 1 | # context("sentiment") 2 | # 3 | # test_that("'bad' is bad sentiment", { 4 | # expect_equal(sentiment("bad"), "bad") 5 | # }) 6 | # 7 | # test_that("'love' is good sentiment", { 8 | # expect_equal(sentiment("love"), "good") 9 | # }) 10 | -------------------------------------------------------------------------------- /R/assert-model-output.R: -------------------------------------------------------------------------------- 1 | # context("sentiment") 2 | # 3 | # test_that("'bad' is bad sentiment", { 4 | # expect_equal(sentiment("bad"), "bad") 5 | # }) 6 | # 7 | # test_that("'love' is good sentiment", { 8 | # expect_equal(sentiment("love"), "good") 9 | # }) 10 | -------------------------------------------------------------------------------- /R/create-tfidf.R: -------------------------------------------------------------------------------- 1 | #' Create a fitted tf-idf 2 | #' 3 | #' The text2vec package uses a Pythonic style of code to create a fitted term 4 | #' frequency - inverse document frequency (tf-idf) object. A new tfidf object 5 | #' is initialised and then the tfidf$fit_transform method is applied to an 6 | #' unweighted document-term matrix. This style of code is unusual in R and 7 | #' can cause issues with drake. This function creates a fitted tfidf object 8 | #' with a more functional and R-idiomatic style of code. 9 | #' 10 | #' @param dtm_unweighted An unweighted document term matrix created by the 11 | #' map_to_dtm function, or an equivalent from text2vec. 12 | #' 13 | #' @return A fitted tfidf object created by the text2vec package 14 | #' @export 15 | #' 16 | create_tfidf <- function(dtm_unweighted) { 17 | tfidf <- text2vec::TfIdf$new() 18 | tfidf$fit_transform(dtm_unweighted) 19 | return(tfidf) 20 | } -------------------------------------------------------------------------------- /R/create-vocabulary.R: -------------------------------------------------------------------------------- 1 | #' Create a pruned vocabulary from a token iterator 2 | #' 3 | #' This function creates a vocabulary from a vector of documents. A 4 | #' vocabulary defines the domain of a natural language processing problem. 5 | #' Vocabularies are often used to create vectorisers, which allow novel pieces 6 | #' of text to be mapped to a vocabulary defined by a training set. To exclude 7 | #' frequently and infrequently occurring tokens, the vocabulary is often 8 | #' trimmed. This reduces the dimension of the problem to decrease training time 9 | #' and the potential for overfitting. 
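#' As an illustrative sketch of how this function chains with the rest of the package (the object names here are assumptions, but every call below is defined in this package or in text2vec, mirroring the training plan): vocab <- create_vocabulary(reviews$review); vectoriser <- text2vec::vocab_vectorizer(vocab); dtm <- map_to_dtm(reviews$review, vectoriser = vectoriser); tfidf <- create_tfidf(dtm).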
10 | #' 11 | #' @param documents A vector of characters, often sentences or paragraphs. 12 | #' @param doc_proportion_min Optional. A number between 0 and 1 which specifies 13 | #' the minimum proportion of documents in which a token appears in order to be 14 | #' included in the vocabulary. Defaults to 0 (no effect). 15 | #' @param doc_proportion_max Optional. A number between 0 and 1 which specifies 16 | #' the maximum proportion of documents in which a token appears in order to be 17 | #' included in the vocabulary. Defaults to 1 (no effect). 18 | #' @return A vocabulary object used in the text2vec package 19 | #' 20 | #' @export 21 | #' 22 | create_vocabulary <- function(documents, 23 | doc_proportion_min = 0, 24 | doc_proportion_max = 1) { 25 | # For some reason, the preprocessor and stem_tokeniser don't take effect if 26 | # I put them in the itoken function as values to the relevant arguments. 27 | # Please let me know if you understand why this is! 28 | processed_documents <- stem_tokeniser(text_preprocessor(documents)) 29 | 30 | # Warning: token iterators can only be used once. 31 | token_iterator <- text2vec::itoken( 32 | processed_documents, 33 | progressbar = FALSE 34 | ) 35 | 36 | vocabulary <- text2vec::create_vocabulary( 37 | token_iterator, 38 | stopwords = tidytext::stop_words$word 39 | ) 40 | 41 | vocabulary <- text2vec::prune_vocabulary( 42 | vocabulary, 43 | doc_proportion_min = doc_proportion_min, 44 | doc_proportion_max = doc_proportion_max 45 | ) 46 | 47 | return(vocabulary) 48 | } 49 | -------------------------------------------------------------------------------- /R/download-and-read-data.R: -------------------------------------------------------------------------------- 1 | #' Download and parse review data 2 | #' 3 | #' Given the URL of a .zip file, this function will download it and decompress 4 | #' it to a given location. 5 | #' 6 | #' This function was intended to download and uncompress the data located at 7 | #' https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences 8 | #' It is not a general function to download arbitrary data, but may be a 9 | #' useful template. 10 | #' 11 | #' @param data_source_url Character. 12 | #' 13 | #' @return Tibble of parsed data 14 | #' @importFrom rlang .data 15 | #' @export 16 | #' 17 | download_and_read_data <- function(data_source_url) { 18 | temp_dir <- tempdir() 19 | 20 | zip_file <- file.path(temp_dir, "zip_file") 21 | on.exit(unlink(zip_file, recursive = TRUE)) 22 | utils::download.file(data_source_url, zip_file) 23 | data_files <-c("amazon_cells_labelled.txt", 24 | "imdb_labelled.txt", 25 | "yelp_labelled.txt") 26 | data_files_in_zip <- file.path("sentiment labelled sentences", data_files) 27 | utils::unzip( 28 | zip_file, files = data_files_in_zip, junkpaths = TRUE, exdir = temp_dir 29 | ) 30 | downloaded_data_files <- file.path(temp_dir, data_files) 31 | on.exit(unlink(downloaded_data_files), add = TRUE) 32 | 33 | downloaded_data_files %>% 34 | purrr::map(read_review_file) %>% 35 | purrr::reduce(rbind) %>% 36 | dplyr::mutate(sentiment = ifelse(.data$sentiment == 1, "good", "bad")) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /R/map-to-dtm.R: -------------------------------------------------------------------------------- 1 | #' Create a document term matrix using a vectoriser 2 | #' 3 | #' This function uses a vectoriser created with the text2vec package to map 4 | #' a new piece of text, or vector of text, onto a document term matrix (dtm). 
5 | #' The vectoriser has a concept of a vocabulary, a set of tokens which 6 | #' determine the columns of the resulting document term matrix. Any term that 7 | #' doesn't match to a token in the vocabulary will be ignored. Optionally, the 8 | #' document term matrix can be weighted by a term frequency-inverse document 9 | #' frequency (tfidf) object, created with the text2vec::TfIdf function. 10 | #' 11 | #' @param x A character or vector of characters, usually sentences, paragraphs 12 | #' or similar pieces of natural language. 13 | #' @param vectoriser A vectoriser constructed with the text2vec package. 14 | #' @param tfidf A tfidf object constructed with the text2vec package. 15 | #' If tfidf is NULL, then an unweighted document term matrix will be 16 | #' returned. 17 | #' @return A document-term matrix with rows representing the textual objects in 18 | #' x, and columns representing the tokens in the vocabulary used to generate the 19 | #' given vectoriser. 20 | #' 21 | #' @export 22 | #' 23 | map_to_dtm <- function(x, 24 | vectoriser, 25 | tfidf = NULL) { 26 | processed_text <- stem_tokeniser(text_preprocessor(x)) 27 | 28 | # For some reason, the preprocessor and stem_tokeniser don't take effect if 29 | # I put them in the itoken function as values to the relevant arguments. 30 | # Please let me know if you understand why this is! 31 | tokens <- text2vec::itoken( 32 | processed_text, 33 | progressbar = FALSE 34 | ) 35 | 36 | # If the input contains no terms corresponding to the vocabulary used to 37 | # generate the vectoriser, then a warning will occur for an empty dtm. 38 | # Since this is a plausible scenario, we suppress the warning. 39 | suppressWarnings( 40 | dtm <- text2vec::create_dtm(tokens, vectoriser) 41 | ) 42 | 43 | if (!is.null(tfidf)) { 44 | dtm <- tfidf$transform(dtm) 45 | } 46 | 47 | return(dtm) 48 | } 49 | -------------------------------------------------------------------------------- /R/model-execution-plan.R: -------------------------------------------------------------------------------- 1 | #' drake plan for model execution 2 | #' 3 | #' @return A plan to be run with drake::make() 4 | #' @export 5 | #' 6 | model_execution_plan <- function() { 7 | drake::drake_plan( 8 | new_data = new_data_to_be_scored(), 9 | tfidf = readr::read_rds(file_in("artefacts/tfidf.rds")), 10 | vectoriser = readr::read_rds(file_in("artefacts/vectoriser.rds")), 11 | review_rf = readr::read_rds(file_in("artefacts/review_rf.rds")), 12 | predictions = sentiment(new_data$review, 13 | random_forest = review_rf, 14 | vectoriser = vectoriser, 15 | tfidf = tfidf), 16 | validation = validate_predictions(predictions), 17 | submit_predictions = target( 18 | submit_predictions(predictions), 19 | trigger = trigger(condition = validation, mode = "blacklist") 20 | ) 21 | ) 22 | } 23 | -------------------------------------------------------------------------------- /R/model-training-plan.R: -------------------------------------------------------------------------------- 1 | #' drake plan for model training 2 | #' 3 | #' @return A plan to be run with drake::make() 4 | #' @importFrom drake trigger 5 | #' @export 6 | #' 7 | model_training_plan <- function() { 8 | drake::drake_plan( 9 | reviews = target( 10 | download_and_read_data( 11 | file_in("https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip") 12 | ) 13 | ), 14 | vocabulary = create_vocabulary(reviews$review, 15 | doc_proportion_min = 25 / nrow(reviews)), 16 | vectoriser = 
text2vec::vocab_vectorizer(vocabulary), 17 | dtm_unweighted = map_to_dtm(reviews$review, 18 | vectoriser = vectoriser), 19 | tfidf = create_tfidf(dtm_unweighted), 20 | dtm_tfidf_weighted = map_to_dtm(reviews$review, 21 | vectoriser = vectoriser, 22 | tfidf = tfidf), 23 | review_rf = randomForest::randomForest( 24 | x = as.matrix(dtm_tfidf_weighted), 25 | y = factor(reviews$sentiment), 26 | ntree = 500 27 | ), 28 | validation = validate_model(review_rf, vectoriser, tfidf), 29 | output_model = drake::target( 30 | { 31 | dir.create("artefacts", showWarnings = FALSE) 32 | readr::write_rds(vectoriser, file_out("artefacts/vectoriser.rds")) 33 | readr::write_rds(tfidf, file_out("artefacts/tfidf.rds")) 34 | readr::write_rds(review_rf, file_out("artefacts/review_rf.rds")) 35 | }, 36 | trigger = drake::trigger(condition = validation, mode = "blacklist") 37 | ) 38 | ) 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /R/new-data-to-be-scored.R: -------------------------------------------------------------------------------- 1 | #' Retrieve new data that needs to be scored 2 | #' 3 | #' This function is meant to replicate a function that will retrieve new data 4 | #' that needs to be scored. This is a batch execution, rather than an on-demand 5 | #' API that might be stood up by the plumber package. In practice, the return 6 | #' is a fixed sample of 10 reviews. 7 | #' 8 | #' @return Data frame with two char columns: review and sentiment ("good"/"bad") 9 | #' @export 10 | #' 11 | new_data_to_be_scored <- function() { 12 | tibble::tribble( 13 | ~"review", ~"sentiment", 14 | "I'm still infatuated with this phone.", "good", 15 | "Strike 2, who wants to be rushed.", "bad", 16 | "I enjoyed reading this book to my children when they were little.", "good", 17 | "We had a group of 70+ when we claimed we would only have 40 and they handled us beautifully.", "good", 18 | "The story is lame, not interesting and NEVER really explains the sinister origins of the puppets", "bad", 19 | "Better than you'd expect.", "good", 20 | "It was a huge awkward 1.5lb piece of cow that was 3/4ths gristle and fat.", "bad", 21 | "Yes, it's that bad.", "bad", 22 | "I did not expect this to be so good!", "good", 23 | "The only redeeming quality of the restaurant was that it was very inexpensive.", "good" 24 | ) 25 | } -------------------------------------------------------------------------------- /R/read-review-file.R: -------------------------------------------------------------------------------- 1 | #' Read a review source data file into a data frame 2 | #' 3 | #' This function is specifically intended to parse the three data files in the 4 | #' sentiment analysis data set 5 | #' (https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). 6 | #' 7 | #' @param file_path character. A file path to a source .txt data file 8 | #' 9 | #' @export 10 | #' 11 | read_review_file <- function(file_path) { 12 | readr::read_tsv( 13 | file_path, 14 | col_names = c("review", "sentiment"), 15 | quote = "", 16 | col_types = readr::cols( 17 | review = readr::col_character(), 18 | sentiment = readr::col_integer() 19 | ) 20 | ) 21 | } 22 | -------------------------------------------------------------------------------- /R/sentiment.R: -------------------------------------------------------------------------------- 1 | #' Classify a review as good or bad 2 | #' 3 | #' @param x Text to be classified, ideally a one-sentence product review. 
4 | #' @param random_forest A model created with the randomForest package. 5 | #' @param vectoriser A vectoriser constructed with the text2vec package. 6 | #' @param tfidf A tfidf object constructed with the text2vec package. 7 | #' If tfidf is NULL, then weighting will not be applied. 8 | #' 9 | #' @importFrom randomForest randomForest 10 | #' @export 11 | #' 12 | sentiment <- function(x, random_forest, vectoriser, tfidf = NULL) { 13 | processed <- map_to_dtm(x, vectoriser = vectoriser, tfidf = tfidf) 14 | as.character(stats::predict(random_forest, processed)) 15 | } 16 | -------------------------------------------------------------------------------- /R/stem-tokeniser.R: -------------------------------------------------------------------------------- 1 | #' Stem and tokenise a sentence 2 | #' 3 | #' @param x A character or vector of characters, usually sentences, paragraphs 4 | #' or similar pieces of natural language. 5 | #' @return A vector of tokens 6 | #' 7 | #' @export 8 | #' 9 | #' @examples stem_tokeniser("information informed informing informs") 10 | #' 11 | stem_tokeniser <- function(x) { 12 | lapply( 13 | text2vec::word_tokenizer(x), 14 | SnowballC::wordStem, 15 | language = "en" 16 | ) 17 | } 18 | -------------------------------------------------------------------------------- /R/submit-predictions.R: -------------------------------------------------------------------------------- 1 | #' Simulate submission of output data 2 | #' 3 | #' This is a dummy function that simulates the submission of newly scored 4 | #' model results to an external platform. In actual fact, this function does 5 | #' nothing, but sleeps for 5 seconds to simulate an upload process. 6 | #' 7 | #' @param predictions Predictions to be submitted. Format is not important. 8 | #' 9 | #' @return Returns output, invisibly 10 | #' @export 11 | #' 12 | submit_predictions <- function(predictions) { 13 | Sys.sleep(5) 14 | invisible(predictions) 15 | } -------------------------------------------------------------------------------- /R/text-preprocessor.R: -------------------------------------------------------------------------------- 1 | #' Preprocess text before tokenisation 2 | #' 3 | #' This function applies simple preprocessing to raw text: case is lowered, 4 | #' and numbers and punctuation are removed. It is applied to documents before 5 | #' they are tokenised and stemmed. 6 | #' 7 | #' @param x Character. Text to be processed. 8 | #' @return The same character after processing. 9 | #' 10 | #' @importFrom dplyr %>% 11 | #' @export 12 | #' 13 | 14 | text_preprocessor <- function(x) { 15 | x %>% 16 | tolower %>% 17 | tm::removeNumbers() %>% 18 | tm::removePunctuation() 19 | } 20 | -------------------------------------------------------------------------------- /R/validate-model.R: -------------------------------------------------------------------------------- 1 | #' Assert that model makes sensible predictions 2 | #' 3 | #' This function tests model predictions against a small sample of inputs with 4 | #' predictable outputs. For example, a sentiment analysis model that returns 5 | #' a sentiment of "bad" for "love" should not be used. 
We can also demand a 6 | #' minimum level of accuracy according to one or more metrics. Any failed 7 | #' assertions will raise an error. This validation approach could also be used 8 | #' to perform rudimentary checks for model bias (in the ethics sense). 9 | #' 10 | #' @param random_forest A model created with the randomForest package. 11 | #' @param vectoriser A vectoriser constructed with the text2vec package. 12 | #' @param tfidf A tfidf object constructed with the text2vec package. 13 | #' If tfidf is NULL, then weighting will not be applied. 14 | #' 15 | #' @return TRUE if all assertions pass 16 | #' @export 17 | #' 18 | validate_model <- function(random_forest, vectoriser, tfidf = NULL) { 19 | model_sentiment <- function(x) sentiment(x, random_forest, vectoriser, tfidf) 20 | oob <- random_forest$err.rate[random_forest$ntree, "OOB"] 21 | 22 | assertthat::assert_that(model_sentiment("love") == "good") 23 | assertthat::assert_that(model_sentiment("bad") == "bad") 24 | assertthat::assert_that(oob < 0.4) 25 | 26 | TRUE 27 | } -------------------------------------------------------------------------------- /R/validate-predictions.R: -------------------------------------------------------------------------------- 1 | #' Assert that model predictions are sensible 2 | #' 3 | #' This function tests that model predictions make sense. It is intended to be 4 | #' run after new data has been scored, but before it has been submitted. We 5 | #' might ensure that outputs are sensible, within an expected range, or do not 6 | #' contain sensitive data. Failed assertions will raise an error. Upon success, 7 | #' TRUE will be returned. 8 | #' 9 | #' @param predictions A vector of model predictions 10 | #' 11 | #' @return TRUE if all assertions pass 12 | #' @export 13 | #' 14 | validate_predictions <- function(predictions) { 15 | assertthat::assert_that(all(predictions == "good" | predictions == "bad")) 16 | 17 | TRUE 18 | } -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | utils::globalVariables(codetools::findGlobals(model_training_plan)) 2 | utils::globalVariables(codetools::findGlobals(model_execution_plan)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R build status](https://github.com/mdneuzerling/DrakeModelling/workflows/R-CMD-check/badge.svg)](https://github.com/mdneuzerling/DrakeModelling) 3 | 4 | 5 | # DrakeModelling 6 | 7 | [drake](https://github.com/ropensci/drake) is an excellent tool for managing large and complicated workflows. It is the R equivalent of the `make` command, with a particular emphasis on using R functions. 8 | 9 | ## Concept 10 | 11 | When it comes to code, there are three major components to a machine learning project: 12 | 13 | 1) Exploratory data analysis (EDA) 14 | 2) Model training 15 | 3) Model execution 16 | 17 | These components are run independently of each other. EDA is a largely human task, and is usually only performed when the model is created or updated in some major way. The other two components need not operate together --- if model retraining is expensive, or new training data is infrequently available, we might retrain a model on some monthly basis while scoring new data on a daily basis. 
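In practice, the two cadences can simply be two scheduled jobs. This repository ships an entry-point script for each under `inst/`; the schedule itself is an assumption, sketched here only as illustrative comments:

```
# Illustrative scheduling sketch --- use whatever scheduler you have:
# monthly retraining:  Rscript inst/train-model.R
# daily scoring:       Rscript inst/execute-model.R
```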
18 | 19 | I pieced together this template that implements these three components using R-specific tools: 20 | 21 | 1) EDA --- **R Markdown** 22 | 2) Model training --- **drake** 23 | 3) Model execution --- **drake** 24 | 25 | All three of these components might use similar functions. Typically we would place all of these functions in a directory (almost always called `R/`) and `source` them as needed. Here I want to try to combine these components into a custom R package. 26 | 27 | R packages are the standard for complicated R projects. With packages, we gain access to the comprehensive `R CMD check`, as well as `testthat` unit tests and `roxygen2` documentation. I'm certainly not the first to combine drake with a package workflow, but I wanted to have a single repository that combines all elements of a machine learning project. 28 | 29 | This template uses a simple random forest sentiment analysis model, based on [labelled data available from the UCI machine learning repository](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). Drake takes care of the data caching for us. This means that we can, say, adjust the hyper-parameters of our model and rerun the training plan, and only the modelling step and onward will be rerun. 30 | 31 | This template considers machine learning workflows intended to be executed in batch --- for models that run as APIs, consider using `plumber` instead. 32 | 33 | ## Training and execution 34 | 35 | After cloning the repo, navigate to the directory in which the files are located. The easiest way to do this is to open the project in RStudio. 36 | 37 | Model training and execution plans are generated by functions in the package. The package doesn't actually need to be installed --- we can use `devtools::load_all()` to simulate the installation. The model can be trained with: 38 | 39 | ``` 40 | devtools::load_all() 41 | drake::make(model_training_plan()) 42 | ``` 43 | 44 | ![](inst/img/drake-model-training-plan.png) 45 | 46 | Model execution is run similarly: 47 | 48 | ``` 49 | devtools::load_all() 50 | drake::make(model_execution_plan()) 51 | ``` 52 | 53 | ![](inst/img/drake-model-execution-plan.png) 54 | 55 | Model artefacts --- the random forest model, the vectoriser, and the tfidf weightings --- are saved to and loaded from the `artefacts/` directory. This is an arbitrary choice. We could just as easily use a different directory or remote storage. 56 | 57 | I've simulated a production step with a `new_data_to_be_scored` function that returns a few reviews to be scored. Predictions are "submitted" through the `submit_predictions()` function. This function does nothing except sleep for 5 seconds. In practice we would submit model output wherever it needs to go --- locally, a cloud service, etc. It's hard to "productionise" a model when it's just a toy. 58 | 59 | The exploratory data analysis piece can be found in the `inst/eda/` directory. It is compiled with `knitr`. 60 | 61 | ## Model and prediction verification 62 | 63 | Both training and execution plans include a _verification_ step. These are functions that --- using the `assertthat` package --- ensure certain basic facts about the model and its predictions are true. If any of these assertions is false, an error is returned. 
64 | 65 | ``` 66 | validate_model <- function(random_forest, vectoriser, tfidf = NULL) { 67 | model_sentiment <- function(x) sentiment(x, random_forest, vectoriser, tfidf) 68 | oob <- random_forest$err.rate[random_forest$ntree, "OOB"] # out of bag error 69 | 70 | assertthat::assert_that(model_sentiment("love") == "good") 71 | assertthat::assert_that(model_sentiment("bad") == "bad") 72 | assertthat::assert_that(oob < 0.4) 73 | 74 | TRUE 75 | } 76 | ``` 77 | 78 | The model artefacts and predictions cannot be exported without passing this verification step. Their relevant drake targets are conditioned on the validation function returning `TRUE`: 79 | 80 | ``` 81 | output_model = drake::target( 82 | { 83 | dir.create("artefacts", showWarnings = FALSE) 84 | readr::write_rds(vectoriser, file_out("artefacts/vectoriser.rds")) 85 | readr::write_rds(tfidf, file_out("artefacts/tfidf.rds")) 86 | readr::write_rds(review_rf, file_out("artefacts/review_rf.rds")) 87 | }, 88 | trigger = drake::trigger(condition = validation, mode = "blacklist") 89 | ) 90 | ``` 91 | 92 | For example, suppose I changed the assertion above to demand that my model must have an out-of-bag error of less than 0.01% before it can be exported. My model isn't very good, however, so that step will error. The execution steps are dependent on that validation, and so they won't be run. 93 | 94 | ![](inst/img/failed-validation.png) 95 | 96 | The assertions I've included here are very basic. However, I think these steps of the plans are important and extensible. We could assert that a model: 97 | 98 | * produces sensible outputs, based on type or domain. 99 | * has an accuracy above a given threshold, based on one or more metrics. 100 | * does not produce outputs that are biased against a particular group. 101 | 102 | We could also assert that predictions of new data: 103 | 104 | * are sensible. 105 | * do not contain sensitive data. 106 | * are not biased against particular groups. 107 | -------------------------------------------------------------------------------- /inst/eda/eda.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploratory data analysis" 3 | author: "David Neuzerling" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r load_package, include = FALSE} 9 | library(tidyverse) 10 | package_root <- here::here() 11 | devtools::load_all(package_root) 12 | ``` 13 | 14 | # Data load 15 | 16 | I haven't kept the data in this git repository, opting instead to download it if it doesn't already exist. It's a fairly small data set though (3000 rows). 17 | 18 | `download_data` is a package function that downloads and unzips the source data into the `inst/extdata` directory (creating it if necessary). On package compilation, everything in the `inst` folder is moved up to the root directory of the package, and so we can find the `extdata` directory in the finished product. 19 | 20 | ```{r download-data} 21 | extdata <- file.path(package_root, "inst", "extdata") 22 | data_files <- c("amazon_cells_labelled.txt", 23 | "imdb_labelled.txt", 24 | "yelp_labelled.txt") %>% file.path(extdata, .) 25 | if (!all(file.exists(data_files))) { 26 | download_data(extdata) 27 | } 28 | ``` 29 | 30 | Data is loaded in with another custom function, `read_review_file`. This is just `readr::read_tsv` with some special options to cover the peculiarities of the raw data. All of these custom functions are documented and stored in the `R` directory. 
Once the package is installed, function manuals can be called in the usual way (e.g. `?read_review_file`). 31 | 32 | This is a simple analysis, so let's just stick to discrete categories for sentiment: "good" and "bad". I don't care too much about how the model performs, as long as it functions. 33 | 34 | ```{r load-data} 35 | reviews <- data_files %>% 36 | purrr::map(read_review_file) %>% 37 | purrr::reduce(rbind) %>% 38 | mutate(sentiment = ifelse(sentiment == 1, "good", "bad")) 39 | reviews %>% head 40 | ``` 41 | 42 | # Exploring data 43 | 44 | We check for missing data using the `naniar` package: 45 | 46 | ```{r naniar} 47 | reviews %>% naniar::miss_var_summary() 48 | ``` 49 | 50 | Let's take a look at which words are the most frequent. First we create a data frame such that each row is an occurrence of a word. Note that we remove stop words --- these are words like "the" that are common and usually provide little semantic content to the text. 51 | 52 | ```{r words} 53 | words <- reviews %>% 54 | tidytext::unnest_tokens( 55 | word, 56 | review 57 | ) %>% 58 | anti_join( 59 | tidytext::stop_words, 60 | by = "word" 61 | ) 62 | words %>% head 63 | ``` 64 | 65 | Now we'll plot the most frequently occurring words, keeping a note of which words are "good" and which words are "bad". 66 | 67 | ```{r word_frequency, fig.width = 6, fig.height = 6, out.height = 600, out.width = 600} 68 | words %>% 69 | count(word, sentiment, sort = TRUE) %>% 70 | head(20) %>% 71 | mutate(word = reorder(word, n)) %>% 72 | ggplot(aes(word, n, fill = sentiment)) + 73 | geom_col() + 74 | xlab(NULL) + 75 | theme(text = element_text(size = 16)) + 76 | coord_flip() + 77 | ggtitle("Frequency of words") 78 | ``` 79 | 80 | There are no surprises here! "Bad" is universally bad and "love" is universally good. It's comforting to see. We'll note this and use these words in our unit tests. 81 | 82 | I'm not sure what purpose word clouds serve, but they seem almost mandatory. 83 | 84 | ```{r word_cloud, fig.width = 6, fig.height = 6, out.height = 600, out.width = 600} 85 | words %>% 86 | count(word) %>% 87 | with( 88 | wordcloud::wordcloud( 89 | word, 90 | n, 91 | max.words = 100 92 | ) 93 | ) 94 | ``` 95 | 96 | 97 | # Preprocessing 98 | 99 | We need to apply some preprocessing to our text before we can feed it into a model. The first round of preprocessing is simply ignoring case, punctuation and numbers: 100 | 101 | ```{r text-preprocessor} 102 | text_preprocessor 103 | ``` 104 | 105 | I'm actually not sure that we *should* be removing numbers here. We're dealing with reviews, after all, and a review like "10/10" certainly tells us something about sentiment. But that's beyond the scope of this package. 106 | 107 | The next round of processing involves tokenising and stemming our words, with stemming stripping each word down to its base. 
Another custom function, `stem_tokeniser`, plays this role by calling on the Porter stemming algorithm: 108 | 109 | ```{r stem-tokeniser-example} 110 | stem_tokeniser("information informed informing informs") 111 | ``` 112 | 113 | -------------------------------------------------------------------------------- /inst/execute-model.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | package_name <- basename(here::here()) 3 | drake::expose_imports(package_name, character_only = TRUE) 4 | drake::make(model_execution_plan()) 5 | -------------------------------------------------------------------------------- /inst/img/drake-model-execution-plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/drake-model-execution-plan.png -------------------------------------------------------------------------------- /inst/img/drake-model-training-plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/drake-model-training-plan.png -------------------------------------------------------------------------------- /inst/img/failed-validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/failed-validation.png -------------------------------------------------------------------------------- /inst/train-model.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | package_name <- basename(here::here()) 3 | drake::expose_imports(package_name, character_only = TRUE) 4 | drake::make(model_training_plan()) 5 | -------------------------------------------------------------------------------- /man/create_tfidf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create-tfidf.R 3 | \name{create_tfidf} 4 | \alias{create_tfidf} 5 | \title{Create a fitted tf-idf} 6 | \usage{ 7 | create_tfidf(dtm_unweighted) 8 | } 9 | \arguments{ 10 | \item{dtm_unweighted}{An unweighted document term matrix created by the 11 | map_to_dtm function, or an equivalent from text2vec.} 12 | } 13 | \value{ 14 | A fitted tfidf object created by the text2vec package 15 | } 16 | \description{ 17 | The text2vec package uses a Pythonic style of code to create a fitted term 18 | frequency - inverse document frequency (tf-idf) object. A new tfidf object 19 | is initialised and then the tfidf$fit_transform method is applied to an 20 | unweighted document-term matrix. This style of code is unusual in R and 21 | can cause issues with drake. This function creates a fitted tfidf object 22 | with a more functional and R-idiomatic style of code. 
23 | } 24 | -------------------------------------------------------------------------------- /man/create_vocabulary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create-vocabulary.R 3 | \name{create_vocabulary} 4 | \alias{create_vocabulary} 5 | \title{Create a pruned vocabulary from a token iterator} 6 | \usage{ 7 | create_vocabulary(documents, doc_proportion_min = 0, doc_proportion_max = 1) 8 | } 9 | \arguments{ 10 | \item{documents}{A vector of characters, often sentences or paragraphs.} 11 | 12 | \item{doc_proportion_min}{Optional. A number between 0 and 1 which specifies 13 | the minimum proportion of documents in which a token appears in order to be 14 | included in the vocabulary. Defaults to 0 (no effect).} 15 | 16 | \item{doc_proportion_max}{Optional. A number between 0 and 1 which specifies 17 | the maximum proportion of documents in which a token appears in order to be 18 | included in the vocabulary. Defaults to 1 (no effect).} 19 | } 20 | \value{ 21 | A vocabulary object used in the text2vec package 22 | } 23 | \description{ 24 | This function creates a vocabulary from a vector of documents. A 25 | vocabulary defines the domain of a natural language processing problem. 26 | Vocabularies are often used to create vectorisers, which allow novel pieces 27 | of text to be mapped to a vocabulary defined by a training set. To exclude 28 | frequently and infrequently occurring tokens, the vocabulary is often 29 | trimmed. This reduces the dimension of the problem to decrease training time 30 | and the potential for overfitting. 31 | } 32 | -------------------------------------------------------------------------------- /man/download_and_read_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download-and-read-data.R 3 | \name{download_and_read_data} 4 | \alias{download_and_read_data} 5 | \title{Download and parse review data} 6 | \usage{ 7 | download_and_read_data(data_source_url) 8 | } 9 | \arguments{ 10 | \item{data_source_url}{Character.} 11 | } 12 | \value{ 13 | Tibble of parsed data 14 | } 15 | \description{ 16 | Given the URL of a .zip file, this function will download it and decompress 17 | it to a given location. 18 | } 19 | \details{ 20 | This function was intended to download and uncompress the data located at 21 | https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences 22 | It is not a general function to download arbitrary data, but may be a 23 | useful template. 24 | } 25 | -------------------------------------------------------------------------------- /man/map_to_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/map-to-dtm.R 3 | \name{map_to_dtm} 4 | \alias{map_to_dtm} 5 | \title{Create a document term matrix using a vectoriser} 6 | \usage{ 7 | map_to_dtm(x, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{A character or vector of characters, usually sentences, paragraphs 11 | or similar pieces of natural language.} 12 | 13 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 14 | 15 | \item{tfidf}{A tfidf object constructed with the text2vec package. 
16 | If tfidf is NULL, then an unweighted document term matrix will be 17 | returned.} 18 | } 19 | \value{ 20 | A document-term matrix with rows representing the textual objects in 21 | x, and columns representing the tokens in the vocabulary used to generate the 22 | given vectoriser. 23 | } 24 | \description{ 25 | This function uses a vectoriser created with the text2vec package to map 26 | a new piece of text, or vector of text, onto a document term matrix (dtm). 27 | The vectoriser has a concept of a vocabulary, a set of tokens which 28 | determine the columns of the resulting document term matrix. Any term that 29 | doesn't match to a token in the vocabulary will be ignored. Optionally, the 30 | document term matrix can be weighted by a term frequency-inverse document 31 | frequency (tfidf) object, created with the text2vec::TfIdf function. 32 | } 33 | -------------------------------------------------------------------------------- /man/model_execution_plan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model-execution-plan.R 3 | \name{model_execution_plan} 4 | \alias{model_execution_plan} 5 | \title{drake plan for model execution} 6 | \usage{ 7 | model_execution_plan() 8 | } 9 | \value{ 10 | A plan to be run with drake::make() 11 | } 12 | \description{ 13 | drake plan for model execution 14 | } 15 | -------------------------------------------------------------------------------- /man/model_training_plan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model-training-plan.R 3 | \name{model_training_plan} 4 | \alias{model_training_plan} 5 | \title{drake plan for model training} 6 | \usage{ 7 | model_training_plan() 8 | } 9 | \value{ 10 | A plan to be run with drake::make() 11 | } 12 | \description{ 13 | drake plan for model training 14 | } 15 | -------------------------------------------------------------------------------- /man/new_data_to_be_scored.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/new-data-to-be-scored.R 3 | \name{new_data_to_be_scored} 4 | \alias{new_data_to_be_scored} 5 | \title{Retrieve new data that needs to be scored} 6 | \usage{ 7 | new_data_to_be_scored() 8 | } 9 | \value{ 10 | Data frame with two char columns: review and sentiment ("good"/"bad") 11 | } 12 | \description{ 13 | This function is meant to replicate a function that will retrieve new data 14 | that needs to be scored. This is a batch execution, rather than an on-demand 15 | API that might be stood up by the plumber package. In practice, the return 16 | is a fixed sample of 10 reviews. 17 | } 18 | -------------------------------------------------------------------------------- /man/read_review_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read-review-file.R 3 | \name{read_review_file} 4 | \alias{read_review_file} 5 | \title{Read a review source data file into a data frame} 6 | \usage{ 7 | read_review_file(file_path) 8 | } 9 | \arguments{ 10 | \item{file_path}{character. 
A file path to a source .txt data file} 11 | } 12 | \description{ 13 | This function is specifically intended to parse the three data files in the 14 | sentiment analysis data set 15 | (https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). 16 | } 17 | -------------------------------------------------------------------------------- /man/sentiment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentiment.R 3 | \name{sentiment} 4 | \alias{sentiment} 5 | \title{Classify a review as good or bad} 6 | \usage{ 7 | sentiment(x, random_forest, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Text to be classified, ideally a one-sentence product review.} 11 | 12 | \item{random_forest}{A model created with the randomForest package.} 13 | 14 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 15 | 16 | \item{tfidf}{A tfidf object constructed with the text2vec package. 17 | If tfidf is NULL, then weighting will not be applied.} 18 | } 19 | \description{ 20 | Classify a review as good or bad 21 | } 22 | -------------------------------------------------------------------------------- /man/stem_tokeniser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stem-tokeniser.R 3 | \name{stem_tokeniser} 4 | \alias{stem_tokeniser} 5 | \title{Stem and tokenise a sentence} 6 | \usage{ 7 | stem_tokeniser(x) 8 | } 9 | \arguments{ 10 | \item{x}{A character or vector of characters, usually sentences, paragraphs 11 | or similar pieces of natural language.} 12 | } 13 | \value{ 14 | A vector of tokens 15 | } 16 | \description{ 17 | Stem and tokenise a sentence 18 | } 19 | \examples{ 20 | stem_tokeniser("information informed informing informs") 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/submit_predictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/submit-predictions.R 3 | \name{submit_predictions} 4 | \alias{submit_predictions} 5 | \title{Simulate submission of output data} 6 | \usage{ 7 | submit_predictions(predictions) 8 | } 9 | \arguments{ 10 | \item{predictions}{Predictions to be submitted. Format is not important.} 11 | } 12 | \value{ 13 | Returns output, invisibly 14 | } 15 | \description{ 16 | This is a dummy function that simulates the submission of newly scored 17 | model results to an external platform. In actual fact, this function does 18 | nothing, but sleeps for 5 seconds to simulate an upload process. 19 | } 20 | -------------------------------------------------------------------------------- /man/text_preprocessor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/text-preprocessor.R 3 | \name{text_preprocessor} 4 | \alias{text_preprocessor} 5 | \title{Preprocess text before tokenisation} 6 | \usage{ 7 | text_preprocessor(x) 8 | } 9 | \arguments{ 10 | \item{x}{Character. Text to be processed.} 11 | } 12 | \value{ 13 | The same character after processing. 14 | } 15 | \description{ 16 | This function applies simple preprocessing to raw text: case is lowered, 17 | and numbers and punctuation are removed. It is applied to documents before 18 | they are tokenised and stemmed. 19 | } 20 | -------------------------------------------------------------------------------- /man/validate_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate-model.R 3 | \name{validate_model} 4 | \alias{validate_model} 5 | \title{Assert that model makes sensible predictions} 6 | \usage{ 7 | validate_model(random_forest, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{random_forest}{A model created with the randomForest package.} 11 | 12 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 13 | 14 | \item{tfidf}{A tfidf object constructed with the text2vec package. 15 | If tfidf is NULL, then weighting will not be applied.} 16 | } 17 | \value{ 18 | TRUE if all assertions pass 19 | } 20 | \description{ 21 | This function tests model predictions against a small sample of inputs with 22 | predictable outputs. For example, a sentiment analysis model that returns 23 | a sentiment of "bad" for "love" should not be used. We can also demand a 24 | minimum level of accuracy according to one or more metrics. Any failed 25 | assertions will raise an error. This validation approach could also be used 26 | to perform rudimentary checks for model bias (in the ethics sense). 27 | } 28 | -------------------------------------------------------------------------------- /man/validate_predictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate-predictions.R 3 | \name{validate_predictions} 4 | \alias{validate_predictions} 5 | \title{Assert that model predictions are sensible} 6 | \usage{ 7 | validate_predictions(predictions) 8 | } 9 | \arguments{ 10 | \item{predictions}{A vector of model predictions} 11 | } 12 | \value{ 13 | TRUE if all assertions pass 14 | } 15 | \description{ 16 | This function tests that model predictions make sense. It is intended to be 17 | run after new data has been scored, but before it has been submitted. We 18 | might ensure that outputs are sensible, within an expected range, or do not 19 | contain sensitive data. Failed assertions will raise an error. Upon success, 20 | TRUE will be returned. 21 | } 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(DrakeModelling) 3 | 4 | test_check("DrakeModelling") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-map-to-dtm.R: -------------------------------------------------------------------------------- 1 | context("map-to-dtm") 2 | 3 | # Set up a dtm and tfidf based on a small dataset with a predictable outcome 4 | 5 | test_reviews <- tibble::tribble( 6 | ~review, 7 | "This movie is good fun and everyone should see it. 
Great film.", 8 | "What a poor quality phone", 9 | "This phone has a great battery phone", 10 | "An excellent piece of software" 11 | ) 12 | 13 | test_vocabulary <- create_vocabulary(test_reviews$review) 14 | test_vectoriser <- test_vocabulary %>% text2vec::vocab_vectorizer() 15 | test_dtm_unweighted <- map_to_dtm(test_reviews$review, 16 | vectoriser = test_vectoriser, 17 | tfidf = NULL) 18 | test_tfidf <- create_tfidf(test_dtm_unweighted) 19 | 20 | # The dtms used for testing are below 21 | # Note that "movie" stems to "movi". 22 | 23 | dtm_unweighted <- as.matrix( 24 | map_to_dtm(c("movie", "phone"), 25 | vectoriser = test_vectoriser, 26 | tfidf = NULL) 27 | ) 28 | 29 | dtm_weighted <- as.matrix( 30 | map_to_dtm(c("movie", "phone"), 31 | test_vectoriser, 32 | tfidf = test_tfidf) 33 | ) 34 | 35 | test_that("DTM has common words that aren't stop words", { 36 | expect_true("movi" %in% colnames(dtm_unweighted)) 37 | expect_true("phone" %in% colnames(dtm_unweighted)) 38 | expect_true("film" %in% colnames(dtm_unweighted)) 39 | }) 40 | 41 | test_that("DTM doesn't contain common stop words", { 42 | expect_false("this" %in% colnames(dtm_unweighted)) 43 | expect_false("a" %in% colnames(dtm_unweighted)) 44 | }) 45 | 46 | test_that("DTM correctly represents present and absent words", { 47 | expect_equal(dtm_unweighted[1, "movi"], 1) 48 | expect_equal(dtm_unweighted[2, "phone"], 1) 49 | expect_equal(dtm_unweighted[1, "film"], 0) 50 | expect_equal(dtm_unweighted[2, "film"], 0) 51 | }) 52 | 53 | test_that("DTM is applying weightings", { 54 | expect_false(dtm_weighted[1, "movi"] == 0) 55 | expect_false(dtm_weighted[1, "movi"] == 1) 56 | expect_false(dtm_weighted[2, "phone"] == 0) 57 | expect_false(dtm_weighted[2, "phone"] == 1) 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test-stem-tokeniser.R: -------------------------------------------------------------------------------- 1 | context("stem-tokeniser") 2 | 3 | test_that("Processing inflections of the same stem", { 4 | expect_equal( 5 | stem_tokeniser("information informed informing informs"), 6 | list(c("inform", "inform", "inform", "inform")) 7 | ) 8 | }) 9 | 10 | test_that("Stemming multiple documents", { 11 | expect_equal( 12 | stem_tokeniser(c("information", "product")), 13 | list("inform", "product") 14 | ) 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-text-preprocessor.R: -------------------------------------------------------------------------------- 1 | context("test-text-preprocessor") 2 | 3 | test_that("Case is lowered", { 4 | test_string <- "SPHYNX OF BLACK QUARTZ judge my VoW" 5 | expected_result <- "sphynx of black quartz judge my vow" 6 | expect_equal(text_preprocessor(test_string), expected_result) 7 | }) 8 | 9 | test_that("Punctuation is removed", { 10 | test_string <- "\"sphynx. of black quartz, judge - my vow!\"" 11 | expected_result <- "sphynx of black quartz judge my vow" 12 | expect_equal(text_preprocessor(test_string), expected_result) 13 | }) 14 | 15 | test_that("Numbers are removed", { 16 | test_string <- "\"sphynx. o10f black qu1artz, ju2dge - my vow!\"" 17 | expected_result <- "sphynx of black quartz judge my vow" 18 | expect_equal(text_preprocessor(test_string), expected_result) 19 | }) 20 | 21 | test_that("Case is lowered, punctuation and numbers are removed", { 22 | test_string <- "\"SPHYNX. 
o10f bLaCk qu1artz, ju2dge - my vow!\"" 23 | expected_result <- "sphynx of black quartz judge my vow" 24 | expect_equal(text_preprocessor(test_string), expected_result) 25 | }) 26 | 27 | test_that("Processing vector of text", { 28 | test_strings <- c("fish", "KING!", "cA4rD") 29 | expected_result <- c("fish", "king", "card") 30 | expect_equal(text_preprocessor(test_strings), expected_result) 31 | }) 32 | --------------------------------------------------------------------------------
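A footnote to the README's point about package structure: the checks that the CI workflow runs can also be run locally. A minimal sketch, assuming devtools and rcmdcheck are installed (the CI workflow above installs rcmdcheck; devtools is in Suggests):

```
devtools::load_all()  # simulate installation, as in the README
devtools::test()      # run the testthat suite under tests/testthat/
rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error")  # mirrors CI
```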