├── .Rbuildignore ├── .github └── workflows │ └── R-CMD-check.yaml ├── .gitignore ├── DESCRIPTION ├── DrakeModelling.Rproj ├── LICENSE ├── NAMESPACE ├── R ├── assert-model-input.R ├── assert-model-output.R ├── create-tfidf.R ├── create-vocabulary.R ├── download-and-read-data.R ├── map-to-dtm.R ├── model-execution-plan.R ├── model-training-plan.R ├── new-data-to-be-scored.R ├── read-review-file.R ├── sentiment.R ├── stem-tokeniser.R ├── submit-predictions.R ├── text-preprocessor.R ├── validate-model.R ├── validate-predictions.R └── zzz.R ├── README.md ├── inst ├── eda │ └── eda.Rmd ├── execute-model.R ├── img │ ├── drake-model-execution-plan.png │ ├── drake-model-training-plan.png │ └── failed-validation.png └── train-model.R ├── man ├── create_tfidf.Rd ├── create_vocabulary.Rd ├── download_and_read_data.Rd ├── map_to_dtm.Rd ├── model_execution_plan.Rd ├── model_training_plan.Rd ├── new_data_to_be_scored.Rd ├── read_review_file.Rd ├── sentiment.Rd ├── stem_tokeniser.Rd ├── submit_predictions.Rd ├── text_preprocessor.Rd ├── validate_model.Rd └── validate_predictions.Rd └── tests ├── testthat.R └── testthat ├── test-map-to-dtm.R ├── test-stem-tokeniser.R └── test-text-preprocessor.R /.Rbuildignore: -------------------------------------------------------------------------------- 1 | ^.*\.Rproj$ 2 | ^\.Rproj\.user$ 3 | ^doc$ 4 | ^Meta$ 5 | ^LICENSE\.md$ 6 | ^\.github/workflows/R-CMD-check\.yaml$ 7 | ^.drake$ 8 | ^inst/artefacts$ 9 | ^inst/img$ 10 | ^artefacts$ 11 | -------------------------------------------------------------------------------- /.github/workflows/R-CMD-check.yaml: -------------------------------------------------------------------------------- 1 | on: [push, pull_request] 2 | 3 | name: R-CMD-check 4 | 5 | jobs: 6 | R-CMD-check: 7 | runs-on: macOS-latest 8 | steps: 9 | - uses: actions/checkout@v1 10 | - uses: r-lib/actions/setup-r@master 11 | - name: Install dependencies 12 | run: Rscript -e "install.packages(c('remotes', 'rcmdcheck'))" -e "remotes::install_deps(dependencies = TRUE)" 13 | - name: Check 14 | run: Rscript -e "rcmdcheck::rcmdcheck(args = '--no-manual', error_on = 'error')" 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .Rproj.user 2 | .Rhistory 3 | .RData 4 | .Ruserdata 5 | .drake 6 | inst/doc 7 | inst/extdata 8 | inst/artefacts 9 | inst/eda/*.html 10 | data 11 | vignettes/*.html 12 | vignettes/*.pdf 13 | doc 14 | Meta 15 | artefacts 16 | -------------------------------------------------------------------------------- /DESCRIPTION: -------------------------------------------------------------------------------- 1 | Package: DrakeModelling 2 | Type: Package 3 | Title: Using drake to Train and Execute a Model in a Custom Package 4 | Version: 0.1.0 5 | Author: David Neuzerling (@mdneuzerling) 6 | Maintainer: David Neuzerling 7 | Description: Drake is excellent for doing complicated things like model training or execution. I want to combine that with the benefits of custom R packages. 
8 | Depends: 9 | R (>= 3.4.0) 10 | Imports: 11 | dplyr, 12 | readr, 13 | text2vec, 14 | tm, 15 | tidytext, 16 | SnowballC, 17 | randomForest, 18 | assertthat, 19 | drake, 20 | here, 21 | rlang 22 | License: MIT + file LICENSE 23 | Encoding: UTF-8 24 | LazyData: true 25 | RoxygenNote: 7.1.0 26 | Suggests: 27 | devtools, 28 | ggplot2, 29 | naniar, 30 | purrr, 31 | usethis, 32 | withr, 33 | wordcloud, 34 | knitr, 35 | rmarkdown, 36 | tibble, 37 | testthat (>= 2.1.0) 38 | VignetteBuilder: knitr 39 | URL: https://github.com/mdneuzerling/DrakeModelling 40 | BugReports: https://github.com/mdneuzerling/DrakeModelling/issues 41 | -------------------------------------------------------------------------------- /DrakeModelling.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: Default 4 | SaveWorkspace: Default 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 2 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | 15 | BuildType: Package 16 | PackageUseDevtools: Yes 17 | PackageInstallArgs: --no-multiarch --with-keep.source 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Murray David Neuzerling 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /NAMESPACE: -------------------------------------------------------------------------------- 1 | # Generated by roxygen2: do not edit by hand 2 | 3 | export(create_tfidf) 4 | export(create_vocabulary) 5 | export(download_and_read_data) 6 | export(map_to_dtm) 7 | export(model_execution_plan) 8 | export(model_training_plan) 9 | export(new_data_to_be_scored) 10 | export(read_review_file) 11 | export(sentiment) 12 | export(stem_tokeniser) 13 | export(submit_predictions) 14 | export(text_preprocessor) 15 | export(validate_model) 16 | export(validate_predictions) 17 | importFrom(assertthat,assert_that) 18 | importFrom(dplyr,"%>%") 19 | importFrom(drake,trigger) 20 | importFrom(randomForest,randomForest) 21 | importFrom(rlang,.data) 22 | -------------------------------------------------------------------------------- /R/assert-model-input.R: -------------------------------------------------------------------------------- 1 | # context("sentiment") 2 | # 3 | # test_that("'bad' is bad sentiment", { 4 | # expect_equal(sentiment("bad"), "bad") 5 | # }) 6 | # 7 | # test_that("'love' is good sentiment", { 8 | # expect_equal(sentiment("love"), "good") 9 | # }) 10 | -------------------------------------------------------------------------------- /R/assert-model-output.R: -------------------------------------------------------------------------------- 1 | # context("sentiment") 2 | # 3 | # test_that("'bad' is bad sentiment", { 4 | # expect_equal(sentiment("bad"), "bad") 5 | # }) 6 | # 7 | # test_that("'love' is good sentiment", { 8 | # expect_equal(sentiment("love"), "good") 9 | # }) 10 | -------------------------------------------------------------------------------- /R/create-tfidf.R: -------------------------------------------------------------------------------- 1 | #' Create a fitted tf-idf 2 | #' 3 | #' The text2vec package uses a Pythonic style of code to create a fitted term 4 | #' frequency - inverse document frequency (tf-idf) object. A new tfidf object 5 | #' is initialised and then the tfidf$fit_transform method is applied to an 6 | #' unweighted document-term matrix. This style of code is unusual in R and 7 | #' can cause issues with drake. This function creates a fitted tfidf object 8 | #' with a more functional and R-idiomatic style of code. 9 | #' 10 | #' @param dtm_unweighted An unweighted document term matrix created by the 11 | #' map_to_dtm function, or an equivalent from text2vec. 12 | #' 13 | #' @return A fitted tfidf object created by the text2vec package 14 | #' @export 15 | #' 16 | create_tfidf <- function(dtm_unweighted) { 17 | tfidf <- text2vec::TfIdf$new() 18 | tfidf$fit_transform(dtm_unweighted) 19 | return(tfidf) 20 | } -------------------------------------------------------------------------------- /R/create-vocabulary.R: -------------------------------------------------------------------------------- 1 | #' Create a pruned vocabulary from a token iterator 2 | #' 3 | #' This function creates a vocabulary from a vector of documents. A 4 | #' vocabulary defines the domain of a natural language processing problem. 5 | #' Vocabularies are often used to create vectorisers, which allow novel pieces 6 | #' of text to be mapped to a vocabulary defined by a training set. To exclude 7 | #' frequently and infrequently occurring tokens, the vocabulary is often 8 | #' trimmed. This reduces the dimension of the problem to decrease training time 9 | #' and the potential for overfitting. 
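#' As an illustrative sketch of how this function chains with the rest of the package (the object names here are assumptions, but every call below is defined in this package or in text2vec, mirroring the training plan): vocab <- create_vocabulary(reviews$review); vectoriser <- text2vec::vocab_vectorizer(vocab); dtm <- map_to_dtm(reviews$review, vectoriser = vectoriser); tfidf <- create_tfidf(dtm).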
10 | #' 11 | #' @param documents A vector of characters, often sentences or paragraphs. 12 | #' @param doc_proportion_min Optional. A number between 0 and 1 which specifies 13 | #' the minimum proportion of documents in which a token appears in order to be 14 | #' included in the vocabulary. Defaults to 0 (no effect). 15 | #' @param doc_proportion_max Optional. A number between 0 and 1 which specifies 16 | #' the maximum proportion of documents in which a token appears in order to be 17 | #' included in the vocabulary. Defaults to 1 (no effect). 18 | #' @return A vocabulary object used in the text2vec package 19 | #' 20 | #' @export 21 | #' 22 | create_vocabulary <- function(documents, 23 | doc_proportion_min = 0, 24 | doc_proportion_max = 1) { 25 | # For some reason, the preprocessor and stem_tokeniser don't take effect if 26 | # I put them in the itoken function as values to the relevant arguments. 27 | # Please let me know if you understand why this is! 28 | processed_documents <- stem_tokeniser(text_preprocessor(documents)) 29 | 30 | # Warning: token iterators can only be used once. 31 | token_iterator <- text2vec::itoken( 32 | processed_documents, 33 | progressbar = FALSE 34 | ) 35 | 36 | vocabulary <- text2vec::create_vocabulary( 37 | token_iterator, 38 | stopwords = tidytext::stop_words$word 39 | ) 40 | 41 | vocabulary <- text2vec::prune_vocabulary( 42 | vocabulary, 43 | doc_proportion_min = doc_proportion_min, 44 | doc_proportion_max = doc_proportion_max 45 | ) 46 | 47 | return(vocabulary) 48 | } 49 | -------------------------------------------------------------------------------- /R/download-and-read-data.R: -------------------------------------------------------------------------------- 1 | #' Download and parse review data 2 | #' 3 | #' Given the URL of a .zip file, this function will download it and decompress 4 | #' it to a given location. 5 | #' 6 | #' This function was intended to download and uncompress the data located at 7 | #' https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences 8 | #' It is not a general function to download arbitrary data, but may be a 9 | #' useful template. 10 | #' 11 | #' @param data_source_url Character. 12 | #' 13 | #' @return Tibble of parsed data 14 | #' @importFrom rlang .data 15 | #' @export 16 | #' 17 | download_and_read_data <- function(data_source_url) { 18 | temp_dir <- tempdir() 19 | 20 | zip_file <- file.path(temp_dir, "zip_file") 21 | on.exit(unlink(zip_file, recursive = TRUE)) 22 | utils::download.file(data_source_url, zip_file) 23 | data_files <-c("amazon_cells_labelled.txt", 24 | "imdb_labelled.txt", 25 | "yelp_labelled.txt") 26 | data_files_in_zip <- file.path("sentiment labelled sentences", data_files) 27 | utils::unzip( 28 | zip_file, files = data_files_in_zip, junkpaths = TRUE, exdir = temp_dir 29 | ) 30 | downloaded_data_files <- file.path(temp_dir, data_files) 31 | on.exit(unlink(downloaded_data_files), add = TRUE) 32 | 33 | downloaded_data_files %>% 34 | purrr::map(read_review_file) %>% 35 | purrr::reduce(rbind) %>% 36 | dplyr::mutate(sentiment = ifelse(.data$sentiment == 1, "good", "bad")) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /R/map-to-dtm.R: -------------------------------------------------------------------------------- 1 | #' Create a document term matrix using a vectoriser 2 | #' 3 | #' This function uses a vectoriser created with the text2vec package to map 4 | #' a new piece of text, or vector of text, onto a document term matrix (dtm). 
5 | #' The vectoriser has a concept of a vocabulary, a set of tokens which 6 | #' determine the columns of the resulting document term matrix. Any term that 7 | #' doesn't match to a token in the vocabulary will be ignored. Optionally, the 8 | #' document term matrix can be weighted by a term frequency-inverse document 9 | #' frequency (tfidf) object, created with the text2vec::TfIdf function. 10 | #' 11 | #' @param x A character or vector of characters, usually sentences, paragraphs 12 | #' or similar pieces of natural language. 13 | #' @param vectoriser A vectoriser constructed with the text2vec package. 14 | #' @param tfidf A tfidf object constructed with the text2vec package. 15 | #' If tfidf is NULL, then an unweighted document term matrix will be 16 | #' returned. 17 | #' @return A document-term matrix with rows representing the textual objects in 18 | #' x, and columns representing the tokens in the vocabulary used to generate the 19 | #' given vectoriser. 20 | #' 21 | #' @export 22 | #' 23 | map_to_dtm <- function(x, 24 | vectoriser, 25 | tfidf = NULL) { 26 | processed_text <- stem_tokeniser(text_preprocessor(x)) 27 | 28 | # For some reason, the preprocessor and stem_tokeniser don't take effect if 29 | # I put them in the itoken function as values to the relevant arguments. 30 | # Please let me know if you understand why this is! 31 | tokens <- text2vec::itoken( 32 | processed_text, 33 | progressbar = FALSE 34 | ) 35 | 36 | # If the input contains no terms corresponding to the vocabulary used to 37 | # generate the vectoriser, then a warning will occur for an empty dtm. 38 | # Since this is a plausible scenario, we suppress the warning. 39 | suppressWarnings( 40 | dtm <- text2vec::create_dtm(tokens, vectoriser) 41 | ) 42 | 43 | if (!is.null(tfidf)) { 44 | dtm <- tfidf$transform(dtm) 45 | } 46 | 47 | return(dtm) 48 | } 49 | -------------------------------------------------------------------------------- /R/model-execution-plan.R: -------------------------------------------------------------------------------- 1 | #' drake plan for model execution 2 | #' 3 | #' @return A plan to be run with drake::make() 4 | #' @export 5 | #' 6 | model_execution_plan <- function() { 7 | drake::drake_plan( 8 | new_data = new_data_to_be_scored(), 9 | tfidf = readr::read_rds(file_in("artefacts/tfidf.rds")), 10 | vectoriser = readr::read_rds(file_in("artefacts/vectoriser.rds")), 11 | review_rf = readr::read_rds(file_in("artefacts/review_rf.rds")), 12 | predictions = sentiment(new_data$review, 13 | random_forest = review_rf, 14 | vectoriser = vectoriser, 15 | tfidf = tfidf), 16 | validation = validate_predictions(predictions), 17 | submit_predictions = target( 18 | submit_predictions(predictions), 19 | trigger = trigger(condition = validation, mode = "blacklist") 20 | ) 21 | ) 22 | } 23 | -------------------------------------------------------------------------------- /R/model-training-plan.R: -------------------------------------------------------------------------------- 1 | #' drake plan for model training 2 | #' 3 | #' @return A plan to be run with drake::make() 4 | #' @importFrom drake trigger 5 | #' @export 6 | #' 7 | model_training_plan <- function() { 8 | drake::drake_plan( 9 | reviews = target( 10 | download_and_read_data( 11 | file_in("https://archive.ics.uci.edu/ml/machine-learning-databases/00331/sentiment%20labelled%20sentences.zip") 12 | ) 13 | ), 14 | vocabulary = create_vocabulary(reviews$review, 15 | doc_proportion_min = 25 / nrow(reviews)), 16 | vectoriser = 
text2vec::vocab_vectorizer(vocabulary), 17 | dtm_unweighted = map_to_dtm(reviews$review, 18 | vectoriser = vectoriser), 19 | tfidf = create_tfidf(dtm_unweighted), 20 | dtm_tfidf_weighted = map_to_dtm(reviews$review, 21 | vectoriser = vectoriser, 22 | tfidf = tfidf), 23 | review_rf = randomForest::randomForest( 24 | x = as.matrix(dtm_tfidf_weighted), 25 | y = factor(reviews$sentiment), 26 | ntree = 500 27 | ), 28 | validation = validate_model(review_rf, vectoriser, tfidf), 29 | output_model = drake::target( 30 | { 31 | dir.create("artefacts", showWarnings = FALSE) 32 | readr::write_rds(vectoriser, file_out("artefacts/vectoriser.rds")) 33 | readr::write_rds(tfidf, file_out("artefacts/tfidf.rds")) 34 | readr::write_rds(review_rf, file_out("artefacts/review_rf.rds")) 35 | }, 36 | trigger = drake::trigger(condition = validation, mode = "blacklist") 37 | ) 38 | ) 39 | } 40 | 41 | 42 | -------------------------------------------------------------------------------- /R/new-data-to-be-scored.R: -------------------------------------------------------------------------------- 1 | #' Retrieve new data that needs to be scored 2 | #' 3 | #' This function is meant to replicate a function that will retrieve new data 4 | #' that needs to be scored. This is a batch execution, rather than an on-demand 5 | #' API that might be stood up by the plumber package. In practice, the return 6 | #' is a fixed sample of 10 reviews. 7 | #' 8 | #' @return Data frame with two char columns: review and sentiment ("good"/"bad") 9 | #' @export 10 | #' 11 | new_data_to_be_scored <- function() { 12 | tibble::tribble( 13 | ~"review", ~"sentiment", 14 | "I'm still infatuated with this phone.", "good", 15 | "Strike 2, who wants to be rushed.", "bad", 16 | "I enjoyed reading this book to my children when they were little.", "good", 17 | "We had a group of 70+ when we claimed we would only have 40 and they handled us beautifully.", "good", 18 | "The story is lame, not interesting and NEVER really explains the sinister origins of the puppets", "bad", 19 | "Better than you'd expect.", "good", 20 | "It was a huge awkward 1.5lb piece of cow that was 3/4ths gristle and fat.", "bad", 21 | "Yes, it's that bad.", "bad", 22 | "I did not expect this to be so good!", "good", 23 | "The only redeeming quality of the restaurant was that it was very inexpensive.", "good" 24 | ) 25 | } -------------------------------------------------------------------------------- /R/read-review-file.R: -------------------------------------------------------------------------------- 1 | #' Read a review source data file into a data frame 2 | #' 3 | #' This function is specifically intended to parse the three data files in the 4 | #' sentiment analysis data set 5 | #' (https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). 6 | #' 7 | #' @param file_path character. A file path to a source .txt data file 8 | #' 9 | #' @export 10 | #' 11 | read_review_file <- function(file_path) { 12 | readr::read_tsv( 13 | file_path, 14 | col_names = c("review", "sentiment"), 15 | quote = "", 16 | col_types = readr::cols( 17 | review = readr::col_character(), 18 | sentiment = readr::col_integer() 19 | ) 20 | ) 21 | } 22 | -------------------------------------------------------------------------------- /R/sentiment.R: -------------------------------------------------------------------------------- 1 | #' Classify a review as good or bad 2 | #' 3 | #' @param x Text to be classified, ideally a one-sentence product review. 
4 | #' @param random_forest A model created with the randomForest package. 5 | #' @param vectoriser A vectoriser constructed with the text2vec package. 6 | #' @param tfidf A tfidf object constructed with the text2vec package. 7 | #' If tfidf is NULL, then weighting will not be applied. 8 | #' 9 | #' @importFrom randomForest randomForest 10 | #' @export 11 | #' 12 | sentiment <- function(x, random_forest, vectoriser, tfidf = NULL) { 13 | processed <- map_to_dtm(x, vectoriser = vectoriser, tfidf = tfidf) 14 | as.character(stats::predict(random_forest, processed)) 15 | } 16 | -------------------------------------------------------------------------------- /R/stem-tokeniser.R: -------------------------------------------------------------------------------- 1 | #' Stem and tokenise a sentence 2 | #' 3 | #' @param x A character or vector of characters, usually sentences, paragraphs 4 | #' or similar pieces of natural language. 5 | #' @return A vector of tokens 6 | #' 7 | #' @export 8 | #' 9 | #' @examples stem_tokeniser("information informed informing informs") 10 | #' 11 | stem_tokeniser <- function(x) { 12 | lapply( 13 | text2vec::word_tokenizer(x), 14 | SnowballC::wordStem, 15 | language = "en" 16 | ) 17 | } 18 | -------------------------------------------------------------------------------- /R/submit-predictions.R: -------------------------------------------------------------------------------- 1 | #' Simulate submission of output data 2 | #' 3 | #' This is a dummy function that simulates the submission of newly scored 4 | #' model results to an external platform. In actual fact, this function does 5 | #' nothing, but sleeps for 5 seconds to simulate an upload process. 6 | #' 7 | #' @param predictions Predictions to be submitted. Format is not important. 8 | #' 9 | #' @return Returns output, invisibly 10 | #' @export 11 | #' 12 | submit_predictions <- function(predictions) { 13 | Sys.sleep(5) 14 | invisible(predictions) 15 | } -------------------------------------------------------------------------------- /R/text-preprocessor.R: -------------------------------------------------------------------------------- 1 | #' Preprocess text before tokenisation 2 | #' 3 | #' This function applies simple preprocessing to raw text: case is lowered, 4 | #' and numbers and punctuation are removed. It is applied to documents before 5 | #' they are tokenised and stemmed. 6 | #' 7 | #' @param x Character. Text to be processed. 8 | #' @return The same character after processing. 9 | #' 10 | #' @importFrom dplyr %>% 11 | #' @export 12 | #' 13 | 14 | text_preprocessor <- function(x) { 15 | x %>% 16 | tolower %>% 17 | tm::removeNumbers() %>% 18 | tm::removePunctuation() 19 | } 20 | -------------------------------------------------------------------------------- /R/validate-model.R: -------------------------------------------------------------------------------- 1 | #' Assert that model makes sensible predictions 2 | #' 3 | #' This function tests model predictions against a small sample of inputs with 4 | #' predictable outputs. For example, a sentiment analysis model that returns 5 | #' a sentiment of "bad" for "love" should not be used. 
We can also demand a 6 | #' minimum level of accuracy according to one or more metrics. Any failed 7 | #' assertions will raise an error. This validation approach could also be used 8 | #' to perform rudimentary checks for model bias (in the ethics sense). 9 | #' 10 | #' @param random_forest A model created with the randomForest package. 11 | #' @param vectoriser A vectoriser constructed with the text2vec package. 12 | #' @param tfidf A tfidf object constructed with the text2vec package. 13 | #' If tfidf is NULL, then weighting will not be applied. 14 | #' 15 | #' @return TRUE if all assertions pass 16 | #' @export 17 | #' 18 | validate_model <- function(random_forest, vectoriser, tfidf = NULL) { 19 | model_sentiment <- function(x) sentiment(x, random_forest, vectoriser, tfidf) 20 | oob <- random_forest$err.rate[random_forest$ntree, "OOB"] 21 | 22 | assertthat::assert_that(model_sentiment("love") == "good") 23 | assertthat::assert_that(model_sentiment("bad") == "bad") 24 | assertthat::assert_that(oob < 0.4) 25 | 26 | TRUE 27 | } -------------------------------------------------------------------------------- /R/validate-predictions.R: -------------------------------------------------------------------------------- 1 | #' Assert that model predictions are sensible 2 | #' 3 | #' This function tests that model predictions make sense. It is intended to be 4 | #' run after new data has been scored, but before it has been submitted. We 5 | #' might ensure that outputs are sensible, within an expected range, or do not 6 | #' contain sensitive data. Failed assertions will raise an error. Upon success, 7 | #' TRUE will be returned. 8 | #' 9 | #' @param predictions A vector of model predictions 10 | #' 11 | #' @return TRUE if all assertions pass 12 | #' @export 13 | #' 14 | validate_predictions <- function(predictions) { 15 | assertthat::assert_that(all(predictions == "good" | predictions == "bad")) 16 | 17 | TRUE 18 | } -------------------------------------------------------------------------------- /R/zzz.R: -------------------------------------------------------------------------------- 1 | utils::globalVariables(codetools::findGlobals(model_training_plan)) 2 | utils::globalVariables(codetools::findGlobals(model_execution_plan)) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | [![R build status](https://github.com/mdneuzerling/DrakeModelling/workflows/R-CMD-check/badge.svg)](https://github.com/mdneuzerling/DrakeModelling) 3 | 4 | 5 | # DrakeModelling 6 | 7 | [drake](https://github.com/ropensci/drake) is an excellent tool for managing large and complicated workflows. It is the R equivalent of the `make` command, with a particular emphasis on using R functions. 8 | 9 | ## Concept 10 | 11 | When it comes to code, there are three major components to a machine learning project: 12 | 13 | 1) Exploratory data analysis (EDA) 14 | 2) Model training 15 | 3) Model execution 16 | 17 | These components are run independently of each other. EDA is a largely human task, and is usually only performed when the model is created or updated in some major way. The other two components need not operate together --- if model retraining is expensive, or new training data is infrequently available, we might retrain a model on some monthly basis while scoring new data on a daily basis. 
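In practice, the two cadences can simply be two scheduled jobs. This repository ships an entry-point script for each under `inst/`; the schedule itself is an assumption, sketched here only as illustrative comments:

```
# Illustrative scheduling sketch --- use whatever scheduler you have:
# monthly retraining:  Rscript inst/train-model.R
# daily scoring:       Rscript inst/execute-model.R
```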
18 | 19 | I pieced together this template that implements these three components using R-specific tools: 20 | 21 | 1) EDA --- **R Markdown** 22 | 2) Model training --- **drake** 23 | 3) Model execution --- **drake** 24 | 25 | All three of these components might use similar functions. Typically we would place all of these functions in a directory (almost always called `R/`) and `source` them as needed. Here I want to try to combine these components into a custom R package. 26 | 27 | R packages are the standard for complicated R projects. With packages, we gain access to the comprehensive `R CMD check`, as well as `testthat` unit tests and `roxygen2` documentation. I'm certainly not the first to combine drake with a package workflow, but I wanted to have a single repository that combines all elements of a machine learning project. 28 | 29 | This template uses a simple random forest sentiment analysis model, based on [labelled data available from the UCI machine learning repository](https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). Drake takes care of the data caching for us. This means that we can, say, adjust the hyper-parameters of our model and rerun the training plan, and only the modelling step and onward will be rerun. 30 | 31 | This template considers machine learning workflows intended to be executed in batch --- for models that run as APIs, consider using `plumber` instead. 32 | 33 | ## Training and execution 34 | 35 | After cloning the repo, navigate to the directory in which the files are located. The easiest way to do this is to open the project in RStudio. 36 | 37 | Model training and execution plans are generated by functions in the package. The package doesn't actually need to be installed --- we can use `devtools::load_all()` to simulate the installation. The model can be trained with: 38 | 39 | ``` 40 | devtools::load_all() 41 | drake::make(model_training_plan()) 42 | ``` 43 | 44 | ![](inst/img/drake-model-training-plan.png) 45 | 46 | Model execution is run similarly: 47 | 48 | ``` 49 | devtools::load_all() 50 | drake::make(model_execution_plan()) 51 | ``` 52 | 53 | ![](inst/img/drake-model-execution-plan.png) 54 | 55 | Model artefacts --- the random forest model, the vectoriser, and the tfidf weightings --- are saved to and loaded from the `artefacts/` directory. This is an arbitrary choice. We could just as easily use a different directory or remote storage. 56 | 57 | I've simulated a production step with a `new_data_to_be_scored` function that returns a few reviews to be scored. Predictions are "submitted" through the `submit_predictions()` function. This function does nothing except sleep for 5 seconds. In practice we would submit model output wherever it needs to go --- locally, a cloud service, etc. It's hard to "productionise" a model when it's just a toy. 58 | 59 | The exploratory data analysis piece can be found in the `inst/eda/` directory. It is compiled with `knitr`. 60 | 61 | ## Model and prediction verification 62 | 63 | Both training and execution plans include a _verification_ step. These are functions that --- using the `assertthat` package --- ensure certain basic facts about the model and its predictions are true. If any of these assertions is false, an error is returned. 
64 | 65 | ``` 66 | validate_model <- function(random_forest, vectoriser, tfidf = NULL) { 67 | model_sentiment <- function(x) sentiment(x, random_forest, vectoriser, tfidf) 68 | oob <- random_forest$err.rate[random_forest$ntree, "OOB"] # out of bag error 69 | 70 | assertthat::assert_that(model_sentiment("love") == "good") 71 | assertthat::assert_that(model_sentiment("bad") == "bad") 72 | assertthat::assert_that(oob < 0.4) 73 | 74 | TRUE 75 | } 76 | ``` 77 | 78 | The model artefacts and predictions cannot be exported without passing this verification step. Their relevant drake targets are conditioned on the validation function returning `TRUE`: 79 | 80 | ``` 81 | output_model = drake::target( 82 | { 83 | dir.create("artefacts", showWarnings = FALSE) 84 | readr::write_rds(vectoriser, file_out("artefacts/vectoriser.rds")) 85 | readr::write_rds(tfidf, file_out("artefacts/tfidf.rds")) 86 | readr::write_rds(review_rf, file_out("artefacts/review_rf.rds")) 87 | }, 88 | trigger = drake::trigger(condition = validation, mode = "blacklist") 89 | ) 90 | ``` 91 | 92 | For example, suppose I changed the assertion above to demand that my model must have an out-of-bag error of less than 0.01% before it can be exported. My model isn't very good, however, so that step will error. The execution steps are dependent on that validation, and so they won't be run. 93 | 94 | ![](inst/img/failed-validation.png) 95 | 96 | The assertions I've included here are very basic. However, I think these steps of the plans are important and extensible. We could assert that a model: 97 | 98 | * produces sensible outputs, based on type or domain. 99 | * has an accuracy above a given threshold, based on one or more metrics. 100 | * does not produce outputs that are biased against a particular group. 101 | 102 | We could also assert that predictions of new data: 103 | 104 | * are sensible. 105 | * do not contain sensitive data. 106 | * are not biased against particular groups. 107 | -------------------------------------------------------------------------------- /inst/eda/eda.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Exploratory data analysis" 3 | author: "David Neuzerling" 4 | date: "`r Sys.Date()`" 5 | output: html_document 6 | --- 7 | 8 | ```{r load_package, include = FALSE} 9 | library(tidyverse) 10 | package_root <- here::here() 11 | devtools::load_all(package_root) 12 | ``` 13 | 14 | # Data load 15 | 16 | I haven't kept the data in this git repository, opting instead to download it if it doesn't already exist. It's a fairly small data set though (3000 rows). 17 | 18 | `download_data` is a package function that downloads and unzips the source data into the `inst/extdata` directory (creating it if necessary). On package compilation, everything in the `inst` folder is moved up to the root directory of the package, and so we can find the `extdata` directory in the finished product. 19 | 20 | ```{r download-data} 21 | extdata <- file.path(package_root, "inst", "extdata") 22 | data_files <- c("amazon_cells_labelled.txt", 23 | "imdb_labelled.txt", 24 | "yelp_labelled.txt") %>% file.path(extdata, .) 25 | if (!all(file.exists(data_files))) { 26 | download_data(extdata) 27 | } 28 | ``` 29 | 30 | Data is loaded in with another custom function, `read_review_file`. This is just `readr::read_tsv` with some special options to cover the peculiarities of the raw data. All of these custom functions are documented and stored in the `R` directory. 
Once the package is installed, function manuals can be called in the usual way (e.g. `?read_review_file`). 31 | 32 | This is a simple analysis, so let's just stick to discrete categories for sentiment: "good" and "bad". I don't care too much about how the model performs, as long as it functions. 33 | 34 | ```{r load-data} 35 | reviews <- data_files %>% 36 | purrr::map(read_review_file) %>% 37 | purrr::reduce(rbind) %>% 38 | mutate(sentiment = ifelse(sentiment == 1, "good", "bad")) 39 | reviews %>% head 40 | ``` 41 | 42 | # Exploring data 43 | 44 | We check for missing data using the `naniar` package: 45 | 46 | ```{r naniar} 47 | reviews %>% naniar::miss_var_summary() 48 | ``` 49 | 50 | Let's take a look at which words are the most frequent. First we create a data frame such that each row is an occurrence of a word. Note that we remove stop words --- these are words like "the" that are common and usually provide little semantic content to the text. 51 | 52 | ```{r words} 53 | words <- reviews %>% 54 | tidytext::unnest_tokens( 55 | word, 56 | review 57 | ) %>% 58 | anti_join( 59 | tidytext::stop_words, 60 | by = "word" 61 | ) 62 | words %>% head 63 | ``` 64 | 65 | Now we'll plot the most frequently occurring words, keeping a note of which words are "good" and which words are "bad". 66 | 67 | ```{r word_frequency, fig.width = 6, fig.height = 6, out.height = 600, out.width = 600} 68 | words %>% 69 | count(word, sentiment, sort = TRUE) %>% 70 | head(20) %>% 71 | mutate(word = reorder(word, n)) %>% 72 | ggplot(aes(word, n, fill = sentiment)) + 73 | geom_col() + 74 | xlab(NULL) + 75 | theme(text = element_text(size = 16)) + 76 | coord_flip() + 77 | ggtitle("Frequency of words") 78 | ``` 79 | 80 | There are no surprises here! "Bad" is universally bad and "love" is universally good. It's comforting to see. We'll note this and use these words in our unit tests. 81 | 82 | I'm not sure what purpose word clouds serve, but they seem almost mandatory. 83 | 84 | ```{r word_cloud, fig.width = 6, fig.height = 6, out.height = 600, out.width = 600} 85 | words %>% 86 | count(word) %>% 87 | with( 88 | wordcloud::wordcloud( 89 | word, 90 | n, 91 | max.words = 100 92 | ) 93 | ) 94 | ``` 95 | 96 | 97 | # Preprocessing 98 | 99 | We need to apply some preprocessing to our text before we can feed it into a model. The first round of preprocessing is simply ignoring case, punctuation and numbers: 100 | 101 | ```{r text-preprocessor} 102 | text_preprocessor 103 | ``` 104 | 105 | I'm actually not sure that we *should* be removing numbers here. We're dealing with reviews, after all, and a review like "10/10" certainly tells us something about sentiment. But that's beyond the scope of this package. 106 | 107 | The next round of processing involves tokenising and stemming our words, with stemming stripping each word down to its base. 
Another custom function, `stem_tokeniser`, plays this role by calling on the Porter stemming algorithm: 108 | 109 | ```{r stem-tokeniser-example} 110 | stem_tokeniser("information informed informing informs") 111 | ``` 112 | 113 | -------------------------------------------------------------------------------- /inst/execute-model.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | package_name <- basename(here::here()) 3 | drake::expose_imports(package_name, character_only = TRUE) 4 | drake::make(model_execution_plan()) 5 | -------------------------------------------------------------------------------- /inst/img/drake-model-execution-plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/drake-model-execution-plan.png -------------------------------------------------------------------------------- /inst/img/drake-model-training-plan.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/drake-model-training-plan.png -------------------------------------------------------------------------------- /inst/img/failed-validation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mdneuzerling/DrakeModelling/84b843160e549b3a8bb12acca613956943fd510a/inst/img/failed-validation.png -------------------------------------------------------------------------------- /inst/train-model.R: -------------------------------------------------------------------------------- 1 | devtools::load_all() 2 | package_name <- basename(here::here()) 3 | drake::expose_imports(package_name, character_only = TRUE) 4 | drake::make(model_training_plan()) 5 | -------------------------------------------------------------------------------- /man/create_tfidf.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create-tfidf.R 3 | \name{create_tfidf} 4 | \alias{create_tfidf} 5 | \title{Create a fitted tf-idf} 6 | \usage{ 7 | create_tfidf(dtm_unweighted) 8 | } 9 | \arguments{ 10 | \item{dtm_unweighted}{An unweighted document term matrix created by the 11 | map_to_dtm function, or an equivalent from text2vec.} 12 | } 13 | \value{ 14 | A fitted tfidf object created by the text2vec package 15 | } 16 | \description{ 17 | The text2vec package uses a Pythonic style of code to create a fitted term 18 | frequency - inverse document frequency (tf-idf) object. A new tfidf object 19 | is initialised and then the tfidf$fit_transform method is applied to an 20 | unweighted document-term matrix. This style of code is unusual in R and 21 | can cause issues with drake. This function creates a fitted tfidf object 22 | with a more functional and R-idiomatic style of code. 
23 | } 24 | -------------------------------------------------------------------------------- /man/create_vocabulary.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/create-vocabulary.R 3 | \name{create_vocabulary} 4 | \alias{create_vocabulary} 5 | \title{Create a pruned vocabulary from a token iterator} 6 | \usage{ 7 | create_vocabulary(documents, doc_proportion_min = 0, doc_proportion_max = 1) 8 | } 9 | \arguments{ 10 | \item{documents}{A vector of characters, often sentences or paragraphs.} 11 | 12 | \item{doc_proportion_min}{Optional. A number between 0 and 1 which specifies 13 | the minimum proportion of documents in which a token appears in order to be 14 | included in the vocabulary. Defaults to 0 (no effect).} 15 | 16 | \item{doc_proportion_max}{Optional. A number between 0 and 1 which specifies 17 | the maximum proportion of documents in which a token appears in order to be 18 | included in the vocabulary. Defaults to 1 (no effect).} 19 | } 20 | \value{ 21 | A vocabulary object used in the text2vec package 22 | } 23 | \description{ 24 | This function creates a vocabulary from a vector of documents. A 25 | vocabulary defines the domain of a natural language processing problem. 26 | Vocabularies are often used to create vectorisers, which allow novel pieces 27 | of text to be mapped to a vocabulary defined by a training set. To exclude 28 | frequently and infrequently occurring tokens, the vocabulary is often 29 | trimmed. This reduces the dimension of the problem to decrease training time 30 | and the potential for overfitting. 31 | } 32 | -------------------------------------------------------------------------------- /man/download_and_read_data.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/download-and-read-data.R 3 | \name{download_and_read_data} 4 | \alias{download_and_read_data} 5 | \title{Download and parse review data} 6 | \usage{ 7 | download_and_read_data(data_source_url) 8 | } 9 | \arguments{ 10 | \item{data_source_url}{Character.} 11 | } 12 | \value{ 13 | Tibble of parsed data 14 | } 15 | \description{ 16 | Given the URL of a .zip file, this function will download it and decompress 17 | it to a given location. 18 | } 19 | \details{ 20 | This function was intended to download and uncompress the data located at 21 | https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences 22 | It is not a general function to download arbitrary data, but may be a 23 | useful template. 24 | } 25 | -------------------------------------------------------------------------------- /man/map_to_dtm.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/map-to-dtm.R 3 | \name{map_to_dtm} 4 | \alias{map_to_dtm} 5 | \title{Create a document term matrix using a vectoriser} 6 | \usage{ 7 | map_to_dtm(x, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{A character or vector of characters, usually sentences, paragraphs 11 | or similar pieces of natural language.} 12 | 13 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 14 | 15 | \item{tfidf}{A tfidf object constructed with the text2vec package. 
16 | If tfidf is NULL, then an unweighted document term matrix will be 17 | returned.} 18 | } 19 | \value{ 20 | A document-term matrix with rows representing the textual objects in 21 | x, and columns representing the tokens in the vocabulary used to generate the 22 | given vectoriser. 23 | } 24 | \description{ 25 | This function uses a vectoriser created with the text2vec package to map 26 | a new piece of text, or vector of text, onto a document term matrix (dtm). 27 | The vectoriser has a concept of a vocabulary, a set of tokens which 28 | determine the columns of the resulting document term matrix. Any term that 29 | doesn't match to a token in the vocabulary will be ignored. Optionally, the 30 | document term matrix can be weighted by a term frequency-inverse document 31 | frequency (tfidf) object, created with the text2vec::TfIdf function. 32 | } 33 | -------------------------------------------------------------------------------- /man/model_execution_plan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model-execution-plan.R 3 | \name{model_execution_plan} 4 | \alias{model_execution_plan} 5 | \title{drake plan for model execution} 6 | \usage{ 7 | model_execution_plan() 8 | } 9 | \value{ 10 | A plan to be run with drake::make() 11 | } 12 | \description{ 13 | drake plan for model execution 14 | } 15 | -------------------------------------------------------------------------------- /man/model_training_plan.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/model-training-plan.R 3 | \name{model_training_plan} 4 | \alias{model_training_plan} 5 | \title{drake plan for model training} 6 | \usage{ 7 | model_training_plan() 8 | } 9 | \value{ 10 | A plan to be run with drake::make() 11 | } 12 | \description{ 13 | drake plan for model training 14 | } 15 | -------------------------------------------------------------------------------- /man/new_data_to_be_scored.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/new-data-to-be-scored.R 3 | \name{new_data_to_be_scored} 4 | \alias{new_data_to_be_scored} 5 | \title{Retrieve new data that needs to be scored} 6 | \usage{ 7 | new_data_to_be_scored() 8 | } 9 | \value{ 10 | Data frame with two char columns: review and sentiment ("good"/"bad") 11 | } 12 | \description{ 13 | This function is meant to replicate a function that will retrieve new data 14 | that needs to be scored. This is a batch execution, rather than an on-demand 15 | API that might be stood up by the plumber package. In practice, the return 16 | is a fixed sample of 10 reviews. 17 | } 18 | -------------------------------------------------------------------------------- /man/read_review_file.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/read-review-file.R 3 | \name{read_review_file} 4 | \alias{read_review_file} 5 | \title{Read a review source data file into a data frame} 6 | \usage{ 7 | read_review_file(file_path) 8 | } 9 | \arguments{ 10 | \item{file_path}{character. 
A file path to a source .txt data file} 11 | } 12 | \description{ 13 | This function is specifically intended to parse the three data files in the 14 | sentiment analysis data set 15 | (https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences). 16 | } 17 | -------------------------------------------------------------------------------- /man/sentiment.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/sentiment.R 3 | \name{sentiment} 4 | \alias{sentiment} 5 | \title{Classify a review as good or bad} 6 | \usage{ 7 | sentiment(x, random_forest, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{x}{Text to be classified, ideally a one-sentence product review.} 11 | 12 | \item{random_forest}{A model created with the randomForest package.} 13 | 14 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 15 | 16 | \item{tfidf}{A tfidf object constructed with the text2vec package. 17 | If tfidf is NULL, then weighting will not be applied.} 18 | } 19 | \description{ 20 | Classify a review as good or bad 21 | } 22 | -------------------------------------------------------------------------------- /man/stem_tokeniser.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/stem-tokeniser.R 3 | \name{stem_tokeniser} 4 | \alias{stem_tokeniser} 5 | \title{Stem and tokenise a sentence} 6 | \usage{ 7 | stem_tokeniser(x) 8 | } 9 | \arguments{ 10 | \item{x}{A character or vector of characters, usually sentences, paragraphs 11 | or similar pieces of natural language.} 12 | } 13 | \value{ 14 | A vector of tokens 15 | } 16 | \description{ 17 | Stem and tokenise a sentence 18 | } 19 | \examples{ 20 | stem_tokeniser("information informed informing informs") 21 | 22 | } 23 | -------------------------------------------------------------------------------- /man/submit_predictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/submit-predictions.R 3 | \name{submit_predictions} 4 | \alias{submit_predictions} 5 | \title{Simulate submission of output data} 6 | \usage{ 7 | submit_predictions(predictions) 8 | } 9 | \arguments{ 10 | \item{predictions}{Predictions to be submitted. Format is not important.} 11 | } 12 | \value{ 13 | Returns output, invisibly 14 | } 15 | \description{ 16 | This is a dummy function that simulates the submission of newly scored 17 | model results to an external platform. In actual fact, this function does 18 | nothing, but sleeps for 5 seconds to simulate an upload process. 19 | } 20 | -------------------------------------------------------------------------------- /man/text_preprocessor.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/text-preprocessor.R 3 | \name{text_preprocessor} 4 | \alias{text_preprocessor} 5 | \title{Preprocess text before tokenisation} 6 | \usage{ 7 | text_preprocessor(x) 8 | } 9 | \arguments{ 10 | \item{x}{Character. Text to be processed.} 11 | } 12 | \value{ 13 | The same character after processing. 14 | } 15 | \description{ 16 | This function applies simple preprocessing to raw text: case is lowered, 17 | and numbers and punctuation are removed. It is applied to documents before 18 | they are tokenised and stemmed. 19 | } 20 | -------------------------------------------------------------------------------- /man/validate_model.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate-model.R 3 | \name{validate_model} 4 | \alias{validate_model} 5 | \title{Assert that model makes sensible predictions} 6 | \usage{ 7 | validate_model(random_forest, vectoriser, tfidf = NULL) 8 | } 9 | \arguments{ 10 | \item{random_forest}{A model created with the randomForest package.} 11 | 12 | \item{vectoriser}{A vectoriser constructed with the text2vec package.} 13 | 14 | \item{tfidf}{A tfidf object constructed with the text2vec package. 15 | If tfidf is NULL, then weighting will not be applied.} 16 | } 17 | \value{ 18 | TRUE if all assertions pass 19 | } 20 | \description{ 21 | This function tests model predictions against a small sample of inputs with 22 | predictable outputs. For example, a sentiment analysis model that returns 23 | a sentiment of "bad" for "love" should not be used. We can also demand a 24 | minimum level of accuracy according to one or more metrics. Any failed 25 | assertions will raise an error. This validation approach could also be used 26 | to perform rudimentary checks for model bias (in the ethics sense). 27 | } 28 | -------------------------------------------------------------------------------- /man/validate_predictions.Rd: -------------------------------------------------------------------------------- 1 | % Generated by roxygen2: do not edit by hand 2 | % Please edit documentation in R/validate-predictions.R 3 | \name{validate_predictions} 4 | \alias{validate_predictions} 5 | \title{Assert that model predictions are sensible} 6 | \usage{ 7 | validate_predictions(predictions) 8 | } 9 | \arguments{ 10 | \item{predictions}{A vector of model predictions} 11 | } 12 | \value{ 13 | TRUE if all assertions pass 14 | } 15 | \description{ 16 | This function tests that model predictions make sense. It is intended to be 17 | run after new data has been scored, but before it has been submitted. We 18 | might ensure that outputs are sensible, within an expected range, or do not 19 | contain sensitive data. Failed assertions will raise an error. Upon success, 20 | TRUE will be returned. 21 | } 22 | -------------------------------------------------------------------------------- /tests/testthat.R: -------------------------------------------------------------------------------- 1 | library(testthat) 2 | library(DrakeModelling) 3 | 4 | test_check("DrakeModelling") 5 | -------------------------------------------------------------------------------- /tests/testthat/test-map-to-dtm.R: -------------------------------------------------------------------------------- 1 | context("map-to-dtm") 2 | 3 | # Set up a dtm and tfidf based on a small dataset with a predictable outcome 4 | 5 | test_reviews <- tibble::tribble( 6 | ~review, 7 | "This movie is good fun and everyone should see it. 
Great film.", 8 | "What a poor quality phone", 9 | "This phone has a great battery phone", 10 | "An excellent piece of software" 11 | ) 12 | 13 | test_vocabulary <- create_vocabulary(test_reviews$review) 14 | test_vectoriser <- test_vocabulary %>% text2vec::vocab_vectorizer() 15 | test_dtm_unweighted <- map_to_dtm(test_reviews$review, 16 | vectoriser = test_vectoriser, 17 | tfidf = NULL) 18 | test_tfidf <- create_tfidf(test_dtm_unweighted) 19 | 20 | # The dtms used for testing are below 21 | # Note that "movie" stems to "movi". 22 | 23 | dtm_unweighted <- as.matrix( 24 | map_to_dtm(c("movie", "phone"), 25 | vectoriser = test_vectoriser, 26 | tfidf = NULL) 27 | ) 28 | 29 | dtm_weighted <- as.matrix( 30 | map_to_dtm(c("movie", "phone"), 31 | test_vectoriser, 32 | tfidf = test_tfidf) 33 | ) 34 | 35 | test_that("DTM has common words that aren't stop words", { 36 | expect_true("movi" %in% colnames(dtm_unweighted)) 37 | expect_true("phone" %in% colnames(dtm_unweighted)) 38 | expect_true("film" %in% colnames(dtm_unweighted)) 39 | }) 40 | 41 | test_that("DTM doesn't contain common stop words", { 42 | expect_false("this" %in% colnames(dtm_unweighted)) 43 | expect_false("a" %in% colnames(dtm_unweighted)) 44 | }) 45 | 46 | test_that("DTM correctly represents present and absent words", { 47 | expect_equal(dtm_unweighted[1, "movi"], 1) 48 | expect_equal(dtm_unweighted[2, "phone"], 1) 49 | expect_equal(dtm_unweighted[1, "film"], 0) 50 | expect_equal(dtm_unweighted[2, "film"], 0) 51 | }) 52 | 53 | test_that("DTM is applying weightings", { 54 | expect_false(dtm_weighted[1, "movi"] == 0) 55 | expect_false(dtm_weighted[1, "movi"] == 1) 56 | expect_false(dtm_weighted[2, "phone"] == 0) 57 | expect_false(dtm_weighted[2, "phone"] == 1) 58 | }) 59 | -------------------------------------------------------------------------------- /tests/testthat/test-stem-tokeniser.R: -------------------------------------------------------------------------------- 1 | context("stem-tokeniser") 2 | 3 | test_that("Processing inflections of the same stem", { 4 | expect_equal( 5 | stem_tokeniser("information informed informing informs"), 6 | list(c("inform", "inform", "inform", "inform")) 7 | ) 8 | }) 9 | 10 | test_that("Stemming multiple documents", { 11 | expect_equal( 12 | stem_tokeniser(c("information", "product")), 13 | list("inform", "product") 14 | ) 15 | }) 16 | -------------------------------------------------------------------------------- /tests/testthat/test-text-preprocessor.R: -------------------------------------------------------------------------------- 1 | context("test-text-preprocessor") 2 | 3 | test_that("Case is lowered", { 4 | test_string <- "SPHYNX OF BLACK QUARTZ judge my VoW" 5 | expected_result <- "sphynx of black quartz judge my vow" 6 | expect_equal(text_preprocessor(test_string), expected_result) 7 | }) 8 | 9 | test_that("Punctuation is removed", { 10 | test_string <- "\"sphynx. of black quartz, judge - my vow!\"" 11 | expected_result <- "sphynx of black quartz judge my vow" 12 | expect_equal(text_preprocessor(test_string), expected_result) 13 | }) 14 | 15 | test_that("Numbers are removed", { 16 | test_string <- "\"sphynx. o10f black qu1artz, ju2dge - my vow!\"" 17 | expected_result <- "sphynx of black quartz judge my vow" 18 | expect_equal(text_preprocessor(test_string), expected_result) 19 | }) 20 | 21 | test_that("Case is lowered, punctuation and numbers are removed", { 22 | test_string <- "\"SPHYNX. 
o10f bLaCk qu1artz, ju2dge - my vow!\"" 23 | expected_result <- "sphynx of black quartz judge my vow" 24 | expect_equal(text_preprocessor(test_string), expected_result) 25 | }) 26 | 27 | test_that("Processing vector of text", { 28 | test_strings <- c("fish", "KING!", "cA4rD") 29 | expected_result <- c("fish", "king", "card") 30 | expect_equal(text_preprocessor(test_strings), expected_result) 31 | }) 32 | --------------------------------------------------------------------------------
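A footnote to the README's point about package structure: the checks that the CI workflow runs can also be run locally. A minimal sketch, assuming devtools and rcmdcheck are installed (the CI workflow above installs rcmdcheck; devtools is in Suggests):

```
devtools::load_all()  # simulate installation, as in the README
devtools::test()      # run the testthat suite under tests/testthat/
rcmdcheck::rcmdcheck(args = "--no-manual", error_on = "error")  # mirrors CI
```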